# Reference article: PyTorch single-machine multi-GPU parallel training
import os
import numpy as np
import sys
import logging
import time
import argparse
# tensorboard
from tensorboardX import SummaryWriter
# torch
import torch
import torch.nn as nn
from torch.optim.lr_scheduler import StepLR
from loss import SimpleLoss, DiscriminativeLoss
# distributed
import torch.nn.parallel
import torch.distributed as dist
import torch.optim
import torch.multiprocessing as mp
import torch.backends.cudnn as cudnn
import torch.utils.data
import torch.utils.data.distributed
# utils
# ###########################
# Restrict the GPUs visible to this process tree (must be set before CUDA init).
os.environ['CUDA_VISIBLE_DEVICES'] = "0, 1, 2"

parser = argparse.ArgumentParser(description='PyTorch Training')
# logging config
parser.add_argument("--logdir", type=str, default='./runs_seg')
parser.add_argument("--log_iter", type=int, default=10)
parser.add_argument("--vis_log_iter", type=int, default=500)
# dataset config
parser.add_argument('--dataroot', type=str, default='/home/innox/Dataset/nuscenes-mini/mini')
parser.add_argument('--version', type=str, default='v1.0-mini', choices=['v1.0-trainval', 'v1.0-mini'])
# model config
# parser.add_argument("--model", type=str, default='HDMapNet_cam')
parser.add_argument("--model", type=str, default='HDMapNet_DeformableAttn')
# training config
parser.add_argument("--nepochs", type=int, default=30)
parser.add_argument("--eval_epoch", type=int, default=5)
parser.add_argument("--max_grad_norm", type=float, default=5.0)
parser.add_argument("--pos_weight", type=float, default=2.13)
parser.add_argument("--bsz", type=int, default=4,
                    help="batch size in per gpu")
parser.add_argument("--nworkers", type=int, default=4)
parser.add_argument("--lr", type=float, default=1e-3)
parser.add_argument("--weight_decay", type=float, default=1e-6)
# instance-segmentation / direction heads (read by main_worker; these were
# missing from the original parser and raised AttributeError at runtime).
# Defaults assumed from the HDMapNet reference implementation — TODO confirm.
parser.add_argument('--instance_seg', action='store_true', default=False)
parser.add_argument('--embedding_dim', type=int, default=16)
parser.add_argument('--delta_v', type=float, default=0.5)
parser.add_argument('--delta_d', type=float, default=3.0)
parser.add_argument('--direction_pred', action='store_true', default=False)
parser.add_argument('--angle_class', type=int, default=36)
# finetune config
parser.add_argument('--finetune', action='store_true', default=False)
parser.add_argument('--modelf', type=str, default="./model29.pt")
parser.add_argument('--resume', type=str, default=None)
# distributed training config
parser.add_argument('--multi_gpus', default=[0, 1, 2, 3])
parser.add_argument('--gpu_nums', default=1, type=int)
parser.add_argument('--rank', default=0, type=int, help='node rank for distributed training')
parser.add_argument('--ngpus_per_node', default=3, type=int)
parser.add_argument('--gpu', default=None, type=int, help='GPU id to use.')

# Mutable training state shared with train()/validate() via `global`.
best_prec1 = 0
iter = 0        # running global step; NOTE: shadows builtin `iter`, kept for compatibility with train()
best_iou = 0
# BUGFIX: the original computed total_iter/last_idx from `train_loader` and
# `args`, neither of which exists at module scope (NameError on import).
# They are initialized to 0 here; set them once the train loader exists:
#   total_iter = len(train_loader) * args.nepochs ;  last_idx = len(train_loader)
total_iter = 0
eval_time = 0
total_time = 0
last_idx = 0
def main():
    """Entry point: parse CLI arguments and fork one DDP worker per GPU."""
    print('Part1 : prepare for parameters <==> Begin')
    args = parser.parse_args()
    n_procs = args.ngpus_per_node
    print('ngpus_per_node:', n_procs)
    # One process per GPU; each worker receives (local_gpu_index, n_procs, args).
    mp.spawn(main_worker, nprocs=n_procs, args=(n_procs, args))
def train(args, train_loader, model, criterion, optimizer, epoch, writer):
    """Run one training epoch, updating the global step counter and timing."""
    global iter, best_iou, total_time
    model.train()  # enable training-mode behavior (dropout, BN updates)
    for batch_iter, batch in enumerate(train_loader):
        (imgs, trans, rots, intrins, post_trans, post_rots, car_trans,
         yaw_pitch_roll, semantic_gt, instance_gt, direction_gt) = batch
        batch_iter += 1
        iter += 1
        t0 = time.time()
        optimizer.zero_grad()
        # Forward pass: every input tensor is moved to the current GPU.
        semantic, embedding, direction = model(
            imgs.cuda(non_blocking=True), trans.cuda(), rots.cuda(),
            intrins.cuda(), post_trans.cuda(), post_rots.cuda(),
            car_trans.cuda(), yaw_pitch_roll.cuda())
        semantic_gt = semantic_gt.cuda(non_blocking=True).float()
        # Only the segmentation loss contributes in this version.
        seg_loss = criterion(semantic, semantic_gt)
        final_loss = seg_loss
        final_loss.backward()
        # Gradient clipping guards against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        optimizer.step()
        total_time += time.time() - t0
        # Only node-local rank 0 logs, once every `log_iter` batches.
        if args.rank % args.ngpus_per_node == 0 and batch_iter % args.log_iter == 0:
            # logging hook — intentionally empty here
            pass
def validate(args, val_loader, model, epoch, ngpus_per_node):
    """Evaluate the model on the validation set and return its IoU."""
    model.eval()  # switch off training-only behavior (dropout, BN updates)
    score = eval_iou(model, val_loader)
    on_master = (args.rank % ngpus_per_node == 0)
    if on_master:
        # logging hook — intentionally empty here
        pass
    return score
def main_worker(gpu, ngpus_per_node, args):
    """Per-process DDP worker: set up logging, model, data, then train/eval.

    Args:
        gpu: local GPU index for this process (0 .. ngpus_per_node-1),
            supplied by mp.spawn.
        ngpus_per_node: number of worker processes (GPUs) on this node.
        args: parsed CLI namespace shared by all workers.
    """
    global best_prec1
    # ---- logging ----
    # makedirs(exist_ok=True): several spawned workers race to create the dir;
    # the original os.mkdir could crash all but the first worker.
    os.makedirs(args.logdir, exist_ok=True)
    logging.basicConfig(filename=os.path.join(args.logdir, "results.log"),
                        filemode='w',
                        format='%(asctime)s: %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.INFO)
    logging.getLogger('shapely.geos').setLevel(logging.CRITICAL)
    logger = logging.getLogger()
    logger.addHandler(logging.StreamHandler(sys.stdout))
    # SummaryWriter for tensorboard visualization
    writer = SummaryWriter(logdir=args.logdir)

    # ---- distributed init ----
    args.gpu = gpu
    # Global rank = node_rank * gpus_per_node + local gpu index.
    args.rank = args.rank * ngpus_per_node + gpu
    # BUGFIX: the original passed rank=gpu, which is only correct on node 0;
    # the process group needs the *global* rank.
    dist.init_process_group(backend='nccl', init_method='tcp://127.0.0.1:23456',
                            world_size=ngpus_per_node, rank=args.rank)
    print('rank', args.rank, ' use multi-gpus...')
    is_master = (args.rank % ngpus_per_node == 0)
    if is_master:
        print('Part1 : prepare for parameters <==> Done')
        print('Part2 : Load Network <==> Begin')

    # ---- model ----
    # NOTE(review): `get_model` and `data_conf` are project names not visible
    # in this file — confirm they are imported/defined at module scope.
    model = get_model(args.model, data_conf, args.instance_seg,
                      args.embedding_dim, args.direction_pred, args.angle_class)
    if args.finetune:
        logger.info("Train type: Finetune")
        # map_location='cpu' keeps every worker from deserializing onto GPU 0.
        model.load_state_dict(torch.load(args.modelf, map_location='cpu'),
                              strict=False)

    # ---- optimizer / scheduler / losses ----
    opt = torch.optim.AdamW(model.parameters(), lr=args.lr,
                            weight_decay=args.weight_decay)
    sched = StepLR(opt, 10, 0.1)  # decay lr by 10x every 10 epochs
    loss_fn = SimpleLoss(args.pos_weight).cuda()
    if args.instance_seg:
        embedded_loss_fn = DiscriminativeLoss(args.embedding_dim, args.delta_v,
                                              args.delta_d).cuda()
    if args.direction_pred:
        direction_loss_fn = torch.nn.BCELoss(reduction='none')
    # Fixed input sizes -> let cudnn autotune the fastest kernels.
    cudnn.benchmark = True

    # ---- DistributedDataParallel wrapping ----
    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model.cuda(args.gpu)
        # NOTE: args.bsz is already the per-GPU batch size (see parser help),
        # so no division by ngpus_per_node is needed. (The original divided
        # the never-defined `args.batch_size`, raising AttributeError.)
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
    else:
        model.cuda()
        model = torch.nn.parallel.DistributedDataParallel(model)
    if args.resume:
        # TODO: restore model/optimizer/epoch state from args.resume
        pass
    if is_master:
        print('Part2 : Load Network <==> Done')
        print('Part3 : Load Dataset <==> Begin')

    # ---- data ----
    # BUGFIX: the original passed distributed=args.distributed, but no such
    # CLI flag exists; this worker always runs under DDP, so request
    # distributed samplers unconditionally.
    train_loader, val_loader = semantic_dataset(args.version, args.dataroot,
                                                data_conf, args.bsz,
                                                args.nworkers, distributed=True)
    if is_master:
        print('Part3 : Load Dataset <==> Done')
        print('Part4 : Train and Test <==> Begin')

    # ---- train / eval loop ----
    # BUGFIX: the original iterated range(args.start_epochs, args.epochs);
    # neither attribute exists — the parser defines `nepochs`.
    for epoch in range(args.nepochs):
        # train for one epoch
        train(args, train_loader, model, loss_fn, opt, epoch, writer)
        # evaluate on validation set every `eval_epoch` epochs
        if epoch % args.eval_epoch == 0:
            prec1 = validate(args, val_loader, model, epoch, ngpus_per_node)
            is_best = prec1 > best_prec1
            best_prec1 = max(prec1, best_prec1)
            if is_master:
                if not is_best:
                    print('Top1 Accuracy stay with {:.3f}'.format(best_prec1))
                else:
                    # save the best model
                    pass
        sched.step()
    print('Part4 : Train and Test <==> Done')
# Script entry point: main() spawns one DDP worker process per GPU.
if __name__ == '__main__':
    main()
# (Blog footer from the scraped source:)
# "Welcome to share; please credit the source when reposting: Memory Overflow"
# "Comment list (0 comments)"