很多人问这个问题,
其实主要就是把分布式计算的stuff改一下就好了
bevfusion采用torchpack这个很难用的包(其实也还好?hhh)来进行分布式计算
我们在单显卡上只需要改这一部分就好
- tools/train.py:
import argparse
import copy
import os
import random
import time
import numpy as np
import torch
from mmcv import Config
from torchpack import distributed as dist
from torchpack.environ import auto_set_run_dir, set_run_dir
from torchpack.utils.config import configs
from mmdet3d.apis import train_model
from mmdet3d.datasets import build_dataset
from mmdet3d.models import build_model
from mmdet3d.utils import get_root_logger, convert_sync_batchnorm, recursive_eval
import sys
# Example invocations (single GPU):
#   python tools/train.py configs/nuscenes/det/transfusion/secfpn/camera+lidar/swint_v0p075/convfuser.yaml --run-dir Res/test_mini
#   python tools/train.py configs/once/det/transfusion/secfpn/camera+lidar/swint_v0p075/convfuser.yaml --run-dir Res/test_once_mini
#   python tools/train.py configs/once/det/transfusion/secfpn/camera+lidar/swint_v0p075/convfuser.yaml --run-dir Res/test_once_mini_6cam


def main():
    """Train a model on a single GPU from a torchpack YAML config.

    Command-line arguments:
        config: path to the config file.
        --run-dir: directory for the dumped config, logs and checkpoints;
            when omitted, ``auto_set_run_dir()`` chooses one.
        Any further arguments are forwarded to ``configs.update`` as
        config overrides.

    The distributed bootstrap (``dist.init()``) is intentionally skipped,
    the CUDA device is pinned to index 0 and ``train_model`` is called with
    ``distributed=False``.
    """
    # dist.init() is deliberately not called: no distributed launcher is used.
    parser = argparse.ArgumentParser()
    parser.add_argument("config", metavar="FILE", help="config file")
    parser.add_argument("--run-dir", metavar="DIR", help="run directory")
    args, opts = parser.parse_known_args()

    configs.load(args.config, recursive=True)
    configs.update(opts)

    cfg = Config(recursive_eval(configs), filename=args.config)

    torch.backends.cudnn.benchmark = cfg.cudnn_benchmark
    # Single-GPU run: always use device 0 instead of the distributed local rank.
    torch.cuda.set_device(0)

    if args.run_dir is None:
        args.run_dir = auto_set_run_dir()
    else:
        set_run_dir(args.run_dir)
    cfg.run_dir = args.run_dir

    # dump config
    cfg.dump(os.path.join(cfg.run_dir, "configs.yaml"))

    # init the logger before other steps
    timestamp = time.strftime("%Y%m%d_%H%M%S", time.localtime())
    log_file = os.path.join(cfg.run_dir, f"{timestamp}.log")
    logger = get_root_logger(log_file=log_file)

    # log some basic info
    logger.info(f"Config:\n{cfg.pretty_text}")

    # set random seeds
    if cfg.seed is not None:
        logger.info(
            f"Set random seed to {cfg.seed}, "
            f"deterministic mode: {cfg.deterministic}"
        )
        random.seed(cfg.seed)
        np.random.seed(cfg.seed)
        torch.manual_seed(cfg.seed)
        if cfg.deterministic:
            torch.backends.cudnn.deterministic = True
            torch.backends.cudnn.benchmark = False

    datasets = [build_dataset(cfg.data.train)]

    model = build_model(cfg.model)
    model.init_weights()
    # NOTE(review): SyncBatchNorm conversion is kept from the distributed
    # script; it may be unnecessary or fail without an initialised process
    # group -- confirm `sync_bn` is unset in configs used for single-GPU runs.
    if cfg.get("sync_bn", None):
        if not isinstance(cfg["sync_bn"], dict):
            cfg["sync_bn"] = dict(exclude=[])
        model = convert_sync_batchnorm(model, exclude=cfg["sync_bn"]["exclude"])

    logger.info(f"Model:\n{model}")
    train_model(
        model,
        datasets,
        cfg,
        distributed=False,
        validate=True,
        timestamp=timestamp,
    )


if __name__ == "__main__":
    main()
- mmdet3d/apis/train.py
import torch
from mmcv.parallel import MMDistributedDataParallel, MMDataParallel
from mmcv.runner import (
    DistSamplerSeedHook,
    EpochBasedRunner,
    GradientCumulativeFp16OptimizerHook,
    Fp16OptimizerHook,
    OptimizerHook,
    build_optimizer,
    build_runner,
)
from mmdet3d.runner import CustomEpochBasedRunner
from mmdet3d.utils import get_root_logger
from mmdet.core import DistEvalHook, EvalHook
from mmdet.datasets import build_dataloader, build_dataset, replace_ImageToTensor
def train_model(
    model,
    dataset,
    cfg,
    distributed=False,
    validate=False,
    timestamp=None,
):
    """Build data loaders, runner and hooks, then run training.

    Args:
        model: the model to train; it is moved to GPU and wrapped in
            ``MMDistributedDataParallel`` or ``MMDataParallel`` here.
        dataset: one dataset or a list/tuple of datasets to train on.
        cfg: the config object; this function reads ``data``, ``optimizer``,
            ``optimizer_config``, ``runner``, ``run_dir``, ``seed``,
            ``lr_config``, ``checkpoint_config``, ``log_config``,
            ``resume_from`` and ``load_from`` from it, plus several optional
            keys via ``cfg.get``.
        distributed: when False (the single-GPU case) the model is wrapped in
            ``MMDataParallel`` on device 0 and ``EvalHook`` is used.
        validate: when True, a validation data loader is built from
            ``cfg.data.val`` and an evaluation hook is registered.
        timestamp: stored on the runner so ``.log`` and ``.log.json`` file
            names match.
    """
    logger = get_root_logger()

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.samples_per_gpu,
            cfg.data.workers_per_gpu,
            num_gpus=1,
            dist=distributed,
            seed=cfg.seed,
        )
        for ds in dataset
    ]

    # put model on gpus
    # Sets the `find_unused_parameters` parameter in
    # torch.nn.parallel.DistributedDataParallel
    find_unused_parameters = cfg.get("find_unused_parameters", False)
    if distributed:
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters,
        )
    else:
        model = MMDataParallel(
            model.cuda(),
            device_ids=[0],
        )

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = build_runner(
        cfg.runner,
        default_args=dict(
            model=model,
            optimizer=optimizer,
            work_dir=cfg.run_dir,
            logger=logger,
            meta={},
        ),
    )

    if hasattr(runner, "set_dataset"):
        runner.set_dataset(dataset)

    # an ugly workaround to make .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting
    fp16_cfg = cfg.get("fp16", None)
    if fp16_cfg is not None:
        if "cumulative_iters" in cfg.optimizer_config:
            optimizer_config = GradientCumulativeFp16OptimizerHook(
                **cfg.optimizer_config, **fp16_cfg, distributed=distributed
            )
        else:
            optimizer_config = Fp16OptimizerHook(
                **cfg.optimizer_config, **fp16_cfg, distributed=distributed
            )
    elif distributed and "type" not in cfg.optimizer_config:
        optimizer_config = OptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

    # register hooks
    runner.register_training_hooks(
        cfg.lr_config,
        optimizer_config,
        cfg.checkpoint_config,
        cfg.log_config,
        cfg.get("momentum_config", None),
        custom_hooks_config=cfg.get("custom_hooks", None),
    )
    if isinstance(runner, EpochBasedRunner):
        runner.register_hook(DistSamplerSeedHook())

    # register eval hooks
    if validate:
        # Support batch_size > 1 in validation
        val_samples_per_gpu = cfg.data.val.pop("samples_per_gpu", 1)
        if val_samples_per_gpu > 1:
            # Replace 'ImageToTensor' to 'DefaultFormatBundle'
            cfg.data.val.pipeline = replace_ImageToTensor(cfg.data.val.pipeline)
        val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
        val_dataloader = build_dataloader(
            val_dataset,
            samples_per_gpu=val_samples_per_gpu,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=distributed,
            shuffle=False,
        )
        eval_cfg = cfg.get("evaluation", {})
        eval_cfg["by_epoch"] = cfg.runner["type"] != "IterBasedRunner"
        # This is the key step flagged by the original author: with
        # distributed=False the plain EvalHook is chosen over DistEvalHook.
        eval_hook = DistEvalHook if distributed else EvalHook
        runner.register_hook(eval_hook(val_dataloader, **eval_cfg))

    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, [("train", 1)])
- tools/test.py(如果要测试的话)
import argparse
import copy
import os
import warnings
import mmcv
import torch
from torchpack.utils.config import configs
from torchpack import distributed as dist
from mmcv import Config, DictAction
from mmcv.cnn import fuse_conv_bn
from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
from mmcv.runner import get_dist_info, init_dist, load_checkpoint, wrap_fp16_model
from mmdet3d.apis import single_gpu_test
from mmdet3d.datasets import build_dataloader, build_dataset
from mmdet3d.models import build_model
from mmdet.apis import multi_gpu_test, set_random_seed
from mmdet.datasets import replace_ImageToTensor
from mmdet3d.utils import recursive_eval
import sys
import os
import time
# Debugging aid, left disabled:
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'


def parse_args():
    """Parse the command-line arguments of the test script.

    Besides parsing, this exports ``--local_rank`` as the ``LOCAL_RANK``
    environment variable when that variable is not already set, and maps the
    deprecated ``--options`` onto ``--eval-options`` (with a warning).

    Returns:
        argparse.Namespace: the parsed arguments.

    Raises:
        ValueError: if both ``--options`` and ``--eval-options`` are given.
    """
    parser = argparse.ArgumentParser(description="MMDet test (and eval) a model")
    parser.add_argument("config", help="test config file path")
    parser.add_argument("checkpoint", help="checkpoint file")
    parser.add_argument("--out", help="output result file in pickle format")
    parser.add_argument(
        "--fuse-conv-bn",
        action="store_true",
        # A space was missing at the string join ("increasethe").
        help="Whether to fuse conv and bn, this will slightly increase "
        "the inference speed",
    )
    parser.add_argument(
        "--format-only",
        action="store_true",
        # A space was missing at the string join ("It isuseful").
        help="Format the output results without perform evaluation. It is "
        "useful when you want to format the result to a specific format and "
        "submit it to the test server",
    )
    parser.add_argument(
        "--eval",
        type=str,
        nargs="+",
        help='evaluation metrics, which depends on the dataset, e.g., "bbox",'
        ' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC',
    )
    parser.add_argument("--show", action="store_true", help="show results")
    parser.add_argument("--show-dir", help="directory where results will be saved")
    parser.add_argument(
        "--gpu-collect",
        action="store_true",
        help="whether to use gpu to collect results.",
    )
    parser.add_argument(
        "--tmpdir",
        help="tmp directory used for collecting results from multiple "
        "workers, available when gpu-collect is not specified",
    )
    parser.add_argument("--seed", type=int, default=0, help="random seed")
    parser.add_argument(
        "--deterministic",
        action="store_true",
        help="whether to set deterministic options for CUDNN backend.",
    )
    parser.add_argument(
        "--cfg-options",
        nargs="+",
        action=DictAction,
        help="override some settings in the used config, the key-value pair "
        "in xxx=yyy format will be merged into config file. If the value to "
        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
        "Note that the quotation marks are necessary and that no white space "
        "is allowed.",
    )
    parser.add_argument(
        "--options",
        nargs="+",
        action=DictAction,
        help="custom options for evaluation, the key-value pair in xxx=yyy "
        "format will be kwargs for dataset.evaluate() function (deprecate), "
        "change to --eval-options instead.",
    )
    parser.add_argument(
        "--eval-options",
        nargs="+",
        action=DictAction,
        help="custom options for evaluation, the key-value pair in xxx=yyy "
        "format will be kwargs for dataset.evaluate() function",
    )
    parser.add_argument(
        "--launcher",
        choices=["none", "pytorch", "slurm", "mpi"],
        default="none",
        help="job launcher",
    )
    parser.add_argument("--local_rank", type=int, default=0)
    args = parser.parse_args()

    if "LOCAL_RANK" not in os.environ:
        os.environ["LOCAL_RANK"] = str(args.local_rank)

    if args.options and args.eval_options:
        raise ValueError(
            "--options and --eval-options cannot be both specified, "
            "--options is deprecated in favor of --eval-options"
        )
    if args.options:
        warnings.warn("--options is deprecated in favor of --eval-options")
        args.eval_options = args.options
    return args
def main():
    """Run single-GPU inference on the test set and optionally save/evaluate.

    The distributed bootstrap is disabled: ``distributed`` is hard-coded to
    ``False``, so the model is wrapped in ``MMDataParallel`` on device 0 and
    ``single_gpu_test`` is used.

    Raises:
        AssertionError: if none of ``--out``, ``--eval``, ``--format-only``,
            ``--show`` or ``--show-dir`` is given.
        ValueError: if ``--eval`` and ``--format-only`` are combined, or if
            ``--out`` does not end in ``.pkl`` / ``.pickle``.
    """
    args = parse_args()
    # dist.init() is deliberately not called for single-GPU testing.

    torch.backends.cudnn.benchmark = True
    # torch.cuda.set_device(dist.local_rank()) is likewise skipped.

    assert (
        args.out or args.eval or args.format_only or args.show or args.show_dir
    ), (
        "Please specify at least one operation (save/eval/format/show the "
        'results / save the results) with the argument "--out", "--eval"'
        ', "--format-only", "--show" or "--show-dir"'
    )

    if args.eval and args.format_only:
        raise ValueError("--eval and --format_only cannot be both specified")

    if args.out is not None and not args.out.endswith((".pkl", ".pickle")):
        raise ValueError("The output file must be a pkl file.")

    configs.load(args.config, recursive=True)
    cfg = Config(recursive_eval(configs), filename=args.config)
    print(cfg)

    if args.cfg_options is not None:
        cfg.merge_from_dict(args.cfg_options)
    # set cudnn_benchmark
    if cfg.get("cudnn_benchmark", False):
        torch.backends.cudnn.benchmark = True

    cfg.model.pretrained = None
    # in case the test dataset is concatenated
    samples_per_gpu = 1
    if isinstance(cfg.data.test, dict):
        cfg.data.test.test_mode = True
        samples_per_gpu = cfg.data.test.pop("samples_per_gpu", 1)
        if samples_per_gpu > 1:
            # Replace 'ImageToTensor' to 'DefaultFormatBundle'
            cfg.data.test.pipeline = replace_ImageToTensor(cfg.data.test.pipeline)
    elif isinstance(cfg.data.test, list):
        for ds_cfg in cfg.data.test:
            ds_cfg.test_mode = True
        samples_per_gpu = max(
            [ds_cfg.pop("samples_per_gpu", 1) for ds_cfg in cfg.data.test]
        )
        if samples_per_gpu > 1:
            for ds_cfg in cfg.data.test:
                ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline)

    # Single-GPU run: the distributed environment is never initialised.
    distributed = False

    # set random seeds
    if args.seed is not None:
        set_random_seed(args.seed, deterministic=args.deterministic)

    # build the dataloader
    dataset = build_dataset(cfg.data.test)
    data_loader = build_dataloader(
        dataset,
        samples_per_gpu=samples_per_gpu,
        workers_per_gpu=cfg.data.workers_per_gpu,
        dist=distributed,
        shuffle=False,
    )

    # build the model and load checkpoint
    cfg.model.train_cfg = None
    model = build_model(cfg.model, test_cfg=cfg.get("test_cfg"))
    fp16_cfg = cfg.get("fp16", None)
    if fp16_cfg is not None:
        wrap_fp16_model(model)
    checkpoint = load_checkpoint(model, args.checkpoint, map_location="cpu")
    if args.fuse_conv_bn:
        model = fuse_conv_bn(model)
    # old versions did not save class info in checkpoints, this walkaround is
    # for backward compatibility
    if "CLASSES" in checkpoint.get("meta", {}):
        model.CLASSES = checkpoint["meta"]["CLASSES"]
    else:
        model.CLASSES = dataset.CLASSES

    if not distributed:
        model = MMDataParallel(model, device_ids=[0])
        # NOTE(review): unpacking two values presumably relies on a locally
        # modified single_gpu_test that also returns the input data; with an
        # unmodified one use `outputs = single_gpu_test(model, data_loader)`
        # -- confirm against your mmdet3d/apis/test.py.
        outputs, input_data = single_gpu_test(model, data_loader)
    else:
        # Unreachable while `distributed` is hard-coded to False; note that
        # `input_data` is not produced on this path.
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
        )
        outputs = multi_gpu_test(model, data_loader, args.tmpdir, args.gpu_collect)

    rank, _ = get_dist_info()
    if rank == 0:
        if args.out:
            print(f"\nwriting results to {args.out}")
            mmcv.dump(outputs, args.out)
        kwargs = {} if args.eval_options is None else args.eval_options
        if args.format_only:
            dataset.format_results(outputs, **kwargs)
        if args.eval:
            eval_kwargs = cfg.get("evaluation", {}).copy()
            # hard-code way to remove EvalHook args
            for key in [
                "interval",
                "tmpdir",
                "start",
                "gpu_collect",
                "save_best",
                "rule",
            ]:
                eval_kwargs.pop(key, None)
            eval_kwargs.update(dict(metric=args.eval, **kwargs))
            # NOTE(review): passing `input_data_files` to dataset.evaluate looks
            # like a dataset-specific extension; the plain call would be
            # `dataset.evaluate(outputs, **eval_kwargs)` -- confirm.
            input_data_files, tmp_dir = dataset.format_results(input_data, **kwargs)
            print(
                dataset.evaluate(
                    outputs, input_data_files=input_data_files, **eval_kwargs
                )
            )
            # format_results may hand back no temporary directory (e.g. when
            # an explicit output prefix is supplied); only clean up if present.
            if tmp_dir is not None:
                tmp_dir.cleanup()


if __name__ == "__main__":
    main()
本文转载自: https://blog.csdn.net/ll594282475/article/details/127925826
版权归原作者 大头蘑菇汤 所有, 如有侵权,请联系我们删除。
版权归原作者 大头蘑菇汤 所有, 如有侵权,请联系我们删除。