mmsegmentation框架解析(上)
# mmsegmentation 框架解析
按照官方的 tutorial 先过一遍
# Tutorial 1:配置文件
# 1、配置文件的基本知识
1、python tools/print_config.py /PATH/TO/CONFIG
可以用来可视化完整的config
2、配置文件可以通过 --options xxx.yyy=zzz
来更新 config
python tools/print_config.py mmsegmentation/configs/deeplabv3plus/deeplabv3plus_r18-d8_512x1024_80k_cityscapes.py
输出:
Config:
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
type='EncoderDecoder',
pretrained='open-mmlab://resnet18_v1c',
backbone=dict(
type='ResNetV1c',
depth=18,
num_stages=4,
out_indices=(0, 1, 2, 3),
dilations=(1, 1, 2, 4),
strides=(1, 2, 1, 1),
norm_cfg=dict(type='SyncBN', requires_grad=True),
norm_eval=False,
style='pytorch',
contract_dilation=True),
decode_head=dict(
type='DepthwiseSeparableASPPHead',
in_channels=512,
in_index=3,
channels=128,
dilations=(1, 12, 24, 36),
c1_in_channels=64,
c1_channels=12,
dropout_ratio=0.1,
num_classes=19,
norm_cfg=dict(type='SyncBN', requires_grad=True),
align_corners=False,
loss_decode=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
auxiliary_head=dict(
type='FCNHead',
in_channels=256,
in_index=2,
channels=64,
num_convs=1,
concat_input=False,
dropout_ratio=0.1,
num_classes=19,
norm_cfg=dict(type='SyncBN', requires_grad=True),
align_corners=False,
loss_decode=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
train_cfg=dict(),
test_cfg=dict(mode='whole'))
dataset_type = 'CityscapesDataset'
data_root = 'data/cityscapes/'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
crop_size = (512, 1024)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations'),
dict(type='Resize', img_scale=(2048, 1024), ratio_range=(0.5, 2.0)),
dict(type='RandomCrop', crop_size=(512, 1024), cat_max_ratio=0.75),
dict(type='RandomFlip', prob=0.5),
dict(type='PhotoMetricDistortion'),
dict(
type='Normalize',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
to_rgb=True),
dict(type='Pad', size=(512, 1024), pad_val=0, seg_pad_val=255),
dict(type='DefaultFormatBundle'),
dict(type='Collect', keys=['img', 'gt_semantic_seg'])
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='MultiScaleFlipAug',
img_scale=(2048, 1024),
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip'),
dict(
type='Normalize',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
to_rgb=True),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img'])
])
]
data = dict(
samples_per_gpu=2,
workers_per_gpu=2,
train=dict(
type='CityscapesDataset',
data_root='data/cityscapes/',
img_dir='leftImg8bit/train',
ann_dir='gtFine/train',
pipeline=[
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations'),
dict(
type='Resize', img_scale=(2048, 1024), ratio_range=(0.5, 2.0)),
dict(type='RandomCrop', crop_size=(512, 1024), cat_max_ratio=0.75),
dict(type='RandomFlip', prob=0.5),
dict(type='PhotoMetricDistortion'),
dict(
type='Normalize',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
to_rgb=True),
dict(type='Pad', size=(512, 1024), pad_val=0, seg_pad_val=255),
dict(type='DefaultFormatBundle'),
dict(type='Collect', keys=['img', 'gt_semantic_seg'])
]),
val=dict(
type='CityscapesDataset',
data_root='data/cityscapes/',
img_dir='leftImg8bit/val',
ann_dir='gtFine/val',
pipeline=[
dict(type='LoadImageFromFile'),
dict(
type='MultiScaleFlipAug',
img_scale=(2048, 1024),
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip'),
dict(
type='Normalize',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
to_rgb=True),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img'])
])
]),
test=dict(
type='CityscapesDataset',
data_root='data/cityscapes/',
img_dir='leftImg8bit/val',
ann_dir='gtFine/val',
pipeline=[
dict(type='LoadImageFromFile'),
dict(
type='MultiScaleFlipAug',
img_scale=(2048, 1024),
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip'),
dict(
type='Normalize',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
to_rgb=True),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img'])
])
]))
log_config = dict(
interval=50, hooks=[dict(type='TextLoggerHook', by_epoch=False)])
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]
cudnn_benchmark = True
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
optimizer_config = dict()
lr_config = dict(policy='poly', power=0.9, min_lr=0.0001, by_epoch=False)
runner = dict(type='IterBasedRunner', max_iters=80000)
checkpoint_config = dict(by_epoch=False, interval=8000)
evaluation = dict(interval=8000, metric='mIoU')
# 2、配置文件的结构
有4个基础的配置文件,分别是dataset、model、schedule、default_runtime
- dataset 用于解析每个数据集
- model 用作配置每个模型的基类
- schedule 用于配置优化器、学习率策略以及训练的迭代次数
- default_runtime 是训练时的 log、tensorboard,以及 nccl、cudnn_benchmark 等运行时设置
mmsegmentation 遵循这样的命名 {model}_{backbone}_[misc]_[gpu x batch_per_gpu]_{resolution}_{schedule}_{dataset}
- `{model}`: model type like `psp`, `deeplabv3`, etc.
- `{backbone}`: backbone type like `r50` (ResNet-50), `x101` (ResNeXt-101).
- `[misc]`: miscellaneous setting/plugins of model, e.g. `dconv`, `gcb`, `attention`, `mstrain`.
- `[gpu x batch_per_gpu]`: GPUs and samples per GPU, `8x2` is used by default.
- `{schedule}`: training schedule, `20ki` means 20k iterations.
- `{dataset}`: dataset like `cityscapes`, `voc12aug`, `ade`.
mmsegmentation 建议使用者对基类进行继承,如果需要开发全新的模型,就重写一个 model 基类
例如这是 deeplabv3plus_r50-d8_512x1024_40k_cityscapes.py 的内容
_base_ = [
    '../_base_/models/deeplabv3plus_r50-d8.py',
    '../_base_/datasets/cityscapes.py', '../_base_/default_runtime.py',
    '../_base_/schedules/schedule_40k.py'
]
这是 deeplabv3plus_r101-d8_512x1024_40k_cityscapes.py 的内容
_base_ = './deeplabv3plus_r50-d8_512x1024_40k_cityscapes.py'
model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101))
可见 r50-d8 的全部pipeline 都是继承于基类,也就是说基类就是定义了一个 deeplabv3plus_r50-d8。r101-d8 的配置文件直接继承 r50-d8 并把 backbone 修改下即可,这种通过配置文件定义整个 pipeline 的方式值得学习
接下来我们对 deeplabv3plus_r50-d8_512x1024_40k_cityscapes.py 配置文件的内容做一个解析,其实也就是对相应的基类做解析。
# 2.1、deeplabv3plus 的 model 基类
# model settings
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
type='EncoderDecoder',
pretrained='open-mmlab://resnet50_v1c',
backbone=dict(
type='ResNetV1c', # backbone 的类型
depth=50, # backbone 的深度
num_stages=4, # backbone 的stage
out_indices=(0, 1, 2, 3), # 在每个阶段中生成的输出特征图的index。
dilations=(1, 1, 2, 4), # 每层的 dilation rate
strides=(1, 2, 1, 1), # 每层的 stride
norm_cfg=norm_cfg, # 配置归一化的方式,一般使用 SyncBN
norm_eval=False, # 是否冻结 BN 的统计量(即是否将 BN 层设为 eval 模式)
style='pytorch',
contract_dilation=True),
decode_head=dict(
type='DepthwiseSeparableASPPHead', # decode head 的类型
in_channels=2048, # decode head 的输入通道数
in_index=3, # 选择的特征图的 index
channels=512, # decode head 的通道数
dilations=(1, 12, 24, 36), # 空洞卷积的尺度
c1_in_channels=256, # c1 输入的通道数
c1_channels=48, # c1 输出的通道数
dropout_ratio=0.1, # dropout 的概率
num_classes=19, # 输出的类别数量
norm_cfg=norm_cfg, # 归一化的方式
align_corners=False, # 解码过程中 resize 使用的 align_corners 参数
loss_decode=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), # decode head 的损失
auxiliary_head=dict(
type='FCNHead', # auxiliary head 的类型
in_channels=1024, # 输入的通道数
in_index=2, # 输入的特征图的 index
channels=256, # 输出的通道数
num_convs=1, # auxiliary head 卷积的数量
concat_input=False, # 在分类前是否 concat 输入以及卷积的输出
dropout_ratio=0.1, # dropout 的概率
num_classes=19, # 输出的类别数量
norm_cfg=norm_cfg, # 归一化的方式
align_corners=False,
loss_decode=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), # auxiliary head 的损失
# model training and testing settings
train_cfg=dict(), # 这里仅仅是占位符
test_cfg=dict(mode='whole')) # 测试的方式, 'whole' 和 'sliding','whole': 整张图测试, 'sliding': 以滑窗的方式测试
# 2.2、cityscapes 的 dataset 基类
# dataset settings
dataset_type = 'CityscapesDataset' # 定义数据集的类型
data_root = 'data/cityscapes/' # 数据集的根目录
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
to_rgb=True) # 图像的均值以及标准差,用作归一化
crop_size = (512, 1024) # 训练时随机裁剪的尺寸
train_pipeline = [ # 训练的 pipeline
dict(type='LoadImageFromFile'), # 首先从给定的文件路径读取图像
dict(type='LoadAnnotations'), # 第二步是加载标注
dict(type='Resize', img_scale=(2048, 1024), ratio_range=(0.5, 2.0)), # 第三步 Resize
dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), # 第四步 随机裁剪
dict(type='RandomFlip', prob=0.5), # 第五步 随机翻转
dict(type='PhotoMetricDistortion'), # 第六步 PhotoMetricDistortion 数据增强
dict(type='Normalize', **img_norm_cfg), # 第七步 归一化
dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), # 第八步 将图像 pad 到 crop_size 大小
dict(type='DefaultFormatBundle'), # 第九步
dict(type='Collect', keys=['img', 'gt_semantic_seg']), # collect keys里面的数据
]
test_pipeline = [ # 测试的 pipeline
dict(type='LoadImageFromFile'), # 首先从给定的文件路径读取图像
dict(
type='MultiScaleFlipAug', # 多尺度翻转增强
img_scale=(2048, 1024), # 图像尺度
# img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], # 是否多尺度测试
flip=False, # 是否翻转
transforms=[
dict(type='Resize', keep_ratio=True), # Resize
dict(type='RandomFlip'), # 随机翻转
dict(type='Normalize', **img_norm_cfg), # 归一化
dict(type='ImageToTensor', keys=['img']), # 图像转Tensor
dict(type='Collect', keys=['img']), #
])
]
data = dict(
samples_per_gpu=2, # 每个 gpu 的 batch size
workers_per_gpu=2, # 每个 gpu 的 worker
train=dict(
type=dataset_type, # 数据集类型
data_root=data_root, # 数据根目录
img_dir='leftImg8bit/train', # 训练图像文件的目录
ann_dir='gtFine/train', # 训练标注文件的目录
pipeline=train_pipeline), # train_pipeline
val=dict(
type=dataset_type, # 数据集类型
data_root=data_root, # 数据根目录
img_dir='leftImg8bit/val', # 验证图像文件的目录
ann_dir='gtFine/val', # 验证标注文件的目录
pipeline=test_pipeline), # test_pipeline
test=dict(
type=dataset_type, # 数据集类型
data_root=data_root, # 数据根目录
img_dir='leftImg8bit/val', # 测试图像文件的目录
ann_dir='gtFine/val', # 测试标注文件的目录
pipeline=test_pipeline)) # test_pipeline
# 2.3、deeplabv3plus 的 schedules 基类
# optimizer
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) # 优化器的配置
optimizer_config = dict()
# learning policy
lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) # 学习率的配置
# runtime settings
runner = dict(type='IterBasedRunner', max_iters=40000)
checkpoint_config = dict(by_epoch=False, interval=4000)
evaluation = dict(interval=4000, metric='mIoU') # 评估间隔和metric的设置
# 2.4、deeplabv3plus 的 default_runtime 基类
# yapf:disable
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook', by_epoch=False), # TextLogger
# dict(type='TensorboardLoggerHook') # TensorboardLogger
])
# yapf:enable
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]
cudnn_benchmark = True
# Tutorial 2:自定义数据集
# 1、通过调整数据格式来自定义数据集(最简易的方式)
├── data
│ ├── my_dataset
│ │ ├── img_dir
│ │ │ ├── train
│ │ │ │ ├── xxx{img_suffix}
│ │ │ │ ├── yyy{img_suffix}
│ │ │ │ ├── zzz{img_suffix}
│ │ │ ├── val
│ │ ├── ann_dir
│ │ │ ├── train
│ │ │ │ ├── xxx{seg_map_suffix}
│ │ │ │ ├── yyy{seg_map_suffix}
│ │ │ │ ├── zzz{seg_map_suffix}
│ │ │ ├── val
- 一个训练对有相同的后缀 suffix
- 标注文件是(H, W)的图像,像素值的取值范围应当是【0,num_classes - 1】,可以通过 'P' 模式使用pillow 加载彩色标注图像
# 2、通过混合数据集来自定义数据集
mmsegmentation 也支持混合数据集用于训练
# 2.1、重复数据集
dataset_A_train = dict(
type='RepeatDataset',
times=N,
dataset=dict( # This is the original config of Dataset_A
type='Dataset_A',
...
pipeline=train_pipeline
)
)
# 2.2、连接数据集
可以连接两个标注目录
dataset_A_train = dict(
type='Dataset_A',
img_dir = 'img_dir',
ann_dir = ['anno_dir_1', 'anno_dir_2'],
pipeline=train_pipeline
)
可以连接两个数据集划分
dataset_A_train = dict(
type='Dataset_A',
img_dir = 'img_dir',
ann_dir = 'anno_dir',
split = ['split_1.txt', 'split_2.txt'],
pipeline=train_pipeline
)
也可以同时连接两个标注目录以及数据集划分,anno_dir_1 对应 split_1.txt,anno_dir_2对应 split_2.txt,
dataset_A_train = dict(
type='Dataset_A',
img_dir = 'img_dir',
ann_dir = ['anno_dir_1', 'anno_dir_2'],
split = ['split_1.txt', 'split_2.txt'],
pipeline=train_pipeline
)
# 参考资料
- https://mmsegmentation.readthedocs.io/en/latest/tutorials/
- 02
- README 美化05-20
- 03
- 常见 Tricks 代码片段05-12