DeepSpeed

  • 安装DeepSpeed
git clone https://github.com/microsoft/DeepSpeed.git
cd DeepSpeed
git checkout v0.9.5

TORCH_CUDA_ARCH_LIST="6.1;7.5;8.6;8.9" \
DS_BUILD_CCL_COMM=1 \
DS_BUILD_CPU_ADAM=1 \
DS_BUILD_CPU_ADAGRAD=1 \
DS_BUILD_FUSED_ADAM=1 \
DS_BUILD_FUSED_LAMB=1 \
DS_BUILD_UTILS=1 \
python setup.py build_ext -j24 bdist_wheel

# 进入DeepSpeed目录下的dist目录, 安装whl文件
pip install *.whl

更多配置信息可见于OvJat/DeepSpeedTutorial: DeepSpeed Tutorial (github.com)

  • 训练的模型不止一个时 (此处以stable diffusion为例), an example:

cli/jobs/deepspeed/deepspeed-training/src

  • 安装mpi4py

sudo apt update && sudo apt-get install libopenmpi-dev

pip install mpi4py

  • hf与deepspeed的配置冲突, 将deepspeed的相应配置修改为"auto"

"bf16": { "enabled": "auto" }

  • 使用deepspeed需要安装bitsandbytes

pip install bitsandbytes

  • 多卡运行

CUDA_VISIBLE_DEVICES=0,1 torchrun --nproc_per_node=2 --master_port=29500 finetune.py

运行时间: 03:33<12:41:38

  • 单卡运行

CUDA_VISIBLE_DEVICES=0 torchrun --nproc_per_node=1 --master_port=29500 finetune.py

运行时间: 03:38<46:53:33

  • 如何调用
# 单GPU的使用方法
deepspeed --num_gpus=1 examples/pytorch/translation/run_translation.py ...
# 单GPU并指定对应的GPU
deepspeed --include localhost:1 examples/pytorch/translation/run_translation.py ...
# 单GPU使用DeepSpeed的优势:
# 使用ZeRO-offload, 将部分数据offload到CPU,降低对显存的需求
# 提供对显存的管理, 减少显存的碎片

# 多GPU的使用方法1
python -m torch.distributed.run --nproc_per_node=2 your_program.py <normal cl args> --deepspeed ds_config.json
# 多GPU的使用方法2
deepspeed --num_gpus=2 your_program.py <normal cl args> --deepspeed ds_config.json

# 多节点多卡方法1,需要在多个节点上手动启动
python -m torch.distributed.run --nproc_per_node=8 --nnode=2 --node_rank=0 --master_addr=hostname1 --master_port=9901 your_program.py <normal cl args> --deepspeed ds_config.json
# 多节点多卡方法2,需要创建一个 hostfile 文件,只需在一个节点上启动
hostname1 slots=8 # 表示该机器有8个GPU用于训练
hostname2 slots=8
# 然后运行
deepspeed --num_gpus 8 --num_nodes 2 --hostfile hostfile --master_addr hostname1 --master_port=9901 your_program.py <normal cl args> --deepspeed ds_config.json
  • 传递参数
TrainingArguments(..., deepspeed="config/ds_config.json")
# or
ds_config_dict = dict(scheduler=scheduler_params, optimizer=optimizer_params)
TrainingArguments(..., deepspeed=ds_config_dict)
  • ZeRO-stage-0配置示例
# 禁用所有的分片, 把DeepSpeed当作DDP来使用
{
"zero_optimization": {
"stage": 0
}
}
  • ZeRO-stage-1配置示例
{
"zero_optimization": {
"stage": 1
}
}
  • ZeRO-stage-2配置示例
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},

"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},

"scheduler": {
"type": "WarmupLR",
"params": {
"warmup_min_lr": "auto",
"warmup_max_lr": "auto",
"warmup_num_steps": "auto"
}
},

"zero_optimization": {
"stage": 2,
"offload_optimizer": {
"device": "cpu",
"pin_memory": true
},
"allgather_partitions": true,
"allgather_bucket_size": 2e8,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 2e8,
"contiguous_gradients": true
},

"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 2000,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
  • ZeRO-stage-3配置示例
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},

"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},

"scheduler": {
"type": "WarmupLR",
"params": {
"warmup_min_lr": "auto",
"warmup_max_lr": "auto",
"warmup_num_steps": "auto"
}
},

"zero_optimization": {
"stage": 3,
"offload_optimizer": {
"device": "cpu",
"pin_memory": true
},
"offload_param": {
"device": "cpu",
"pin_memory": true
},
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9,
"reduce_bucket_size": "auto",
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto",
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_gather_16bit_weights_on_model_save": true
},

"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 2000,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}

// original
// {
// "zero_optimization": {
// "stage": 3,
// "offload_optimizer": {
// "device": "none",
// "pin_memory": true
// },
// "offload_param": {
// "device": "none",
// "pin_memory": true
// },
// "overlap_comm": true,
// "contiguous_gradients": true,
// "sub_group_size": 1e9,
// "reduce_bucket_size": "auto",
// "stage3_prefetch_bucket_size": "auto",
// "stage3_param_persistence_threshold": "auto",
// "stage3_max_live_parameters": 1e9,
// "stage3_max_reuse_distance": 1e9,
// "stage3_gather_16bit_weights_on_model_save": true
// },
// "train_batch_size": "auto",
// "train_micro_batch_size_per_gpu": "auto",
// "gradient_accumulation_steps": "auto",
// "bf16": {
// "enabled": true
// }
// }
  • inference
deepspeed --num_gpus=2 your_program.py <normal cl args> --do_eval --deepspeed ds_config.json
  • Accelerate

🤗Accelerate库的使用指南和案例 - 知乎 (zhihu.com)