DeepSpeed

  • 安装DeepSpeed
git clone https://github.com/microsoft/DeepSpeed.git
cd DeepSpeed
git checkout v0.9.5

TORCH_CUDA_ARCH_LIST="6.1;7.5;8.6;8.9" \
DS_BUILD_CCL_COMM=1 \
DS_BUILD_CPU_ADAM=1 \
DS_BUILD_CPU_ADAGRAD=1 \
DS_BUILD_FUSED_ADAM=1 \
DS_BUILD_FUSED_LAMB=1 \
DS_BUILD_UTILS=1 \
python setup.py build_ext -j24 bdist_wheel

# 进入DeepSpeed目录下的dist目录, 安装whl文件
pip install *.whl

更多配置信息可见于OvJat/DeepSpeedTutorial: DeepSpeed Tutorial (github.com)

  • 训练的模型不止一个时 (此处以stable diffusion为例), an example:

cli/jobs/deepspeed/deepspeed-training/src

  • 安装mpi4py

sudo apt update && sudo apt-get install libopenmpi-dev

pip install mpi4py

  • hf与deepspeed的配置冲突, 将deepspeed的相应配置修改为"auto"

"bf16": { "enabled": "auto" }

  • 使用deepspeed需要安装bitsandbytes

pip install bitsandbytes

  • 多卡运行

CUDA_VISIBLE_DEVICES=0,1 torchrun --nproc_per_node=2 --master_port=29500 finetune.py

运行时间: 03:33<12:41:38

  • 单卡运行

CUDA_VISIBLE_DEVICES=0 torchrun --nproc_per_node=1 --master_port=29500 finetune.py

运行时间: 03:38<46:53:33

  • 如何调用
# 单GPU的使用方法
deepspeed --num_gpus=1 examples/pytorch/translation/run_translation.py ...
# 单GPU并指定对应的GPU
deepspeed --include localhost:1 examples/pytorch/translation/run_translation.py ...
# 单GPU使用DeepSpeed的优势:
# 使用ZeRO-offload, 将部分数据offload到CPU,降低对显存的需求
# 提供对显存的管理, 减少显存的碎片

# 多GPU的使用方法1
python -m torch.distributed.run --nproc_per_node=2 your_program.py <normal cl args> --deepspeed ds_config.json
# 多GPU的使用方法2
deepspeed --num_gpus=2 your_program.py <normal cl args> --deepspeed ds_config.json

# 多节点多卡方法1,需要在多个节点上手动启动
python -m torch.distributed.run --nproc_per_node=8 --nnode=2 --node_rank=0 --master_addr=hostname1 --master_port=9901 your_program.py <normal cl args> --deepspeed ds_config.json
# 多节点多卡方法2,需要创建一个 hostfile 文件,只需在一个节点上启动
hostname1 slots=8 # 表示该机器有8个GPU用于训练
hostname2 slots=8
# 然后运行
deepspeed --num_gpus 8 --num_nodes 2 --hostfile hostfile --master_addr hostname1 --master_port=9901 your_program.py <normal cl args> --deepspeed ds_config.json
  • 传递参数
TrainingArguments(..., deepspeed="config/ds_config.json")
# or
ds_config_dict = dict(scheduler=scheduler_params, optimizer=optimizer_params)
TrainingArguments(..., deepspeed=ds_config_dict)
  • ZeRO-stage-0配置示例
# 禁用所有的分片, 把DeepSpeed当作DDP来使用
{
"zero_optimization": {
"stage": 0
}
}
  • ZeRO-stage-1配置示例
{
"zero_optimization": {
"stage": 1
}
}
  • ZeRO-stage-2配置示例
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},

"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},

"scheduler": {
"type": "WarmupLR",
"params": {
"warmup_min_lr": "auto",
"warmup_max_lr": "auto",
"warmup_num_steps": "auto"
}
},

"zero_optimization": {
"stage": 2,
"offload_optimizer": {
"device": "cpu",
"pin_memory": true
},
"allgather_partitions": true,
"allgather_bucket_size": 2e8,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 2e8,
"contiguous_gradients": true
},

"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 2000,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
  • ZeRO-stage-3配置示例
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},

"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},

"scheduler": {
"type": "WarmupLR",
"params": {
"warmup_min_lr": "auto",
"warmup_max_lr": "auto",
"warmup_num_steps": "auto"
}
},

"zero_optimization": {
"stage": 3,
"offload_optimizer": {
"device": "cpu",
"pin_memory": true
},
"offload_param": {
"device": "cpu",
"pin_memory": true
},
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9,
"reduce_bucket_size": "auto",
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto",
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_gather_16bit_weights_on_model_save": true
},

"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 2000,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}

// original
// {
// "zero_optimization": {
// "stage": 3,
// "offload_optimizer": {
// "device": "none",
// "pin_memory": true
// },
// "offload_param": {
// "device": "none",
// "pin_memory": true
// },
// "overlap_comm": true,
// "contiguous_gradients": true,
// "sub_group_size": 1e9,
// "reduce_bucket_size": "auto",
// "stage3_prefetch_bucket_size": "auto",
// "stage3_param_persistence_threshold": "auto",
// "stage3_max_live_parameters": 1e9,
// "stage3_max_reuse_distance": 1e9,
// "stage3_gather_16bit_weights_on_model_save": true
// },
// "train_batch_size": "auto",
// "train_micro_batch_size_per_gpu": "auto",
// "gradient_accumulation_steps": "auto",
// "bf16": {
// "enabled": true
// }
// }
  • inference
deepspeed --num_gpus=2 your_program.py <normal cl args> --do_eval --deepspeed ds_config.json
  • Accelerate

🤗Accelerate库的使用指南和案例 - 知乎 (zhihu.com)