# Code: Multi-Node Model Fine-Tuning with torchrun

## :memo: Where do I start?

- Contact: Email us at 2303117@narlabs.org.tw (Ms. Wang)

### Prerequisites

- Download the container image

```
singularity pull docker://c00cjz00/c00cjz00_cuda11.8_pytorch:2.1.2-cuda11.8-cudnn8-devel-llama_factory
```

- Packages required for a native (non-container) install

```
# Install the required Ubuntu packages
apt install libfontconfig libaio-dev libibverbs-dev jq

# Install the LLaMA-Factory related Python packages
pip install llmtuner==0.5.3 deepspeed==0.13.1 bitsandbytes==0.42.0 opencc opencc-python-reimplemented
```

### Code: C03_LLaMA-Factory-Cmd-lora_nNodes.ipynb

```
%%bash
## Remember to edit HF_TOKEN='hf_' with your Hugging Face token
## This cell writes the fine-tuning commands to demo.cmd
## Note the virtual absolute path /DEEPSPEED/LLaMA-Factory (your current working directory must contain a LLaMA-Factory directory)
## --model_name_or_path meta-llama/Llama-2-7b-hf
## --model_name_or_path /work/u00cjz00/slurm_jobs/github/models/Llama-2-7b-chat-hf
## --deepspeed examples/full_multi_gpu/ds_z3_config.json \
cat << \EOF > demo.cmd
#!/bin/bash
# Virtual absolute path
cd /DEEPSPEED/LLaMA-Factory

## Kill any leftover training processes and clear the previous output
## (grep -v grep keeps the pipeline from targeting itself; xargs -r skips the kill when nothing matches)
ps -ef | grep 'train_bash.py' | grep -v grep | awk '{print $2}' | xargs -r kill -9
rm -rf ../saves/LLaMA2-7B/lora/sft
sleep 10

## Fine-tuning launch
torchrun \
    --nproc_per_node ${GPUS_PER_NODE} \
    --master_addr ${MASTER_ADDR} \
    --master_port ${MASTER_PORT} \
    --nnodes ${SLURM_NNODES} \
    --node_rank ${SLURM_PROCID} \
    src/train_bash.py \
    --deepspeed examples/full_multi_gpu/ds_z2_config.json \
    --stage sft \
    --do_train \
    --model_name_or_path /work/u00cjz00/slurm_jobs/github/models/Llama-2-7b-chat-hf \
    --dataset alpaca_gpt4_zh \
    --dataset_dir data \
    --template default \
    --finetuning_type lora \
    --lora_target q_proj,v_proj \
    --output_dir ../saves/LLaMA2-7B/lora/sft \
    --overwrite_cache \
    --overwrite_output_dir \
    --cutoff_len 1024 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 8 \
    --lr_scheduler_type cosine \
    --logging_steps 10 \
    --save_steps 100 \
    --eval_steps 100 \
    --evaluation_strategy steps \
    --load_best_model_at_end \
    --learning_rate 5e-5 \
    --num_train_epochs 3.0 \
    --max_samples 1000 \
    --val_size 0.1 \
    --plot_loss \
    --fp16
EOF

## Make the script executable and optionally print it
chmod 755 demo.cmd
#cat demo.cmd
```
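- Optional: single-node smoke test. Before queuing a two-node job, it can be worth running demo.cmd once on a single node to catch path or container problems early. A minimal sketch, assuming an interactive node with 4 GPUs and the same container; the rendezvous variables that demo.slurm would normally export (see the next step) are set by hand here, and the port 29500 is an arbitrary example:

```
# Hypothetical single-node dry run: demo.cmd reads the same environment
# variables that demo.slurm exports, so set them manually for one node.
export GPUS_PER_NODE=4
export MASTER_ADDR=127.0.0.1
export MASTER_PORT=29500
export SLURM_NNODES=1
export SLURM_PROCID=0
singularity exec --nv \
    -B $PWD:/DEEPSPEED \
    -B /work/u00cjz00/slurm_jobs/github/models/Llama-2-7b-chat-hf \
    /work/u00cjz00/nvidia/cuda118/c00cjz00_cuda11.8_pytorch_2.1.2-cuda11.8-cudnn8-devel-llama_factory.sif \
    bash -c '/DEEPSPEED/demo.cmd'
```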
- SLURM job

```
%%bash
# SLURM job submission; the three values you are most likely to adjust:
# --nodes=2 (2 machines)
# --gres=gpu:4 (4 GPUs per machine)
# -c 16 (16 CPU cores per machine)
## The text between << \EOF and EOF is saved as demo.slurm
cat << \EOF > demo.slurm
#!/bin/bash
#SBATCH -A MST110386            ### Project number, Example MST109178
#SBATCH -J _t2demo_             ### Job name, Example jupyterlab
#SBATCH -p gp4d                 ### Partition name, Example ngs1gpu
#SBATCH --nodes=2               ### Nodes, Default 1, node number
#SBATCH --ntasks-per-node=1     ### Tasks, Default 1, tasks per node
#SBATCH -c 16                   ### Cores assigned to each task, Example 4
#SBATCH --gres=gpu:4            ### GPU number, Example gpu:1
#SBATCH --time=0-1:00:00        ### Running time
#SBATCH -o demo.out             ### Log file; %j would expand to the job ID
#SBATCH -e demo.err

# Environment variables
## GPUs per node
export GPUS_PER_NODE=4
## Hostname or IP of the master node
export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
## Free port on the master node
export MASTER_PORT=$(python -c "import socket; s = socket.socket(socket.AF_INET, socket.SOCK_STREAM); s.bind(('', 0)); addr = s.getsockname(); s.close(); print(addr[1])")
## To change the NCCL transport interface: the default is ib (covering ib1-4); you can switch to vlan304 (a regular NIC)
###export NCCL_SOCKET_IFNAME=vlan304

# Run the demo.cmd written above inside the Singularity container
## $PWD:/DEEPSPEED mounts the current directory as the virtual path /DEEPSPEED
## The model directory /work/u00cjz00/slurm_jobs/github/models/Llama-2-7b-chat-hf must also be bind-mounted
srun singularity exec --nv \
    -B $PWD:/DEEPSPEED \
    -B /work/u00cjz00/slurm_jobs/github/models/Llama-2-7b-chat-hf \
    /work/u00cjz00/nvidia/cuda118/c00cjz00_cuda11.8_pytorch_2.1.2-cuda11.8-cudnn8-devel-llama_factory.sif \
    bash -c '/DEEPSPEED/demo.cmd'
EOF

# Make demo.slurm executable
chmod 755 demo.slurm
# Print demo.slurm (optional)
##cat demo.slurm
```

- Submit the job (if the line below is commented out with #, remove the # first). The command prints a JOBID, which you can use to cancel the job mid-run.

```
!sbatch demo.slurm
```

- Check the submission status

```
!echo "Check the submission status"
!squeue -u $(whoami)
```

- Print the training log as it runs

```
!echo "Print the training log"
!tail -f demo.out demo.err
```

- Cancel the job with scancel $JOBID

```
!scancel $JOBID
```

### ds_z2_config.json

```
{
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "gradient_accumulation_steps": "auto",
  "gradient_clipping": "auto",
  "zero_allow_untested_optimizer": true,
  "fp16": {
    "enabled": "auto",
    "loss_scale": 0,
    "loss_scale_window": 1000,
    "initial_scale_power": 16,
    "hysteresis": 2,
    "min_loss_scale": 1
  },
  "bf16": {
    "enabled": "auto"
  },
  "zero_optimization": {
    "stage": 2,
    "allgather_partitions": true,
    "allgather_bucket_size": 5e8,
    "overlap_comm": true,
    "reduce_scatter": true,
    "reduce_bucket_size": 5e8,
    "contiguous_gradients": true,
    "round_robin_gradients": true
  }
}
```
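- Optional: validate the DeepSpeed config. The torchrun command points at examples/full_multi_gpu/ds_z2_config.json, so a malformed config only surfaces after the multi-node job has started. A minimal pre-flight sketch, assuming the LLaMA-Factory directory sits in the current working directory as described above; it uses jq, which the prerequisite step installs:

```
# Pre-flight check: jq exits non-zero on invalid JSON, so a broken
# DeepSpeed config fails here rather than inside the SLURM job.
jq . LLaMA-Factory/examples/full_multi_gpu/ds_z2_config.json > /dev/null \
    && echo "ds_z2_config.json: valid JSON" \
    || echo "ds_z2_config.json: invalid JSON"
```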
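- Troubleshooting: confirm the NCCL interface name. demo.slurm notes that NCCL_SOCKET_IFNAME can be switched from the ib default to vlan304; the value must name an interface that exists on every compute node, or NCCL initialization will hang. A minimal check sketch (the interface names here are examples and may not match your cluster):

```
# List NIC names on this node so the value exported as
# NCCL_SOCKET_IFNAME (e.g. ib0 or vlan304) can be confirmed.
ip -o link show | awk -F': ' '{print $2}'
```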