# Code: Fine-Tuning a Model on Multiple GPUs

## :memo: Where do I start?

- Contact: Ms. Wang, email us at 2303117@narlabs.org.tw

### Preparation

- Download the container image

```
singularity pull docker://c00cjz00/c00cjz00_cuda11.8_pytorch:2.1.2-cuda11.8-cudnn8-devel-llama_factory
```

- Packages required for a native (non-container) installation

```
# Install the required Ubuntu packages
apt install libfontconfig libaio-dev libibverbs-dev jq

# Install the LLaMA-Factory related Python packages
pip install llmtuner==0.5.3 deepspeed==0.13.1 bitsandbytes==0.42.0 opencc opencc-python-reimplemented
```

### Code: C02_LLaMA-Factory-Cmd-lora_nGPUs.ipynb

```
%%bash
## Remember to change HF_TOKEN='hf_' to your Hugging Face token
## This cell writes the fine-tuning command into demo.cmd
## Pay special attention to the virtual absolute path /DEEPSPEED/LLaMA-Factory
## (your current working directory must contain a LLaMA-Factory directory)
## Alternative flag values:
## --model_name_or_path meta-llama/Llama-2-7b-hf
## --model_name_or_path /work/u00cjz00/slurm_jobs/github/models/Llama-2-7b-chat-hf
## --deepspeed examples/full_multi_gpu/ds_z3_config.json \

cat << \EOF > demo.cmd
#!/bin/bash
# Virtual absolute path
cd /DEEPSPEED/LLaMA-Factory

## Kill any leftover training processes and clear the previous output
ps -ef | grep 'train_bash.py' | awk '{print $2}' | xargs -r kill -9
rm -rf ../saves/LLaMA2-7B/lora/sft
sleep 10

## Fine-tuning command
deepspeed --num_gpus ${GPUS_PER_NODE} src/train_bash.py \
    --deepspeed examples/full_multi_gpu/ds_z2_config.json \
    --stage sft \
    --do_train \
    --model_name_or_path /work/u00cjz00/slurm_jobs/github/models/Llama-2-7b-chat-hf \
    --dataset alpaca_gpt4_zh \
    --dataset_dir data \
    --template default \
    --finetuning_type lora \
    --lora_target q_proj,v_proj \
    --output_dir ../saves/LLaMA2-7B/lora/sft \
    --overwrite_cache \
    --overwrite_output_dir \
    --cutoff_len 1024 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 8 \
    --lr_scheduler_type cosine \
    --logging_steps 10 \
    --save_steps 100 \
    --eval_steps 100 \
    --evaluation_strategy steps \
    --load_best_model_at_end \
    --learning_rate 5e-5 \
    --num_train_epochs 3.0 \
    --max_samples 1000 \
    --val_size 0.1 \
    --plot_loss \
    --fp16
EOF

## Make the script executable and (optionally) print it
chmod 755 demo.cmd
#cat demo.cmd
```
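Before submitting through SLURM, demo.cmd can optionally be smoke-tested by running it in the same container without the scheduler. The sketch below is illustrative and not part of the original workflow: it assumes you have interactive access to a machine where the GPUs, the .sif image, and the model path above are all visible, and the GPU count of 2 is an assumed value. Singularity passes host environment variables into the container by default, which is also how the SLURM script below delivers GPUS_PER_NODE to demo.cmd.

```
## Optional smoke test (illustrative; assumes interactive access to a GPU machine)
export GPUS_PER_NODE=2   # assumed value; set to the number of GPUs actually available
singularity exec --nv \
-B $PWD:/DEEPSPEED \
-B /work/u00cjz00/slurm_jobs/github/models/Llama-2-7b-chat-hf \
/work/u00cjz00/nvidia/cuda118/c00cjz00_cuda11.8_pytorch_2.1.2-cuda11.8-cudnn8-devel-llama_factory.sif \
bash -c '/DEEPSPEED/demo.cmd'
```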
- SLURM job

```
%%bash
# SLURM job dispatch; example flag meanings:
# --nodes=2      (2 machines)
# --gres=gpu:2   (2 GPUs per machine)
# -c 8           (8 CPU cores per machine)

## Save the text between cat << \EOF > ... and EOF as demo.slurm
cat << \EOF > demo.slurm
#!/bin/bash
#SBATCH -A MST110386            ### Project number, Example MST109178
#SBATCH -J _t2demo_             ### Job name, Example jupyterlab
#SBATCH -p gp4d                 ### Partition name, Example ngs1gpu
#SBATCH --nodes=1               ### Nodes, Default 1, node number
#SBATCH --ntasks-per-node=1     ### Tasks, Default 1, tasks per node
#SBATCH -c 32                   ### Cores assigned to each task, Example 4
#SBATCH --gres=gpu:8            ### GPU number, Example gpu:1
#SBATCH --time=0-1:00:00        ### Running time, days-hours:minutes:seconds or hours:minutes:seconds
#SBATCH -o demo.out             ### Standard output log (use %j in the name to include the job ID)
#SBATCH -e demo.err

## Environment variables
export GPUS_PER_NODE=8

## Run demo.cmd (from the cell above) inside the Singularity container.
## -B $PWD:/DEEPSPEED mounts the current directory as the virtual directory /DEEPSPEED,
## matching the virtual absolute path cd /DEEPSPEED/LLaMA-Factory used in the previous cell.
srun singularity exec --nv \
-B $PWD:/DEEPSPEED \
-B /work/u00cjz00/slurm_jobs/github/models/Llama-2-7b-chat-hf \
/work/u00cjz00/nvidia/cuda118/c00cjz00_cuda11.8_pytorch_2.1.2-cuda11.8-cudnn8-devel-llama_factory.sif \
bash -c '/DEEPSPEED/demo.cmd'
EOF

## Make the script executable and (optionally) print it
chmod 755 demo.slurm
#cat demo.slurm
```

- Submit the job (if the sbatch line below is commented out with `#`, remove the `#` to actually submit). On success, sbatch prints the JOBID, which you can use to cancel the job mid-run.

```
!sbatch demo.slurm
```

- Check the submission status

```
!echo "Check submission status"
!squeue -u $(whoami)
```

- Print the messages produced while the job runs

```
!echo "Print job output"
!tail -f demo.out demo.err
```

- Cancel the job with `scancel $JOBID` (replace `$JOBID` with the JOBID printed by sbatch)

```
!scancel $JOBID
```

### ds_z2_config.json

The DeepSpeed ZeRO stage-2 configuration referenced above as examples/full_multi_gpu/ds_z2_config.json:

```
{
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "gradient_accumulation_steps": "auto",
  "gradient_clipping": "auto",
  "zero_allow_untested_optimizer": true,
  "fp16": {
    "enabled": "auto",
    "loss_scale": 0,
    "loss_scale_window": 1000,
    "initial_scale_power": 16,
    "hysteresis": 2,
    "min_loss_scale": 1
  },
  "bf16": {
    "enabled": "auto"
  },
  "zero_optimization": {
    "stage": 2,
    "allgather_partitions": true,
    "allgather_bucket_size": 5e8,
    "overlap_comm": true,
    "reduce_scatter": true,
    "reduce_bucket_size": 5e8,
    "contiguous_gradients": true,
    "round_robin_gradients": true
  }
}
```
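Since train_batch_size, train_micro_batch_size_per_gpu, and gradient_accumulation_steps are all set to "auto", DeepSpeed fills them in from the Trainer arguments passed in demo.cmd. As a quick sanity check, the effective global batch size can be computed by hand: per-device micro batch × gradient accumulation steps × total number of GPUs. A minimal sketch with the values used in this tutorial (the variable names are illustrative):

```
## Illustrative arithmetic only: the global batch size implied by the settings above
MICRO_BATCH=1   # --per_device_train_batch_size
GRAD_ACCUM=8    # --gradient_accumulation_steps
NUM_GPUS=8      # GPUS_PER_NODE x 1 node
echo "train_batch_size = $(( MICRO_BATCH * GRAD_ACCUM * NUM_GPUS ))"   # prints 64
```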