# FL 觀察檔案生成

跑了兩個訓練(hello-pt 和 hello-pt-tb),MONAI 跟 hello-tf2 環境需要另外安裝

```
pip3 install monai
pip3 install tensorflow
```

## Server:

訓練的時候會生成一個資料夾(名稱是job_id),結束訓練時會**自動銷毀**

主要的檔案路徑:

```
/poc/server/transfer/job-id/workspace
```

* FL_global_model.pt -- 全域模型(/app_server)
* cross_val_results.json -- 紀錄各站點訓練的準確率(? (/cross_site_val)

![](https://i.imgur.com/3QiIlFc.png)
![](https://i.imgur.com/R9cdPRi.png)

## Client:

訓練的時候會生成一個資料夾(名稱是job_id),結束訓練時**不會**銷毀,有自己的本地model

主要的檔案路徑:

```
/poc/site-n/transfer/job-id
```

* local_model.pt -- 本地模型(/models)

![](https://i.imgur.com/xr5wCpg.png)

## Admin:

需先使用download_job指令將server的模型下載到admin的資料夾內

主要的檔案路徑: (訓練完的檔案內容和server的完全一樣)

```
/poc/admin/transfer/job-id/workspace/
```

![](https://i.imgur.com/2Mgv3Vq.png)

## 新的nvflare_reservation_slurm.sh

```shell
#!/bin/bash
# Build per-role NVFlare launch scripts (server / admin / hpc-site / hospital-site),
# upload them to S3 and print presigned URLs for distribution.
#
# Example:
#   bash nvflare_reservation.sh hpc-site-1,hpc-site-2 hospital-ncku-1,hospital-ncku-2

## argv for hpc and hospital (comma-separated site lists, or "none")
hpcList=$1
hospitalList=$2

#######################################
# slurmQueue -- write a SLURM batch header into the file named by $FILE.
# Globals (read):
#   queue - "gpu" selects the GPU partition header, anything else the CPU one
#   FILE  - path of the header file to (over)write
# NOTE(review): the original wrapped each `cat` in `$( ... )`, which captured
# the (empty, because redirected) output and executed it as a command; a plain
# `cat` is what was intended.
# NOTE(review): "#SBATCH -o log/%j.logi" looks like a typo for "%j.log" --
# kept as-is so the generated header is unchanged; confirm with the author.
#######################################
slurmQueue(){
  if [ "$queue" == "gpu" ]; then
    cat > "$FILE" <<END
#!/bin/bash
#SBATCH -A MST110490             # Account name/project number
#SBATCH -J nvflare_job           # Job name
#SBATCH -p ngs1gpu               # Partition Name 等同PBS裡面的 -q Queue name
#SBATCH -c 6                     # 使用的core數 請參考Queue資源設定
#SBATCH --mem=90g                # 使用的記憶體量 請參考Queue資源設定
#SBATCH --gres=gpu:1             # 使用的GPU數 請參考Queue資源設定
#SBATCH --mail-user=summerhill001@gmail.com    # email
#SBATCH --mail-type=BEGIN,END    # 指定送出email時機 可為NONE, BEGIN, END, FAIL, REQUEUE, ALL
#SBATCH -o log/%j.logi           # Path to the standard output and error files relative to the working directory
END
  else
    cat > "$FILE" <<END
#!/bin/bash
#SBATCH -A MST110490             # Account name/project number
#SBATCH -J nvflare_job           # Job name
#SBATCH -p ngs7G                 # Partition Name 等同PBS裡面的 -q Queue name
#SBATCH -c 2                     # 使用的core數 請參考Queue資源設定
#SBATCH --mem=7g                 # 使用的記憶體量 請參考Queue資源設定
#SBATCH --mail-user=summerhill001@gmail.com    # email
#SBATCH --mail-type=BEGIN,END    # 指定送出email時機 可為NONE, BEGIN, END, FAIL, REQUEUE, ALL
#SBATCH -o log/%j.logi           # Path to the standard output and error files relative to the working directory
END
  fi
}

## session num -- one timestamp identifies both the S3 prefix and the local folder
nvflare_session=$(date '+%Y%m%d%H%M%S')
reservationFolder="/work/$(whoami)/nvflare/reservation_${nvflare_session}"
mkdir -p "${reservationFolder}"
touch "${reservationFolder}/serverDomain.txt"

# 新增一個照時間命名的資料夾 (local folder for the generated scripts).
# Reuse the session timestamp instead of calling `date` a second time, which
# could yield a different name if the second ticked over between the calls.
local_folder_name="nvflare_${nvflare_session}"
mkdir -p "$local_folder_name"

AWS="$HOME/local/bin/aws"
S3_ENDPOINT="http://s3.twcc.ai"

## S3: mirror the reservation folder and presign serverDomain.txt (7 days)
"$AWS" --endpoint-url="$S3_ENDPOINT" s3 sync "${reservationFolder}" "s3://nvflare2/reservation_${nvflare_session}"
url_serverDomain=$("$AWS" --endpoint-url="$S3_ENDPOINT" s3 presign "s3://nvflare2/reservation_${nvflare_session}/serverDomain.txt" --expires-in 604800)
#echo $url_serverDomain

## make server script: download template, inject the presigned URL, retarget
## the bucket, prepend a CPU SLURM header, upload and collect its presigned URL
wget https://covid-19.nchc.org.tw/mynvflare/template_server.sh -O server.sh
sed -i "1i url=\"$url_serverDomain\"" server.sh
sed -i "s|nvflare/res|nvflare2/res|g" server.sh
FILE="/tmp/queue.txt"; queue="cpu"; slurmQueue
mv server.sh server.sh.tmp
cat /tmp/queue.txt server.sh.tmp > server.sh
dos2unix server.sh
rm server.sh.tmp
"$AWS" --endpoint-url="$S3_ENDPOINT" s3 cp server.sh "s3://nvflare2/reservation_${nvflare_session}/server.sh"
url_server_script=$("$AWS" --endpoint-url="$S3_ENDPOINT" s3 presign "s3://nvflare2/reservation_${nvflare_session}/server.sh" --expires-in 604800)
#echo $url_server_script
nodeArr+=("$url_server_script")   # 新增
mv server.sh "$local_folder_name"   # 移動檔案到資料夾

## make admin script (no SLURM header needed for the admin console)
wget https://covid-19.nchc.org.tw/mynvflare/template_admin.sh -O admin.sh
sed -i "1i url=\"$url_serverDomain\"" admin.sh
dos2unix admin.sh
"$AWS" --endpoint-url="$S3_ENDPOINT" s3 cp admin.sh "s3://nvflare2/reservation_${nvflare_session}/admin.sh"
url_admin_script=$("$AWS" --endpoint-url="$S3_ENDPOINT" s3 presign "s3://nvflare2/reservation_${nvflare_session}/admin.sh" --expires-in 604800)
echo "$url_admin_script"
nodeArr+=("$url_admin_script")   # 新增
mv admin.sh "$local_folder_name"   # 移動檔案到資料夾

## make hpc-site script
if [ "$hpcList" != "none" ]; then hpcArray=(${hpcList//,/ }) #hpcArray=("hpc-site-1" "hpc-site-2" "hpc-site-3") for Site_name in ${hpcArray[@]}; do #Site_name=hpc-site-1 wget https://covid-19.nchc.org.tw/mynvflare/template_hpc.sh -O ${Site_name}.sh sed -i "1i url=\"$url_serverDomain\"" ${Site_name}.sh sed -i "s|hpc-site-1|$Site_name|g" ${Site_name}.sh FILE="/tmp/queue.txt"; queue="gpu"; slurmQueue; mv ${Site_name}.sh ${Site_name}.sh.tmp cat /tmp/queue.txt ${Site_name}.sh.tmp > ${Site_name}.sh dos2unix ${Site_name}.sh rm ${Site_name}.sh.tmp $HOME/local/bin/aws --endpoint-url=http://s3.twcc.ai s3 cp ${Site_name}.sh s3://nvflare2/reservation_${nvflare_session}/${Site_name}.sh url_hpcsite_script=$($HOME/local/bin/aws --endpoint-url=http://s3.twcc.ai s3 presign s3://nvflare2/reservation_${nvflare_session}/${Site_name}.sh --expires-in 604800) #echo ${url_hpcsite_script} nodeArr+=($url_hpcsite_script) #新增 mv ${Site_name}.sh $local_folder_name #移動檔案到資料夾 done fi ## make hospital-site script if [ "$hospitalList" != "none" ]; then hospitalArray=(${hospitalList//,/ }) #hospitalArray=("hospital-site-1" "hospital-site-2" "hospital-site-3") for Site_name in ${hospitalArray[@]}; do #Site_name=hospital-site-1 wget https://covid-19.nchc.org.tw/mynvflare/template_hospital.sh -O ${Site_name}.sh sed -i "1i url=\"$url_serverDomain\"" ${Site_name}.sh sed -i "s|hospital-site-1|$Site_name|g" ${Site_name}.sh dos2unix ${Site_name}.sh $HOME/local/bin/aws --endpoint-url=http://s3.twcc.ai s3 cp ${Site_name}.sh s3://nvflare2/reservation_${nvflare_session}/${Site_name}.sh url_hospitalsite_script=$($HOME/local/bin/aws --endpoint-url=http://s3.twcc.ai s3 presign s3://nvflare2/reservation_${nvflare_session}/${Site_name}.sh --expires-in 604800) #echo $url_hospitalsite_script nodeArr+=($url_hospitalsite_script) done fi for nodeurl in ${nodeArr[@]}; do echo $nodeurl done ```