# Run Intel Pytorch Extension on i7-1370PE CPU and iGPU
## Setup Environment for CPU
```bash
# Base tooling used throughout this guide.
sudo apt install wget curl vim git
# Isolated Python environment for the CPU build of IPEX.
python3 -m venv ipex_cpu
source ipex_cpu/bin/activate
# Install torch/torchvision FIRST, then the matching CPU build of
# Intel Extension for PyTorch from Intel's wheel index. (Original
# installed IPEX first, letting pip pull an arbitrary torch version.)
pip install torch torchvision
python -m pip install intel_extension_for_pytorch -f https://developer.intel.com/ipex-whl-stable-cpu
```
## Run CPU FP32 training
```bash
# Activate the oneAPI runtime and the CPU virtualenv, then train.
source /opt/intel/oneapi/setvars.sh
source ipex_cpu/bin/activate
# Fix: the CPU section should run the CPU script (train_fp32_cpu.py,
# provided below), not the xpu one.
python train_fp32_cpu.py
```
## Setup Environment for iGPU
```bash
# https://dgpu-docs.intel.com/driver/installation.html
# Add Intel's oneAPI APT repository (signing key + source entry).
# Fix: removed a stray "\ " before the pipe — it escaped the space,
# passing a literal " " argument to wget and breaking the pipeline.
wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list
sudo apt update
sudo apt install intel-basekit
# level-zero is required to run on 'xpu'
wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | sudo gpg --dearmor --output /usr/share/keyrings/intel-graphics.gpg
echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu jammy/production/2328 unified" | sudo tee /etc/apt/sources.list.d/intel-gpu-jammy.list
sudo apt update
sudo apt install level-zero
# OpenCL and level-zero runtime. Required by OpenVINO also
# https://github.com/intel/compute-runtime/releases/tag/23.30.26918.9
mkdir neo
cd neo/
wget https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.14828.8/intel-igc-core_1.0.14828.8_amd64.deb
wget https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.14828.8/intel-igc-opencl_1.0.14828.8_amd64.deb
wget https://github.com/intel/compute-runtime/releases/download/23.30.26918.9/intel-level-zero-gpu-dbgsym_1.3.26918.9_amd64.ddeb
wget https://github.com/intel/compute-runtime/releases/download/23.30.26918.9/intel-level-zero-gpu_1.3.26918.9_amd64.deb
wget https://github.com/intel/compute-runtime/releases/download/23.30.26918.9/intel-opencl-icd-dbgsym_23.30.26918.9_amd64.ddeb
wget https://github.com/intel/compute-runtime/releases/download/23.30.26918.9/intel-opencl-icd_23.30.26918.9_amd64.deb
wget https://github.com/intel/compute-runtime/releases/download/23.30.26918.9/libigdgmm12_22.3.0_amd64.deb
# Fix: the downloaded packages were never installed — install them,
# then leave the neo/ directory before creating the venv.
sudo dpkg -i *.deb
cd ..
python3 -m venv ipex
source ipex/bin/activate
python -m pip install torch==2.0.1a0 torchvision==0.15.2a0 intel_extension_for_pytorch==2.0.110+xpu -f https://developer.intel.com/ipex-whl-stable-xpu
```
## Run xpu FP32 training
```bash
# Load the oneAPI runtime environment (required before importing IPEX
# with xpu support), activate the xpu virtualenv, then train.
source /opt/intel/oneapi/setvars.sh
source ipex/bin/activate
python train_fp32_gpu.py
```
### sycl-ls log
```
eapet@eapet-RockIsland:~$ source /opt/intel/oneapi/setvars.sh
:: initializing oneAPI environment ...
bash: BASH_VERSION = 5.1.16(1)-release
args: Using "$@" for setvars.sh arguments:
:: advisor -- latest
:: ccl -- latest
:: compiler -- latest
:: dal -- latest
:: debugger -- latest
:: dev-utilities -- latest
:: dnnl -- latest
:: dpcpp-ct -- latest
:: dpl -- latest
:: ipp -- latest
:: ippcp -- latest
:: ipp -- latest
:: mkl -- latest
:: mpi -- latest
:: tbb -- latest
:: vtune -- latest
:: oneAPI environment initialized ::
(ipex) eapet@eapet-RockIsland:~$ source ipex/bin/activate
(ipex) eapet@eapet-RockIsland:~$ sycl-ls
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device 1.2 [2023.16.7.0.21_160000]
[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-1370PE 3.0 [2023.16.7.0.21_160000]
[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Graphics [0xa7a0] 3.0 [23.30.26918.9]
[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Graphics [0xa7a0] 1.3 [1.3.26918]
```
### train_fp32_cpu.py
```python
# Train ResNet-50 on CIFAR-10 in FP32 on the CPU, using Intel Extension
# for PyTorch (IPEX) to optimize the model/optimizer pair before training.
import torch
import torchvision
import intel_extension_for_pytorch as ipex
# Hyperparameters and dataset location.
LR = 0.001
DOWNLOAD = True
DATA = 'datasets/cifar10/'
# Resize CIFAR-10 images to 224x224 (ResNet-50's expected input size)
# and normalize each channel to roughly [-1, 1].
transform = torchvision.transforms.Compose([
    torchvision.transforms.Resize((224, 224)),
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])
train_dataset = torchvision.datasets.CIFAR10(
    root=DATA,
    train=True,
    transform=transform,
    download=DOWNLOAD,
)
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=128
)
model = torchvision.models.resnet50()
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr = LR, momentum=0.9)
model.train()
# IPEX rewrites the model and optimizer for CPU execution; called after
# model.train() so training-mode graph rewrites are applied.
model, optimizer = ipex.optimize(model, optimizer=optimizer)
# One pass over the training set; prints the batch index as progress.
for batch_idx, (data, target) in enumerate(train_loader):
    optimizer.zero_grad()
    output = model(data)
    loss = criterion(output, target)
    loss.backward()
    optimizer.step()
    print(batch_idx)
# Persist weights and optimizer state for later resume/evaluation.
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
}, 'checkpoint.pth')
```
### train_fp32_gpu.py
```python
# Train ResNet-50 on CIFAR-10 in FP32 on an Intel GPU (the "xpu" device)
# using Intel Extension for PyTorch (IPEX). The "code changes" markers
# highlight the diffs against the CPU version of this script.
import torch
import torchvision
############# code changes ###############
import intel_extension_for_pytorch as ipex
############# code changes ###############
# Hyperparameters and dataset location.
LR = 0.001
DOWNLOAD = True
DATA = "datasets/cifar10/"
# Resize CIFAR-10 images to ResNet-50's 224x224 input and normalize
# each channel to roughly [-1, 1].
transform = torchvision.transforms.Compose(
    [
        torchvision.transforms.Resize((224, 224)),
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ]
)
train_dataset = torchvision.datasets.CIFAR10(
    root=DATA,
    train=True,
    transform=transform,
    download=DOWNLOAD,
)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=128)
model = torchvision.models.resnet50()
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR, momentum=0.9)
model.train()
######################## code changes #######################
# Move model and loss to the Intel GPU, then let IPEX optimize the
# model/optimizer pair (after .to("xpu") and model.train()).
model = model.to("xpu")
criterion = criterion.to("xpu")
model, optimizer = ipex.optimize(model, optimizer=optimizer)
######################## code changes #######################
# One pass over the training set; each batch is copied to the xpu device.
for batch_idx, (data, target) in enumerate(train_loader):
    ########## code changes ##########
    data = data.to("xpu")
    target = target.to("xpu")
    ########## code changes ##########
    optimizer.zero_grad()
    output = model(data)
    loss = criterion(output, target)
    loss.backward()
    optimizer.step()
    print(batch_idx)
# Persist weights and optimizer state. NOTE(review): no .cpu() call is
# made first, so the saved tensors are still on "xpu" — use map_location
# when loading this checkpoint on a CPU-only machine.
torch.save(
    {
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
    },
    "checkpoint.pth",
)
print("Execution finished")
```

# Stable Diffusion
https://huggingface.co/docs/diffusers/quicktour
https://github.com/huggingface/diffusers/tree/main/examples/text_to_image
## LORA on CPU
### Prepare diffuser
```bash
# Install diffusers from source, plus the extra dependencies needed by
# the text-to-image training example.
git clone https://github.com/huggingface/diffusers
cd diffusers/
pip install .
cd examples/text_to_image/
pip install -r requirements.txt
```
## !!! NOTE: prefer the SD-1.5 recipe in the Reference section below !!!
```bash
# LoRA fine-tune Stable Diffusion v1.4 on the Pokemon BLIP-captions
# dataset, on CPU (ipex_cpu environment).
source ipex_cpu/bin/activate
cd diffusers/examples/text_to_image/
# Interactive one-time setup: accelerate hardware config and HF login.
accelerate config
huggingface-cli login
export MODEL_NAME="CompVis/stable-diffusion-v1-4"
export DATASET_NAME="lambdalabs/pokemon-blip-captions"
accelerate launch train_text_to_image_lora.py --pretrained_model_name_or_path=$MODEL_NAME --dataset_name=$DATASET_NAME --caption_column="text" --resolution=512 --random_flip --train_batch_size=1 --num_train_epochs=100 --checkpointing_steps=5000 --learning_rate=1e-04 --lr_scheduler="constant" --lr_warmup_steps=0 --seed=42 --output_dir="sd-pokemon-model-lora" --validation_prompt="cute dragon creature"
```

## LORA on GPU
```bash
# LoRA fine-tune Stable Diffusion v1.4 on the Intel GPU environment.
# The oneAPI runtime must be sourced before activating the xpu venv.
source /opt/intel/oneapi/setvars.sh
source ipex/bin/activate
cd diffusers/examples/text_to_image/
# Interactive one-time setup: accelerate hardware config and HF login.
accelerate config
huggingface-cli login
export MODEL_NAME="CompVis/stable-diffusion-v1-4"
export DATASET_NAME="lambdalabs/pokemon-blip-captions"
# --mixed_precision="no" forces plain FP32 training (no autocast).
accelerate launch --mixed_precision="no" train_text_to_image_lora.py --pretrained_model_name_or_path=$MODEL_NAME --dataset_name=$DATASET_NAME --caption_column="text" --resolution=512 --random_flip --train_batch_size=1 --num_train_epochs=100 --checkpointing_steps=5000 --learning_rate=1e-04 --lr_scheduler="constant" --lr_warmup_steps=0 --seed=42 --output_dir="sd-pokemon-model-lora" --validation_prompt="cute dragon creature"
```

## Reference
https://huggingface.co/docs/accelerate/usage_guides/ipex
https://huggingface.co/docs/diffusers/main/en/training/lora
```bash
# Recommended recipe: LoRA fine-tune SD-1.5 with bf16 mixed precision.
# NOTE(review): "loar" in OUTPUT_DIR looks like a typo for "lora"; kept
# as-is since existing output paths may already use this name.
export MODEL_NAME="runwayml/stable-diffusion-v1-5"
export OUTPUT_DIR="sd-pokemon-model-loar_v1.5"
export DATASET_NAME="lambdalabs/pokemon-blip-captions"
# (No comments inside the command below: "\" continuations must be the
# last character on each line.)
accelerate launch --mixed_precision="bf16" train_text_to_image_lora.py \
--pretrained_model_name_or_path=$MODEL_NAME \
--dataset_name=$DATASET_NAME \
--dataloader_num_workers=8 \
--resolution=512 --center_crop --random_flip \
--train_batch_size=1 \
--gradient_accumulation_steps=4 \
--max_train_steps=15000 \
--learning_rate=1e-04 \
--max_grad_norm=1 \
--lr_scheduler="cosine" --lr_warmup_steps=0 \
--output_dir=${OUTPUT_DIR} \
--checkpointing_steps=500 \
--validation_prompt="A pokemon with blue eyes." \
--seed=1337
```
### patch
```
diff --git a/examples/text_to_image/train_text_to_image_lora.py b/examples/text_to_image/train_text_to_image_lora.py
index 3155eb3a..7246ccc7 100644
--- a/examples/text_to_image/train_text_to_image_lora.py
+++ b/examples/text_to_image/train_text_to_image_lora.py
@@ -862,7 +862,8 @@ def main():
pipeline.set_progress_bar_config(disable=True)
# run inference
- generator = torch.Generator(device=accelerator.device)
+ #generator = torch.Generator(device=accelerator.device)
+ generator = torch.Generator()
if args.seed is not None:
generator = generator.manual_seed(args.seed)
images = []
@@ -920,7 +921,8 @@ def main():
pipeline.unet.load_attn_procs(args.output_dir)
# run inference
- generator = torch.Generator(device=accelerator.device)
+ #generator = torch.Generator(device=accelerator.device)
+ generator = torch.Generator()
if args.seed is not None:
generator = generator.manual_seed(args.seed)
images = []
```
Using ~6GB memory to train LORA for SD-1.5 on Intel Arc A770M

After one epoch,

```
eapet@eapet-NUC12SNKi72:~/finetune/lora/pokemon$ ls -la
total 3180
drwxrwxr-x 7 eapet eapet 4096 十 3 04:18 .
drwxrwxr-x 3 eapet eapet 4096 九 13 00:12 ..
drwxrwxr-x 2 eapet eapet 4096 十 3 02:00 checkpoint-1000
drwxrwxr-x 2 eapet eapet 4096 十 3 03:09 checkpoint-1500
drwxrwxr-x 2 eapet eapet 4096 十 3 04:18 checkpoint-2000
drwxrwxr-x 2 eapet eapet 4096 十 3 00:50 checkpoint-500
drwxrwxr-x 3 eapet eapet 4096 九 13 00:18 logs
-rw-rw-r-- 1 eapet eapet 3227336 十 2 23:40 pytorch_lora_weights.safetensors
eapet@eapet-NUC12SNKi72:~/finetune/lora/pokemon$ ls -la checkpoint-2000/
total 9680
drwxrwxr-x 2 eapet eapet 4096 十 3 04:18 .
drwxrwxr-x 7 eapet eapet 4096 十 3 04:18 ..
-rw-rw-r-- 1 eapet eapet 6591173 十 3 04:18 optimizer.bin
-rw-rw-r-- 1 eapet eapet 3285709 十 3 04:18 pytorch_model.bin
-rw-rw-r-- 1 eapet eapet 14219 十 3 04:18 random_states_0.pkl
-rw-rw-r-- 1 eapet eapet 563 十 3 04:18 scheduler.bin
eapet@eapet-NUC12SNKi72:~/finetune/lora/pokemon$ ls -la checkpoint-1500/
total 9680
drwxrwxr-x 2 eapet eapet 4096 十 3 03:09 .
drwxrwxr-x 7 eapet eapet 4096 十 3 04:18 ..
-rw-rw-r-- 1 eapet eapet 6591173 十 3 03:09 optimizer.bin
-rw-rw-r-- 1 eapet eapet 3285709 十 3 03:09 pytorch_model.bin
-rw-rw-r-- 1 eapet eapet 14219 十 3 03:09 random_states_0.pkl
-rw-rw-r-- 1 eapet eapet 563 十 3 03:09 scheduler.bin
eapet@eapet-NUC12SNKi72:~/finetune/lora/pokemon$ ls -la logs/
total 12
drwxrwxr-x 3 eapet eapet 4096 九 13 00:18 .
drwxrwxr-x 7 eapet eapet 4096 十 3 04:18 ..
drwxrwxr-x 42 eapet eapet 4096 十 2 23:42 text2image-fine-tune
eapet@eapet-NUC12SNKi72:~/finetune/lora/pokemon$ ls -la logs/text2image-fine-tune/
total 18188
drwxrwxr-x 42 eapet eapet 4096 十 2 23:42 .
drwxrwxr-x 3 eapet eapet 4096 九 13 00:18 ..
drwxrwxr-x 2 eapet eapet 4096 九 13 00:18 1694535537.1758006
drwxrwxr-x 2 eapet eapet 4096 九 13 00:18 1694535537.1768963
drwxrwxr-x 2 eapet eapet 4096 九 13 00:20 1694535657.6677988
drwxrwxr-x 2 eapet eapet 4096 九 13 00:20 1694535657.6688378
drwxrwxr-x 2 eapet eapet 4096 九 14 14:31 1694673090.2191353
drwxrwxr-x 2 eapet eapet 4096 九 14 14:31 1694673090.2200518
drwxrwxr-x 2 eapet eapet 4096 九 14 14:38 1694673524.61646
drwxrwxr-x 2 eapet eapet 4096 九 14 14:38 1694673524.6173496
drwxrwxr-x 2 eapet eapet 4096 九 14 15:02 1694674938.6376064
drwxrwxr-x 2 eapet eapet 4096 九 14 15:02 1694674938.638701
drwxrwxr-x 2 eapet eapet 4096 九 15 09:33 1694741583.638871
drwxrwxr-x 2 eapet eapet 4096 九 15 09:33 1694741583.6398134
drwxrwxr-x 2 eapet eapet 4096 十 2 22:05 1696255524.0575862
drwxrwxr-x 2 eapet eapet 4096 十 2 22:05 1696255524.0586967
drwxrwxr-x 2 eapet eapet 4096 十 2 22:15 1696256153.9365995
drwxrwxr-x 2 eapet eapet 4096 十 2 22:15 1696256153.9373424
drwxrwxr-x 2 eapet eapet 4096 十 2 22:16 1696256187.956845
drwxrwxr-x 2 eapet eapet 4096 十 2 22:16 1696256187.95803
drwxrwxr-x 2 eapet eapet 4096 十 2 22:16 1696256210.1867187
drwxrwxr-x 2 eapet eapet 4096 十 2 22:16 1696256210.1878788
drwxrwxr-x 2 eapet eapet 4096 十 2 22:17 1696256268.4659348
drwxrwxr-x 2 eapet eapet 4096 十 2 22:17 1696256268.4670677
drwxrwxr-x 2 eapet eapet 4096 十 2 22:18 1696256326.097911
drwxrwxr-x 2 eapet eapet 4096 十 2 22:18 1696256326.0989985
drwxrwxr-x 2 eapet eapet 4096 十 2 22:49 1696258179.1243324
drwxrwxr-x 2 eapet eapet 4096 十 2 22:49 1696258179.1251698
drwxrwxr-x 2 eapet eapet 4096 十 2 23:19 1696259954.609776
drwxrwxr-x 2 eapet eapet 4096 十 2 23:19 1696259954.6105235
drwxrwxr-x 2 eapet eapet 4096 十 2 23:26 1696260408.558585
drwxrwxr-x 2 eapet eapet 4096 十 2 23:26 1696260408.559681
drwxrwxr-x 2 eapet eapet 4096 十 2 23:31 1696260675.3928468
drwxrwxr-x 2 eapet eapet 4096 十 2 23:31 1696260675.3938432
drwxrwxr-x 2 eapet eapet 4096 十 2 23:36 1696261017.1515415
drwxrwxr-x 2 eapet eapet 4096 十 2 23:36 1696261017.1526303
drwxrwxr-x 2 eapet eapet 4096 十 2 23:38 1696261111.0283754
drwxrwxr-x 2 eapet eapet 4096 十 2 23:38 1696261111.0290744
drwxrwxr-x 2 eapet eapet 4096 十 2 23:39 1696261185.4160953
drwxrwxr-x 2 eapet eapet 4096 十 2 23:39 1696261185.416765
drwxrwxr-x 2 eapet eapet 4096 十 2 23:42 1696261348.9385016
drwxrwxr-x 2 eapet eapet 4096 十 2 23:42 1696261348.9395854
-rw-rw-r-- 1 eapet eapet 232 九 13 00:19 events.out.tfevents.1694535537.eapet-NUC12SNKi72.12395.0
-rw-rw-r-- 1 eapet eapet 184 九 13 00:21 events.out.tfevents.1694535657.eapet-NUC12SNKi72.13873.0
-rw-rw-r-- 1 eapet eapet 808 九 14 14:33 events.out.tfevents.1694673090.eapet-NUC12SNKi72.3698.0
-rw-rw-r-- 1 eapet eapet 2968 九 14 14:47 events.out.tfevents.1694673524.eapet-NUC12SNKi72.7929.0
-rw-rw-r-- 1 eapet eapet 4264 九 14 15:14 events.out.tfevents.1694674938.eapet-NUC12SNKi72.12240.0
-rw-rw-r-- 1 eapet eapet 88 九 15 09:33 events.out.tfevents.1694741583.eapet-NUC12SNKi72.6801.0
-rw-rw-r-- 1 eapet eapet 3688 十 2 22:15 events.out.tfevents.1696255524.eapet-NUC12SNKi72.6476.0
-rw-rw-r-- 1 eapet eapet 88 十 2 22:15 events.out.tfevents.1696256153.eapet-NUC12SNKi72.9031.0
-rw-rw-r-- 1 eapet eapet 88 十 2 22:16 events.out.tfevents.1696256187.eapet-NUC12SNKi72.9802.0
-rw-rw-r-- 1 eapet eapet 88 十 2 22:16 events.out.tfevents.1696256210.eapet-NUC12SNKi72.10593.0
-rw-rw-r-- 1 eapet eapet 88 十 2 22:17 events.out.tfevents.1696256268.eapet-NUC12SNKi72.11400.0
-rw-rw-r-- 1 eapet eapet 10202 十 2 22:46 events.out.tfevents.1696256326.eapet-NUC12SNKi72.12222.0
-rw-rw-r-- 1 eapet eapet 10202 十 2 23:17 events.out.tfevents.1696258179.eapet-NUC12SNKi72.13601.0
-rw-rw-r-- 1 eapet eapet 1384 十 2 23:23 events.out.tfevents.1696259954.eapet-NUC12SNKi72.14448.0
-rw-rw-r-- 1 eapet eapet 808 十 2 23:28 events.out.tfevents.1696260408.eapet-NUC12SNKi72.15436.0
-rw-rw-r-- 1 eapet eapet 2942949 十 2 23:32 events.out.tfevents.1696260675.eapet-NUC12SNKi72.16893.0
-rw-rw-r-- 1 eapet eapet 136 十 2 23:37 events.out.tfevents.1696261017.eapet-NUC12SNKi72.18273.0
-rw-rw-r-- 1 eapet eapet 136 十 2 23:38 events.out.tfevents.1696261111.eapet-NUC12SNKi72.19062.0
-rw-rw-r-- 1 eapet eapet 2943748 十 2 23:41 events.out.tfevents.1696261185.eapet-NUC12SNKi72.19851.0
-rw-rw-r-- 1 eapet eapet 12466056 十 3 04:43 events.out.tfevents.1696261348.eapet-NUC12SNKi72.20660.0
eapet@eapet-NUC12SNKi72:~/finetune/lora/pokemon$ ls -la logs/text2image-fine-tune/1694535537.1758006
total 12
drwxrwxr-x 2 eapet eapet 4096 九 13 00:18 .
drwxrwxr-x 42 eapet eapet 4096 十 2 23:42 ..
-rw-rw-r-- 1 eapet eapet 2258 九 13 00:18 events.out.tfevents.1694535537.eapet-NUC12SNKi72.12395.1
eapet@eapet-NUC12SNKi72:~/finetune/lora/pokemon$ vim logs/text2image-fine-tune/1694535537.1758006/events.out.tfevents.1694535537.eapet-NUC12SNKi72.12395.1
eapet@eapet-NUC12SNKi72:~/finetune/lora/pokemon$ vim logs/text2image-fine-tune/events.out.tfevents.169626
events.out.tfevents.1696260408.eapet-NUC12SNKi72.15436.0 events.out.tfevents.1696261017.eapet-NUC12SNKi72.18273.0 events.out.tfevents.1696261185.eapet-NUC12SNKi72.19851.0
events.out.tfevents.1696260675.eapet-NUC12SNKi72.16893.0 events.out.tfevents.1696261111.eapet-NUC12SNKi72.19062.0 events.out.tfevents.1696261348.eapet-NUC12SNKi72.20660.0
eapet@eapet-NUC12SNKi72:~/finetune/lora/pokemon$ vim logs/text2image-fine-tune/events.out.tfevents.1696261111.eapet-NUC12SNKi72.19062.0
```

## Performance improvement plan
https://github.com/intel/intel-extension-for-pytorch/issues/296
Open question: can diffusers use IPEX_XPU_ONEDNN_LAYOUT=1 (channels-last layout)?
## Another SD finetune example
https://huggingface.co/blog/stable-diffusion-finetuning-intel