# ATS-M1 on SMC Xeon 2022.2 Setup

### Use openvino-dev 2022.2 in the intel-mass-ubuntu20.04-stable Docker image
```bash
sudo docker run --user root -it --rm --device=/dev/dri/ --cap-add SYS_ADMIN -v /dev/dri/by-path:/dev/dri/by-path -v /intel/3_Media_Analytics/content/:/intel/3_Media_Analytics/content intel-mass-ubuntu20.04-stable /bin/bash
```
Inside the container, create a Python virtual environment and install openvino-dev:
```bash
apt update
apt upgrade
# Python venv support for the OpenVINO pip packages
apt install python3.8-venv
python3 -m venv openvino_env
source openvino_env/bin/activate
python -m pip install --upgrade pip
pip list
pip install openvino-dev
# Make the bundled OpenVINO runtime libraries visible to the command-line tools
export LD_LIBRARY_PATH=/root/openvino_env/lib/python3.8/site-packages/openvino/libs:$LD_LIBRARY_PATH
# Sanity checks: Model Optimizer is on PATH and the Python runtime imports cleanly
mo -h
python -c "from openvino.runtime import Core"
# Quick benchmark on the first GPU (GPU.0)
benchmark_app -m /intel/3_Media_Analytics/content/openvino_models/public/yolo-v4-tf/FP16-INT8/yolo-v4-tf.xml -d GPU.0 -b 1 -nstream 4 -hint none
```
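Before benchmarking, it can help to confirm that OpenVINO actually enumerates both ATS-M GPUs from inside the container. A minimal sketch using the 2022.2 Python API (device names such as `GPU.0`/`GPU.1` follow the enumeration shown in the benchmark output below):

```python
# Sanity check: list the devices OpenVINO sees and print the GPU names.
from openvino.runtime import Core

core = Core()
print(core.available_devices)  # e.g. ['CPU', 'GPU.0', 'GPU.1'] on a dual-GPU host
for dev in core.available_devices:
    if dev.startswith("GPU"):
        print(dev, core.get_property(dev, "FULL_DEVICE_NAME"))
```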
### Run the benchmark (GPU.0, batch 1, 8 streams)
```
(openvino_env) root@693e98f95bf1:~# benchmark_app -m /intel/3_Media_Analytics/content/openvino_models/public/yolo-v4-tf/FP16-INT8/yolo-v4-tf.xml -d GPU.0 -b 1 -nstream 8 -hint none
[Step 1/11] Parsing and validating input arguments
[Step 2/11] Loading OpenVINO
hwconfig key 77 (UNKNOWN_INTEL_HWCONFIG) unhandled!
hwconfig key 78 (UNKNOWN_INTEL_HWCONFIG) unhandled!
hwconfig key 79 (UNKNOWN_INTEL_HWCONFIG) unhandled!
hwconfig key 80 (UNKNOWN_INTEL_HWCONFIG) unhandled!
hwconfig key 77 (UNKNOWN_INTEL_HWCONFIG) unhandled!
hwconfig key 78 (UNKNOWN_INTEL_HWCONFIG) unhandled!
hwconfig key 79 (UNKNOWN_INTEL_HWCONFIG) unhandled!
hwconfig key 80 (UNKNOWN_INTEL_HWCONFIG) unhandled!
[ WARNING ] No device GPU.0 performance hint is set.
[ INFO ] OpenVINO:
API version............. 2022.2.0-7713-af16ea1d79a-releases/2022/2
[ INFO ] Device info
GPU
Intel GPU plugin........ version 2022.2
Build................... 2022.2.0-7713-af16ea1d79a-releases/2022/2
[Step 3/11] Setting device configuration
[Step 4/11] Reading network files
[ INFO ] Read model took 98.94 ms
[Step 5/11] Resizing network to match image sizes and given batch
[ INFO ] Network batch size: 1
[Step 6/11] Configuring input of the model
[ INFO ] Model input 'image_input' precision u8, dimensions ([N,H,W,C]): 1 608 608 3
[ INFO ] Model output 'Func/StatefulPartitionedCall/output/_542:0' precision f32, dimensions ([...]): 1 38 38 255
[ INFO ] Model output 'Func/StatefulPartitionedCall/output/_543:0' precision f32, dimensions ([...]): 1 19 19 255
[ INFO ] Model output 'Func/StatefulPartitionedCall/output/_544:0' precision f32, dimensions ([...]): 1 76 76 255
[Step 7/11] Loading the model to the device
[ INFO ] Compile model took 10256.36 ms
[Step 8/11] Querying optimal runtime parameters
[ INFO ] DEVICE: GPU.0
[ INFO ] AVAILABLE_DEVICES , ['0', '1']
[ INFO ] RANGE_FOR_ASYNC_INFER_REQUESTS , (1, 2, 1)
[ INFO ] RANGE_FOR_STREAMS , (1, 2)
[ INFO ] OPTIMAL_BATCH_SIZE , 1
[ INFO ] MAX_BATCH_SIZE , 1
[ INFO ] FULL_DEVICE_NAME , Intel(R) Graphics [0x56c0] (dGPU)
[ INFO ] DEVICE_TYPE , Type.DISCRETE
[ INFO ] OPTIMIZATION_CAPABILITIES , ['FP32', 'BIN', 'FP16', 'INT8', 'GPU_HW_MATMUL']
[ INFO ] GPU_UARCH_VERSION , 12.7.1
[ INFO ] GPU_EXECUTION_UNITS_COUNT , 512
[ INFO ] PERF_COUNT , False
[ INFO ] MODEL_PRIORITY , Priority.MEDIUM
[ INFO ] GPU_HOST_TASK_PRIORITY , Priority.MEDIUM
[ INFO ] GPU_QUEUE_PRIORITY , Priority.MEDIUM
[ INFO ] GPU_QUEUE_THROTTLE , Priority.MEDIUM
[ INFO ] GPU_ENABLE_LOOP_UNROLLING , True
[ INFO ] CACHE_DIR ,
[ INFO ] PERFORMANCE_HINT , PerformanceMode.UNDEFINED
[ INFO ] COMPILATION_NUM_THREADS , 152
[ INFO ] NUM_STREAMS , 8
[ INFO ] PERFORMANCE_HINT_NUM_REQUESTS , 0
[ INFO ] DEVICE_ID , 0
[Step 9/11] Creating infer requests and preparing input data
[ INFO ] Create 16 infer requests took 137.83 ms
[ WARNING ] No input files were given for input 'image_input'!. This input will be filled with random values!
[ INFO ] Fill input 'image_input' with random values
[Step 10/11] Measuring performance (Start inference asynchronously, 16 inference requests using 8 streams for GPU.0, inference only: True, limits: 60000 ms duration)
[ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop).
[ INFO ] First inference took 28.36 ms
[Step 11/11] Dumping statistics report
Count: 15136 iterations
Duration: 60079.11 ms
Latency:
Median: 63.41 ms
AVG: 63.44 ms
MIN: 10.82 ms
MAX: 93.73 ms
Throughput: 251.93 FPS
```
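As a sanity check on the report above, throughput is just the number of processed frames divided by the measured duration, and it also matches the number of in-flight requests divided by the average latency. A quick back-of-the-envelope check (values copied from the report):

```python
# Reported: 15136 iterations, batch 1, 60079.11 ms duration, 63.44 ms avg latency, 16 infer requests.
iterations, batch, duration_ms = 15136, 1, 60079.11
avg_latency_ms, n_requests = 63.44, 16

fps_from_duration = iterations * batch / (duration_ms / 1000)  # ~251.9 FPS, matches the report
fps_from_latency = n_requests / (avg_latency_ms / 1000)        # ~252.2 FPS, same order as above
print(fps_from_duration, fps_from_latency)
```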
# Monitor the GPU with xpumcli
```
smc@atsm150:~$ /opt/xpum/bin/xpumcli dump -m 0,1,2,22 -d 1
Timestamp, DeviceId, GPU Utilization (%), GPU Power (W), GPU Frequency (MHz), Compute Engine 0 (%), Compute Engine 1 (%), Compute Engine 2 (%), Compute Engine 3 (%)
2022-10-12T08:33:28.000Z, 1, 100.00, 114.78, 1800, 87.45, 70.35, 70.19, 64.75
2022-10-12T08:33:29.000Z, 1, 100.00, 119.64, 1950, 99.41, 99.26, 99.21, 99.33
2022-10-12T08:33:30.000Z, 1, 100.00, 121.76, 1700, 99.09, 99.23, 99.22, 99.07
2022-10-12T08:33:31.000Z, 1, 100.00, 119.62, 1800, 99.18, 98.94, 98.88, 98.91
2022-10-12T08:33:32.000Z, 1, 100.00, 123.70, 1950, 99.27, 99.29, 99.10, 99.20
```
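The `xpumcli dump` output is already comma-separated, so during a long benchmark it can simply be captured to a file for later plotting; a possible sketch (the filename is arbitrary, stop the dump with Ctrl-C):

```bash
# Same dump command as above, but also save the CSV-style output for later analysis.
/opt/xpum/bin/xpumcli dump -m 0,1,2,22 -d 1 | tee gpu_metrics.csv
```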
### Check OpenCL runtime versions
```
root@0567446f5a0b:~# apt list | grep opencl
WARNING: apt does not have a stable CLI interface. Use with caution in scripts.
intel-opencl-icd/now 22.18.023111-dgfx10360 amd64 [installed,local]
mesa-opencl-icd/now 22.2.0.20220506.0-dgfx10360 amd64 [installed,local]
ocl-icd-libopencl1/now 2.2.11-1ubuntu1 amd64 [installed,local]
root@0567446f5a0b:~# apt list | grep igc
WARNING: apt does not have a stable CLI interface. Use with caution in scripts.
libigc-dev/now 1:1.0-dgfx10360+1 amd64 [installed,local]
libigc-tools/now 1:1.0-dgfx10360+1 amd64 [installed,local]
libigc1/now 1:1.0-dgfx10360+1 amd64 [installed,local]
root@0567446f5a0b:~# apt list | grep igd
WARNING: apt does not have a stable CLI interface. Use with caution in scripts.
libigdfcl-dev/now 1:1.0-dgfx10360+1 amd64 [installed,local]
libigdfcl1/now 1:1.0-dgfx10360+1 amd64 [installed,local]
libigdgmm-dev/now 1:22.1-dgfx10360+1 amd64 [installed,local]
libigdgmm12/now 1:22.1-dgfx10360+1 amd64 [installed,local]
```
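Since `apt list` warns that it has no stable CLI interface, the same package versions can also be read with `dpkg`; an equivalent query covering the three filters above:

```bash
# List the installed OpenCL / graphics-compiler / GMM packages with their versions.
dpkg -l | grep -E 'opencl|igc|igd'
```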
## openvino/ubuntu20_dev:latest (2022.2)
### docker run
```bash
sudo docker run --group-add=109 --user root -it --rm --device=/dev/dri/ --cap-add SYS_ADMIN -v /dev/dri/by-path:/dev/dri/by-path -v /intel/3_Media_Analytics/content/:/intel/3_Media_Analytics/content -v /home/smc/Downloads:/mnt openvino/ubuntu20_dev:latest
```
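The `--group-add=109` value is the GID of the host group that owns the `/dev/dri` render nodes; it varies between hosts, so it is worth checking before reusing the command. A small sketch (the node name `renderD128` is only an example; list `/dev/dri` to see which nodes exist on your system):

```bash
# Show which group owns the DRM render nodes and its numeric GID.
ls -l /dev/dri/
stat -c '%G %g' /dev/dri/renderD128   # e.g. "render 109"
```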
### Download and install the OpenCL compute runtime
Release page: https://github.com/intel/compute-runtime/releases/tag/22.39.24347
```bash
mkdir neo
cd neo
wget https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.12149.1/intel-igc-core_1.0.12149.1_amd64.deb
wget https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.12149.1/intel-igc-opencl_1.0.12149.1_amd64.deb
wget https://github.com/intel/compute-runtime/releases/download/22.39.24347/intel-level-zero-gpu-dbgsym_1.3.24347_amd64.ddeb
wget https://github.com/intel/compute-runtime/releases/download/22.39.24347/intel-level-zero-gpu_1.3.24347_amd64.deb
wget https://github.com/intel/compute-runtime/releases/download/22.39.24347/intel-opencl-icd-dbgsym_22.39.24347_amd64.ddeb
wget https://github.com/intel/compute-runtime/releases/download/22.39.24347/intel-opencl-icd_22.39.24347_amd64.deb
wget https://github.com/intel/compute-runtime/releases/download/22.39.24347/libigdgmm12_22.2.0_amd64.deb
dpkg -i *.deb
```
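After `dpkg -i`, one way to confirm that the OpenCL ICD loader will pick up the new driver is to look at the vendor registration files; this assumes the intel-opencl-icd package registers itself under `/etc/OpenCL/vendors`, as the stock Ubuntu packaging does:

```bash
# The ICD loader discovers drivers through these vendor files.
ls -l /etc/OpenCL/vendors/
cat /etc/OpenCL/vendors/intel.icd   # should reference the compute-runtime library (libigdrcl.so)
```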
### Summary: YOLO v4 608x608 FP16-INT8 (GPU.0, 4 streams)

| Batch | Median latency (ms) | Avg latency (ms) | Min latency (ms) | Max latency (ms) | Throughput (FPS) |
|-------|--------------------:|-----------------:|-----------------:|-----------------:|-----------------:|
| 1     | 31.21               | 31.21            | 7.04             | 38.62            | 255.90           |
| 64    | 1145.11             | 1139.17          | 458.45           | 1254.27          | 446.58           |
# batch = 1, YOLO v4 INT8 608x608
```
root@74aa7492319a:~/neo# benchmark_app -m /intel/3_Media_Analytics/content/openvino_models/public/yolo-v4-tf/FP16-INT8/yolo-v4-tf.xml -d GPU.0 -b 1 -nstream 4 -hint none
[Step 1/11] Parsing and validating input arguments
[Step 2/11] Loading OpenVINO
[ WARNING ] No device GPU.0 performance hint is set.
[ INFO ] OpenVINO:
API version............. 2022.2.0-7713-af16ea1d79a-releases/2022/2
[ INFO ] Device info
GPU
Intel GPU plugin........ version 2022.2
Build................... 2022.2.0-7713-af16ea1d79a-releases/2022/2
[Step 3/11] Setting device configuration
[Step 4/11] Reading network files
[ INFO ] Read model took 89.50 ms
[Step 5/11] Resizing network to match image sizes and given batch
[ INFO ] Network batch size: 1
[Step 6/11] Configuring input of the model
[ INFO ] Model input 'image_input' precision u8, dimensions ([N,H,W,C]): 1 608 608 3
[ INFO ] Model output 'Func/StatefulPartitionedCall/output/_542:0' precision f32, dimensions ([...]): 1 38 38 255
[ INFO ] Model output 'Func/StatefulPartitionedCall/output/_543:0' precision f32, dimensions ([...]): 1 19 19 255
[ INFO ] Model output 'Func/StatefulPartitionedCall/output/_544:0' precision f32, dimensions ([...]): 1 76 76 255
[Step 7/11] Loading the model to the device
[ INFO ] Compile model took 9700.59 ms
[Step 8/11] Querying optimal runtime parameters
[ INFO ] DEVICE: GPU.0
[ INFO ] AVAILABLE_DEVICES , ['0', '1']
[ INFO ] RANGE_FOR_ASYNC_INFER_REQUESTS , (1, 2, 1)
[ INFO ] RANGE_FOR_STREAMS , (1, 2)
[ INFO ] OPTIMAL_BATCH_SIZE , 1
[ INFO ] MAX_BATCH_SIZE , 1
[ INFO ] FULL_DEVICE_NAME , Intel(R) Graphics [0x56c0] (dGPU)
[ INFO ] DEVICE_TYPE , Type.DISCRETE
[ INFO ] OPTIMIZATION_CAPABILITIES , ['FP32', 'BIN', 'FP16', 'INT8', 'GPU_HW_MATMUL']
[ INFO ] GPU_UARCH_VERSION , 12.7.1
[ INFO ] GPU_EXECUTION_UNITS_COUNT , 512
[ INFO ] PERF_COUNT , False
[ INFO ] MODEL_PRIORITY , Priority.MEDIUM
[ INFO ] GPU_HOST_TASK_PRIORITY , Priority.MEDIUM
[ INFO ] GPU_QUEUE_PRIORITY , Priority.MEDIUM
[ INFO ] GPU_QUEUE_THROTTLE , Priority.MEDIUM
[ INFO ] GPU_ENABLE_LOOP_UNROLLING , True
[ INFO ] CACHE_DIR ,
[ INFO ] PERFORMANCE_HINT , PerformanceMode.UNDEFINED
[ INFO ] COMPILATION_NUM_THREADS , 152
[ INFO ] NUM_STREAMS , 4
[ INFO ] PERFORMANCE_HINT_NUM_REQUESTS , 0
[ INFO ] DEVICE_ID , 0
[Step 9/11] Creating infer requests and preparing input data
[ INFO ] Create 8 infer requests took 57.54 ms
[ WARNING ] No input files were given for input 'image_input'!. This input will be filled with random values!
[ INFO ] Fill input 'image_input' with random values
[Step 10/11] Measuring performance (Start inference asynchronously, 8 inference requests using 4 streams for GPU.0, inference only: True, limits: 60000 ms duration)
[ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop).
[ INFO ] First inference took 18.75 ms
[Step 11/11] Dumping statistics report
Count: 15368 iterations
Duration: 60055.04 ms
Latency:
Median: 31.21 ms
AVG: 31.21 ms
MIN: 7.04 ms
MAX: 38.62 ms
Throughput: 255.90 FPS
```
# batch = 64, YOLO v4 INT8 608x608
```
root@74aa7492319a:~/neo# benchmark_app -m /intel/3_Media_Analytics/content/openvino_models/public/yolo-v4-tf/FP16-INT8/yolo-v4-tf.xml -d GPU.0 -b 64 -nstream 4 -hint none
[Step 1/11] Parsing and validating input arguments
[Step 2/11] Loading OpenVINO
[ WARNING ] No device GPU.0 performance hint is set.
[ INFO ] OpenVINO:
API version............. 2022.2.0-7713-af16ea1d79a-releases/2022/2
[ INFO ] Device info
GPU
Intel GPU plugin........ version 2022.2
Build................... 2022.2.0-7713-af16ea1d79a-releases/2022/2
[Step 3/11] Setting device configuration
[Step 4/11] Reading network files
[ INFO ] Read model took 95.51 ms
[Step 5/11] Resizing network to match image sizes and given batch
[ INFO ] Reshaping model: 'image_input': {64,608,608,3}
[ INFO ] Reshape model took 12.20 ms
[ INFO ] Network batch size: 64
[Step 6/11] Configuring input of the model
[ INFO ] Model input 'image_input' precision u8, dimensions ([N,H,W,C]): 64 608 608 3
[ INFO ] Model output 'Func/StatefulPartitionedCall/output/_542:0' precision f32, dimensions ([...]): 64 38 38 255
[ INFO ] Model output 'Func/StatefulPartitionedCall/output/_543:0' precision f32, dimensions ([...]): 64 19 19 255
[ INFO ] Model output 'Func/StatefulPartitionedCall/output/_544:0' precision f32, dimensions ([...]): 64 76 76 255
[Step 7/11] Loading the model to the device
[ INFO ] Compile model took 15494.07 ms
[Step 8/11] Querying optimal runtime parameters
[ INFO ] DEVICE: GPU.0
[ INFO ] AVAILABLE_DEVICES , ['0', '1']
[ INFO ] RANGE_FOR_ASYNC_INFER_REQUESTS , (1, 2, 1)
[ INFO ] RANGE_FOR_STREAMS , (1, 2)
[ INFO ] OPTIMAL_BATCH_SIZE , 1
[ INFO ] MAX_BATCH_SIZE , 1
[ INFO ] FULL_DEVICE_NAME , Intel(R) Graphics [0x56c0] (dGPU)
[ INFO ] DEVICE_TYPE , Type.DISCRETE
[ INFO ] OPTIMIZATION_CAPABILITIES , ['FP32', 'BIN', 'FP16', 'INT8', 'GPU_HW_MATMUL']
[ INFO ] GPU_UARCH_VERSION , 12.7.1
[ INFO ] GPU_EXECUTION_UNITS_COUNT , 512
[ INFO ] PERF_COUNT , False
[ INFO ] MODEL_PRIORITY , Priority.MEDIUM
[ INFO ] GPU_HOST_TASK_PRIORITY , Priority.MEDIUM
[ INFO ] GPU_QUEUE_PRIORITY , Priority.MEDIUM
[ INFO ] GPU_QUEUE_THROTTLE , Priority.MEDIUM
[ INFO ] GPU_ENABLE_LOOP_UNROLLING , True
[ INFO ] CACHE_DIR ,
[ INFO ] PERFORMANCE_HINT , PerformanceMode.UNDEFINED
[ INFO ] COMPILATION_NUM_THREADS , 152
[ INFO ] NUM_STREAMS , 4
[ INFO ] PERFORMANCE_HINT_NUM_REQUESTS , 0
[ INFO ] DEVICE_ID , 0
[Step 9/11] Creating infer requests and preparing input data
[ INFO ] Create 8 infer requests took 4045.65 ms
[ WARNING ] No input files were given for input 'image_input'!. This input will be filled with random values!
[ INFO ] Fill input 'image_input' with random values
[Step 10/11] Measuring performance (Start inference asynchronously, 8 inference requests using 4 streams for GPU.0, inference only: True, limits: 60000 ms duration)
[ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop).
[ INFO ] First inference took 177.90 ms
[Step 11/11] Dumping statistics report
Count: 432 iterations
Duration: 61910.41 ms
Latency:
Median: 1145.11 ms
AVG: 1139.17 ms
MIN: 458.45 ms
MAX: 1254.27 ms
Throughput: 446.58 FPS
```