# AI PC
## 1. Running Stable Diffusion on an AIPC with Intel AI Playground
### Download and install AI Playground
https://github.com/intel/AI-Playground/releases/download/v2.0.0a-preview/AI.Playground-v2.0.0-alpha-prev.exe
### Default Stable Diffusion model (downloaded and used automatically)
lykon/dreamshaper-8 is a Stable Diffusion model fine-tuned from runwayml/stable-diffusion-v1-5 (6.64 GB download).
### Using other Stable Diffusion models
* https://www.youtube.com/watch?v=1FXrk9Xcx2g
* https://github.com/intel/ai-playground/blob/main/AI%20Playground%20Users%20Guide.pdf
### Intel AI Playground running on a Core Ultra 7 165H (screenshot)

### AI-Playground v2.0 alpha preview on GitHub
https://github.com/intel/AI-Playground/releases/tag/v2.0.0a-preview
## 2. Running Whisper Models on an AIPC with OpenVINO GenAI
The following must be installed:
* Python
* Git
* FFmpeg
* OpenVINO GenAI Python package
* OpenAI Whisper models
The installation steps are as follows.
### Download and install Python on Windows 11
https://www.python.org/ftp/python/3.11.9/python-3.11.9-amd64.exe
#### Step 1:

#### Step 2:

#### Step 3:

#### Step 4:

### Download and install Git
https://github.com/git-for-windows/git/releases/download/v2.47.1.windows.1/Git-2.47.1-64-bit.exe
### Download and install FFmpeg
https://github.com/GyanD/codexffmpeg/releases/download/7.0.2/ffmpeg-7.0.2-full_build-shared.7z
```bat
:: Extract the 7z archive to %USERPROFILE%\aipc, then add its bin folder to PATH.
:: Note: "set PATH" only affects the current Command Prompt session.
set PATH=%PATH%;%USERPROFILE%\aipc\ffmpeg-7.0.2-full_build-shared\bin
```
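With Python, Git and FFmpeg in place, a quick sanity check along these lines (a minimal sketch with a hypothetical file name, not part of the original guide) confirms that all three are reachable from PATH before continuing:
```python
# check_prereqs.py (hypothetical helper): verify the prerequisites are on PATH
import shutil
import sys

print("Python :", sys.version.split()[0])  # expect 3.11.x
for tool in ("git", "ffmpeg"):
    path = shutil.which(tool)
    print(f"{tool:6} : {path if path else 'NOT FOUND - check PATH'}")
```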
### Install OpenVINO GenAI from the Windows Command Prompt
```bat
mkdir aipc
cd aipc
:: Create and activate a Python virtual environment
python.exe -m venv ov_genai_whisper
ov_genai_whisper\Scripts\activate
python -m pip install --upgrade pip
:: OpenVINO GenAI runtime, the Optimum Intel exporter, and audio/UI helpers
pip install openvino-genai
pip install optimum-intel@git+https://github.com/huggingface/optimum-intel.git
pip install librosa
pip install gradio
```
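To confirm the packages installed correctly and to see which devices OpenVINO can target on this machine, a short check such as the following (a sketch with a hypothetical file name, run inside the activated environment) can be used:
```python
# check_openvino.py (hypothetical helper): list the devices OpenVINO can target
import openvino as ov
import openvino_genai  # the import succeeds only if openvino-genai installed correctly

core = ov.Core()
print("OpenVINO version :", ov.get_version())
# On a Core Ultra AIPC this typically reports CPU, GPU and NPU.
print("Available devices:", core.available_devices)
```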

### Download a mono, 16 kHz sample-rate WAV file
https://github.com/ggerganov/whisper.cpp/blob/master/samples/jfk.wav
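The Whisper pipeline expects mono, 16 kHz audio. If your source audio is in another format, a small conversion step through FFmpeg produces a suitable WAV file; the following is a minimal sketch (file names are placeholders, not part of the original guide) that shells out to the ffmpeg binary installed earlier:
```python
# to_mono_16k.py (hypothetical helper): convert any audio file to mono/16 kHz WAV
import subprocess
import sys

src = sys.argv[1]                                    # e.g. my_recording.m4a
dst = sys.argv[2] if len(sys.argv) > 2 else "output_16k.wav"
# -ac 1 = one audio channel (mono), -ar 16000 = 16 kHz sample rate
subprocess.run(["ffmpeg", "-y", "-i", src, "-ac", "1", "-ar", "16000", dst], check=True)
print("Wrote", dst)
```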
### Download and convert Whisper models to OpenVINO format
```bat
:: Download the whisper-base model and convert it to OpenVINO IR format
optimum-cli export openvino --trust-remote-code --model openai/whisper-base whisper-base
:: or
optimum-cli export openvino --trust-remote-code --model openai/whisper-large-v3-turbo whisper-large-v3-turbo

:: [Model compression]
:: Download, convert and apply int8 static quantization to the whisper-base model
optimum-cli export openvino --trust-remote-code --model openai/whisper-base ^
  --quant-mode int8 --dataset librispeech --num-samples 32 whisper-base-int8
:: or
optimum-cli export openvino --trust-remote-code --model openai/whisper-large-v3-turbo ^
  --quant-mode int8 --dataset librispeech --num-samples 32 whisper-large-v3-turbo-int8
```
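Once the export finishes, the IR folder (for example `whisper-base`) can be loaded directly by `openvino_genai.WhisperPipeline`. A minimal smoke test along these lines (a sketch; it assumes the jfk.wav sample above is in the working directory) verifies the converted model before building the full Gradio app:
```python
# whisper_smoke_test.py (hypothetical helper): load the exported IR and transcribe jfk.wav
import librosa
import openvino_genai

pipe = openvino_genai.WhisperPipeline("whisper-base", "GPU")  # or "CPU" / "NPU"
raw_speech, _ = librosa.load("jfk.wav", sr=16000)             # mono samples at 16 kHz
result = pipe.generate(raw_speech, task="transcribe")
print(result)
```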
### ov_genai_whisper_gradio.py running on a Core Ultra 7 165H (screenshot)
### How to run
```bat
ov_genai_whisper\Scripts\activate
python ov_genai_whisper_gradio.py
```


### whisper-large-v3-turbo takes 91.36 seconds to transcribe 348.42 seconds of audio (about 3.8× faster than real time)!
### ov_genai_whisper_gradio.py running on a Core Ultra 7 268V (screenshot)
### whisper-large-v3-turbo takes 19.84 seconds to transcribe 348.42 seconds of audio (about 17.6× faster than real time)!

```bat
set PATH=%PATH%;%USERPROFILE%\aipc\ffmpeg-7.0.2-full_build-shared\bin
python ov_genai_whisper_gradio.py
```

### ov_genai_whisper_gradio.py
```python
import time
#import librosa
import openvino_genai
import gradio as gr
from pathlib import Path
from transformers.pipelines.audio_utils import ffmpeg_read
import math


def format_timestamp(seconds: float):
    """
    Format time in the format expected by SRT files.
    """
    assert seconds >= 0, "non-negative timestamp expected"
    milliseconds = round(seconds * 1000.0)
    hours = milliseconds // 3_600_000
    milliseconds -= hours * 3_600_000
    minutes = milliseconds // 60_000
    milliseconds -= minutes * 60_000
    seconds = milliseconds // 1_000
    milliseconds -= seconds * 1_000
    return (f"{hours}:" if hours > 0 else "00:") + f"{minutes:02d}:{seconds:02d},{milliseconds:03d}"


def prepare_srt(transcription, filter_duration=None):
    """
    Format transcription into SRT file format.
    """
    segment_lines = []
    for idx, segment in enumerate(transcription):
        timestamp = [segment.start_ts, segment.end_ts]
        # Handle the case where the model could not predict an ending timestamp,
        # which can happen if audio is cut off in the middle of a word.
        if segment.end_ts == -1:
            timestamp[1] = filter_duration
        if filter_duration is not None and (timestamp[0] >= math.floor(filter_duration) or timestamp[1] > math.ceil(filter_duration) + 1):
            break
        segment_lines.append(str(idx + 1) + "\n")
        time_start = format_timestamp(timestamp[0])
        time_end = format_timestamp(timestamp[1])
        time_str = f"{time_start} --> {time_end}\n"
        segment_lines.append(time_str)
        segment_lines.append(segment.text + "\n\n")
    return segment_lines


def read_wav(filepath):
    with open(filepath, "rb") as f:
        inputs = f.read()
    sampling_rate = 16000
    raw_speech = ffmpeg_read(inputs, sampling_rate)
    #raw_speech, sampling_rate = librosa.load(filepath, sr=16000)
    print(f'Duration of wave file : {len(raw_speech)/sampling_rate} second')
    return raw_speech, sampling_rate


is_model_loaded = False
ov_device = 'GPU'
IR_model_path = './whisper-base'


def transcribe(model_path, XPU, wav_file):
    global transcriber
    global is_model_loaded
    global ov_device
    global IR_model_path
    # Reload the pipeline only when the model path or device changes.
    if is_model_loaded == False or IR_model_path != model_path or ov_device != XPU:
        IR_model_path = model_path
        ov_device = XPU
        print(f'Loading {IR_model_path} to {ov_device}')
        start = time.time()
        transcriber = openvino_genai.WhisperPipeline(IR_model_path, ov_device)
        end = time.time()
        print(f'Model loading time {end - start}')
        is_model_loaded = True
    raw_speech, sampling_rate = read_wav(wav_file)
    def_config = transcriber.get_generation_config()
    frame_num = len(raw_speech) / 16000
    # Whisper works on 30-second chunks; scale max_length for longer audio.
    if frame_num > 30:
        config = transcriber.get_generation_config()
        chunk_num = math.ceil(frame_num / 30)
        config.max_length = chunk_num * def_config.max_length
        transcriber.set_generation_config(config)
    print('Run Inference ...')
    start = time.time()
    result = transcriber.generate(raw_speech, task="transcribe", return_timestamps=True).chunks
    end = time.time()
    print(f'Duration of wave file : {len(raw_speech)/sampling_rate} second, Inference time {end - start} second')
    srt_lines = prepare_srt(result, filter_duration=len(raw_speech)/sampling_rate)
    output_file = Path(wav_file)
    print(output_file)
    with output_file.with_suffix(".srt").open("w", encoding='utf-8') as f:
        f.writelines(srt_lines)
    return srt_lines  #result


def main():
    demo = gr.Interface(
        transcribe,
        inputs=[
            gr.Textbox(value='./whisper-base'),
            gr.Radio(["CPU", "GPU", "NPU"], value="GPU"),
            gr.File(label="Wav file in mono/16KHz", file_types=[".wav", ".flac", ".mp3", ".m4a"], value=str(Path("jfk.wav")), scale=1),
        ],
        outputs=["text"],
        title="OpenAI Whisper Models on AIPC XPU",
        description="OpenAI Whisper Speech to Text with OpenVINO",
    )
    demo.launch()


if __name__ == "__main__":
    main()
```
### download_yt_audio.py
Requires `pip install yt_dlp`.
```python
import yt_dlp


def download(video_id: str) -> str:
    video_url = f'https://www.youtube.com/watch?v={video_id}'
    ydl_opts = {
        'format': 'm4a/bestaudio/best',
        'paths': {'home': 'audio/'},
        'outtmpl': {'default': '%(id)s.%(ext)s'},
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'm4a',
        }]
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        error_code = ydl.download([video_url])
        if error_code != 0:
            raise Exception('Failed to download video')
    return f'audio/{video_id}.m4a'


file_path = download('XVKCsBcV3EY')
```
### download_yt_video.py
```python
import yt_dlp


def download(video_id: str) -> str:
    video_url = f'https://www.youtube.com/watch?v={video_id}'
    # video
    ydl_opts = {
        "format": "best[ext=mp4]",
        "paths": {'home': 'video/'},
        "outtmpl": {'default': '%(id)s.%(ext)s'},
    }
    # audio
    #ydl_opts = {
    #    'format': 'm4a/bestaudio/best',
    #    'paths': {'home': 'audio/'},
    #    'outtmpl': {'default': '%(id)s.%(ext)s'},
    #    'postprocessors': [{
    #        'key': 'FFmpegExtractAudio',
    #        'preferredcodec': 'm4a',
    #    }]
    #}
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        error_code = ydl.download([video_url])
        if error_code != 0:
            raise Exception('Failed to download video')
    return f'video/{video_id}.mp4'


#file_path = download('XVKCsBcV3EY')
#file_path = download('nVdrtn20zdM')
file_path = download('wJmWa8-j0ao')
```
## Integrated download and transcription interface
The script below combines the yt-dlp audio downloader and the OpenVINO Whisper transcriber into a single two-tab Gradio app.
```python
import gradio as gr
import yt_dlp
import re
from pathlib import Path
import time
import openvino_genai
from transformers.pipelines.audio_utils import ffmpeg_read
import math
import os


def format_timestamp(seconds: float):
    """
    Format time in the format expected by SRT files.
    """
    assert seconds >= 0, "non-negative timestamp expected"
    milliseconds = round(seconds * 1000.0)
    hours = milliseconds // 3_600_000
    milliseconds -= hours * 3_600_000
    minutes = milliseconds // 60_000
    milliseconds -= minutes * 60_000
    seconds = milliseconds // 1_000
    milliseconds -= seconds * 1_000
    return (f"{hours}:" if hours > 0 else "00:") + f"{minutes:02d}:{seconds:02d},{milliseconds:03d}"


def prepare_srt(transcription, filter_duration=None):
    """
    Format transcription into SRT file format.
    """
    segment_lines = []
    for idx, segment in enumerate(transcription):
        timestamp = [segment.start_ts, segment.end_ts]
        # Handle the case where the model could not predict an ending timestamp,
        # which can happen if audio is cut off in the middle of a word.
        if segment.end_ts == -1:
            timestamp[1] = filter_duration
        if filter_duration is not None and (timestamp[0] >= math.floor(filter_duration) or timestamp[1] > math.ceil(filter_duration) + 1):
            break
        segment_lines.append(str(idx + 1) + "\n")
        time_start = format_timestamp(timestamp[0])
        time_end = format_timestamp(timestamp[1])
        time_str = f"{time_start} --> {time_end}\n"
        segment_lines.append(time_str)
        segment_lines.append(segment.text + "\n\n")
    return segment_lines


def read_wav(filepath):
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"Error: file {filepath} does not exist")
    with open(filepath, "rb") as f:
        inputs = f.read()
    sampling_rate = 16000
    raw_speech = ffmpeg_read(inputs, sampling_rate)
    #raw_speech, sampling_rate = librosa.load(filepath, sr=16000)
    print(f'Duration of wave file : {len(raw_speech)/sampling_rate} second')
    return raw_speech, sampling_rate


is_model_loaded = False
ov_device = 'GPU'
IR_model_path = './whisper-large-v3-turbo'


def transcribe(model_path, XPU, wav_file):
    global transcriber
    global is_model_loaded
    global ov_device
    global IR_model_path
    # Reload the pipeline only when the model path or device changes.
    if is_model_loaded == False or IR_model_path != model_path or ov_device != XPU:
        IR_model_path = model_path
        ov_device = XPU
        print(f'Loading {IR_model_path} to {ov_device}')
        start = time.time()
        transcriber = openvino_genai.WhisperPipeline(IR_model_path, ov_device)
        end = time.time()
        print(f'Model loading time {end - start}')
        is_model_loaded = True
    raw_speech, sampling_rate = read_wav(wav_file)
    def_config = transcriber.get_generation_config()
    frame_num = len(raw_speech) / 16000
    # Whisper works on 30-second chunks; scale max_length for longer audio.
    if frame_num > 30:
        config = transcriber.get_generation_config()
        chunk_num = math.ceil(frame_num / 30)
        config.max_length = chunk_num * def_config.max_length
        transcriber.set_generation_config(config)
    print('Run Inference ...')
    start = time.time()
    result = transcriber.generate(raw_speech, task="transcribe", return_timestamps=True).chunks
    end = time.time()
    print(f'Duration of wave file : {len(raw_speech)/sampling_rate} second, Inference time {end - start} second')
    srt_lines = prepare_srt(result, filter_duration=len(raw_speech)/sampling_rate)
    output_file = Path(wav_file)
    print(output_file)
    with output_file.with_suffix(".srt").open("w", encoding='utf-8') as f:
        f.writelines(srt_lines)
    return srt_lines  #result


def extract_video_id(url):
    # Extract the video ID from a YouTube URL with a regular expression.
    match = re.search(r"v=([a-zA-Z0-9_-]+)", url)
    if match:
        return match.group(1)  # return the matched video ID
    else:
        return None  # return None if the URL is not in the expected format


# Download the audio track of a YouTube video
def download_audio(url):
    try:
        ydl_opts = {
            'format': 'm4a/bestaudio/best',
            'paths': {'home': 'audio/'},
            'outtmpl': {'default': '%(id)s.%(ext)s'},
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'm4a',
            }]
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            error_code = ydl.download([url])
            if error_code != 0:
                raise Exception('Failed to download video')
        video_id = extract_video_id(url)
        return f'audio/{video_id}.m4a'
    except Exception as e:
        return f"Download error: {str(e)}"


# Build a simple web interface with Gradio
def gradio_interface():
    with gr.Blocks() as demo:
        gr.Markdown("# YouTube Audio Downloader")
        # YouTube audio download block
        with gr.Tab("YouTube audio download"):
            url_input = gr.Textbox(label="Enter a YouTube video URL", placeholder="e.g. https://www.youtube.com/watch?v=nwYHurRo4e0")
            download_button = gr.Button("Download audio")
            output = gr.File(label="Downloaded audio file")
            # Wire up the callback
            download_button.click(download_audio, inputs=[url_input], outputs=[output])
        gr.Markdown("# Speech Transcription")
        # Speech transcription block
        with gr.Tab("OpenVINO speech transcription"):
            model_input = gr.Textbox(value='./whisper-large-v3-turbo', label="Model path")
            device_input = gr.Radio(["CPU", "GPU", "NPU"], value="GPU", label="Device")
            audio_input = gr.File(label="WAV file (mono/16 kHz)", file_types=[".wav", ".flac", ".mp3", ".m4a"], value=str(Path("jfk.wav")), scale=1)
            transcribe_button = gr.Button("Start transcription")
            transcribe_output = gr.Textbox(label="Transcription result")
            # Wire up the callback
            transcribe_button.click(transcribe, inputs=[model_input, device_input, audio_input], outputs=[transcribe_output])
    demo.launch()


if __name__ == "__main__":
    gradio_interface()
```