# Patch to custom_ai_assistant/app.py
### June 26
```sh
python convert_and_optimize_asr.py
python convert_and_optimize_asr.py --quantize_weights int8
python convert_and_optimize_chat.py --chat_model_type TinyLlama-1.1B --quantize_weights int8
python convert_and_optimize_chat.py --chat_model_type TinyLlama-1.1B --quantize_weights int4
python app.py --asr_model_dir model/distil-whisper-large-v2-INT8 --chat_model_dir model/TinyLlama-1.1B-INT4
```
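
For context, a rough sketch of what the INT4 chat-model conversion presumably boils down to, using optimum-intel's weight-compression API; the Hugging Face model ID and output path here are assumptions, and the recipe's `convert_and_optimize_chat.py` is the authoritative version:

```python
from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig
from transformers import AutoTokenizer

model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # assumed source checkpoint
output_dir = "model/TinyLlama-1.1B-chat-INT4"

# export to OpenVINO IR with 4-bit weight compression (maps to --quantize_weights int4)
model = OVModelForCausalLM.from_pretrained(
    model_id,
    export=True,
    quantization_config=OVWeightQuantizationConfig(bits=4),
)
model.save_pretrained(output_dir)
AutoTokenizer.from_pretrained(model_id).save_pretrained(output_dir)
```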

https://github.com/openvinotoolkit/openvino_notebooks/blob/recipes/recipes/custom_ai_assistant/app.py
```diff
diff --git a/recipes/custom_ai_assistant/app.py b/recipes/custom_ai_assistant/app.py
index 7b657eea..e8722840 100644
--- a/recipes/custom_ai_assistant/app.py
+++ b/recipes/custom_ai_assistant/app.py
@@ -6,6 +6,8 @@ from typing import Tuple, List, Optional
 
 import gradio as gr
 import librosa
+import io
+from scipy.io import wavfile
 import numpy as np
 from optimum.intel import OVModelForCausalLM, OVModelForSpeechSeq2Seq
 from transformers import AutoConfig, AutoTokenizer, AutoProcessor, PreTrainedTokenizer, TextIteratorStreamer
@@ -69,6 +71,27 @@ asr_model: Optional[OVModelForSpeechSeq2Seq] = None
 asr_processor: Optional[AutoProcessor] = None
 
 
+def resample(audio, src_sample_rate, dst_sample_rate):
+    """
+    Resample audio to a specific sample rate
+
+    Parameters:
+        audio: input audio signal
+        src_sample_rate: source audio sample rate
+        dst_sample_rate: destination audio sample rate
+    Returns:
+        duration: audio duration in seconds, and the signal resampled to dst_sample_rate
+    """
+    if src_sample_rate == dst_sample_rate:
+        return audio.shape[0] / src_sample_rate, audio
+    duration = audio.shape[0] / src_sample_rate
+    resampled_data = np.zeros(shape=(int(duration * dst_sample_rate)), dtype=np.float32)
+    x_old = np.linspace(0, duration, audio.shape[0], dtype=np.float32)
+    x_new = np.linspace(0, duration, resampled_data.shape[0], dtype=np.float32)
+    resampled_audio = np.interp(x_new, x_old, audio)
+    return duration, resampled_audio.astype(np.float32)
+
+
 def load_asr_model(model_dir: Path) -> None:
     """
     Load automatic speech recognition model and assign it to a global variable
@@ -79,7 +102,7 @@ def load_asr_model(model_dir: Path) -> None:
     global asr_model, asr_processor
 
     # create a distil-whisper model and its processor
-    asr_model = OVModelForSpeechSeq2Seq.from_pretrained(model_dir, device="AUTO")
+    asr_model = OVModelForSpeechSeq2Seq.from_pretrained(model_dir, device="GPU")
     asr_processor = AutoProcessor.from_pretrained(model_dir)
 
 
@@ -93,8 +116,9 @@ def load_chat_model(model_dir: Path) -> None:
     global chat_model, chat_tokenizer, message_template
 
     # load llama model and its tokenizer
-    ov_config = {'PERFORMANCE_HINT': 'LATENCY', 'NUM_STREAMS': '1', "CACHE_DIR": ""}
-    chat_model = OVModelForCausalLM.from_pretrained(model_dir, device="AUTO", config=AutoConfig.from_pretrained(model_dir), ov_config=ov_config)
+    # ov_config = {'PERFORMANCE_HINT': 'LATENCY', 'NUM_STREAMS': '1', "CACHE_DIR": ""}
+    ov_config = {'PERFORMANCE_HINT': 'LATENCY', 'NUM_STREAMS': '1'}
+    chat_model = OVModelForCausalLM.from_pretrained(model_dir, device="GPU", config=AutoConfig.from_pretrained(model_dir), ov_config=ov_config)
     chat_tokenizer = AutoTokenizer.from_pretrained(model_dir)
     # neural chat requires different template than specified in the tokenizer
     message_template = NEURAL_CHAT_MODEL_TEMPLATE if ("neural-chat" in model_dir.name) else chat_tokenizer.default_chat_template
@@ -203,7 +227,9 @@ def transcribe(audio: Tuple[int, np.ndarray], conversation: List[List[str]]) ->
     sample_rate, audio = audio
 
     # the whisper model requires 16000Hz, not 44100Hz
-    audio = librosa.resample(audio.astype(np.float32), orig_sr=sample_rate, target_sr=AUDIO_WIDGET_SAMPLE_RATE).astype(np.int16)
+    # audio = librosa.resample(audio.astype(np.float32), orig_sr=sample_rate, target_sr=AUDIO_WIDGET_SAMPLE_RATE).astype(np.int16)
+    audio = audio.mean(axis=1) if audio.ndim > 1 else audio  # downmix stereo to mono
+    duration, audio = resample(audio, sample_rate, AUDIO_WIDGET_SAMPLE_RATE)
 
     # get input features from the audio
     input_features = asr_processor(audio, sampling_rate=AUDIO_WIDGET_SAMPLE_RATE, return_tensors="pt").input_features
@@ -311,7 +337,7 @@ def run(asr_model_dir: Path, chat_model_dir: Path, public_interface: bool = Fals
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('--asr_model_dir', type=str, default="model/distil-large-v2-FP16", help="Path to the automatic speech recognition model directory")
-    parser.add_argument('--chat_model_dir', type=str, default="model/llama2-7B-INT8", help="Path to the chat model directory")
+    parser.add_argument('--chat_model_dir', type=str, default="model/TinyLlama-1.1B-chat-INT4", help="Path to the chat model directory")
     parser.add_argument('--public_interface', default=False, action="store_true", help="Whether interface should be available publicly")
     args = parser.parse_args()
 
```
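
The patch swaps `librosa.resample` for a plain NumPy linear-interpolation resampler and downmixes the widget audio to mono before it reaches the ASR processor. Below is a minimal standalone sketch of the same technique; the 44.1 kHz stereo input shape is an assumption about what the Gradio audio widget delivers:

```python
import numpy as np

AUDIO_WIDGET_SAMPLE_RATE = 16000  # distil-whisper expects 16 kHz audio


def resample(audio, src_sample_rate, dst_sample_rate):
    """Linearly interpolate audio to dst_sample_rate; return (duration, float32 signal)."""
    duration = audio.shape[0] / src_sample_rate
    if src_sample_rate == dst_sample_rate:
        return duration, audio.astype(np.float32)
    x_old = np.linspace(0, duration, audio.shape[0], dtype=np.float32)
    x_new = np.linspace(0, duration, int(duration * dst_sample_rate), dtype=np.float32)
    return duration, np.interp(x_new, x_old, audio).astype(np.float32)


# hypothetical recording: 1 s of silence at 44.1 kHz, stereo int16
sample_rate, audio = 44100, np.zeros((44100, 2), dtype=np.int16)
audio = audio.mean(axis=1) if audio.ndim > 1 else audio  # downmix stereo to mono
duration, audio = resample(audio, sample_rate, AUDIO_WIDGET_SAMPLE_RATE)
print(duration, audio.shape)  # 1.0 (16000,)
```

Unlike `librosa.resample`, linear interpolation applies no anti-aliasing filter, so high frequencies can alias; for 16 kHz speech transcription that is generally an acceptable trade for dropping the librosa dependency.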
With `--asr_model_dir "model/distil-large-v2-INT8"` and `--chat_model_dir model/TinyLlama-1.1B-chat-INT4`, the models plus the model cache use less than 7 GB of memory.

With `--asr_model_dir "model/distil-large-v2-FP16"` and `--chat_model_dir model/TinyLlama-1.1B-chat-INT4`, the models plus the model cache use about 7.x GB.

