# Patch to custom_ai_assistant/app.py
### June 26
```sh
python convert_and_optimize_asr.py
python convert_and_optimize_asr.py --quantize_weights int8
python convert_and_optimize_chat.py --chat_model_type TinyLlama-1.1B --quantize_weights int8
python convert_and_optimize_chat.py --chat_model_type TinyLlama-1.1B --quantize_weights int4
python app.py --asr_model_dir model/distil-whisper-large-v2-INT8 --chat_model_dir model/TinyLlama-1.1B-INT4
```
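
For context, a rough sketch of what the INT4 chat-model conversion presumably boils down to, using optimum-intel's weight-compression API; the Hugging Face model ID and output path here are assumptions, and the recipe's `convert_and_optimize_chat.py` is the authoritative version:

```python
from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig
from transformers import AutoTokenizer

model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # assumed source checkpoint
output_dir = "model/TinyLlama-1.1B-chat-INT4"

# export to OpenVINO IR with 4-bit weight compression (maps to --quantize_weights int4)
model = OVModelForCausalLM.from_pretrained(
    model_id,
    export=True,
    quantization_config=OVWeightQuantizationConfig(bits=4),
)
model.save_pretrained(output_dir)
AutoTokenizer.from_pretrained(model_id).save_pretrained(output_dir)
```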

https://github.com/openvinotoolkit/openvino_notebooks/blob/recipes/recipes/custom_ai_assistant/app.py
```diff
diff --git a/recipes/custom_ai_assistant/app.py b/recipes/custom_ai_assistant/app.py
index 7b657eea..e8722840 100644
--- a/recipes/custom_ai_assistant/app.py
+++ b/recipes/custom_ai_assistant/app.py
@@ -6,6 +6,8 @@ from typing import Tuple, List, Optional
 
 import gradio as gr
 import librosa
+import io
+from scipy.io import wavfile
 import numpy as np
 from optimum.intel import OVModelForCausalLM, OVModelForSpeechSeq2Seq
 from transformers import AutoConfig, AutoTokenizer, AutoProcessor, PreTrainedTokenizer, TextIteratorStreamer
@@ -69,6 +71,27 @@ asr_model: Optional[OVModelForSpeechSeq2Seq] = None
 asr_processor: Optional[AutoProcessor] = None
 
 
+def resample(audio, src_sample_rate, dst_sample_rate):
+    """
+    Resample audio to a specific sample rate
+
+    Parameters:
+        audio: input audio signal
+        src_sample_rate: source audio sample rate
+        dst_sample_rate: destination audio sample rate
+    Returns:
+        duration: audio duration in seconds, and the signal resampled to dst_sample_rate
+    """
+    if src_sample_rate == dst_sample_rate:
+        return audio.shape[0] / src_sample_rate, audio
+    duration = audio.shape[0] / src_sample_rate
+    resampled_data = np.zeros(shape=(int(duration * dst_sample_rate)), dtype=np.float32)
+    x_old = np.linspace(0, duration, audio.shape[0], dtype=np.float32)
+    x_new = np.linspace(0, duration, resampled_data.shape[0], dtype=np.float32)
+    resampled_audio = np.interp(x_new, x_old, audio)
+    return duration, resampled_audio.astype(np.float32)
+
+
 def load_asr_model(model_dir: Path) -> None:
     """
     Load automatic speech recognition model and assign it to a global variable
@@ -79,7 +102,7 @@ def load_asr_model(model_dir: Path) -> None:
     global asr_model, asr_processor
 
     # create a distil-whisper model and its processor
-    asr_model = OVModelForSpeechSeq2Seq.from_pretrained(model_dir, device="AUTO")
+    asr_model = OVModelForSpeechSeq2Seq.from_pretrained(model_dir, device="GPU")
     asr_processor = AutoProcessor.from_pretrained(model_dir)
 
 
@@ -93,8 +116,9 @@ def load_chat_model(model_dir: Path) -> None:
     global chat_model, chat_tokenizer, message_template
 
     # load llama model and its tokenizer
-    ov_config = {'PERFORMANCE_HINT': 'LATENCY', 'NUM_STREAMS': '1', "CACHE_DIR": ""}
-    chat_model = OVModelForCausalLM.from_pretrained(model_dir, device="AUTO", config=AutoConfig.from_pretrained(model_dir), ov_config=ov_config)
+    # ov_config = {'PERFORMANCE_HINT': 'LATENCY', 'NUM_STREAMS': '1', "CACHE_DIR": ""}
+    ov_config = {'PERFORMANCE_HINT': 'LATENCY', 'NUM_STREAMS': '1'}
+    chat_model = OVModelForCausalLM.from_pretrained(model_dir, device="GPU", config=AutoConfig.from_pretrained(model_dir), ov_config=ov_config)
     chat_tokenizer = AutoTokenizer.from_pretrained(model_dir)
     # neural chat requires different template than specified in the tokenizer
     message_template = NEURAL_CHAT_MODEL_TEMPLATE if ("neural-chat" in model_dir.name) else chat_tokenizer.default_chat_template
@@ -203,7 +227,9 @@ def transcribe(audio: Tuple[int, np.ndarray], conversation: List[List[str]]) ->
     sample_rate, audio = audio
 
     # the whisper model requires 16000Hz, not 44100Hz
-    audio = librosa.resample(audio.astype(np.float32), orig_sr=sample_rate, target_sr=AUDIO_WIDGET_SAMPLE_RATE).astype(np.int16)
+    # audio = librosa.resample(audio.astype(np.float32), orig_sr=sample_rate, target_sr=AUDIO_WIDGET_SAMPLE_RATE).astype(np.int16)
+    audio = audio.mean(axis=1) if audio.ndim > 1 else audio  # downmix stereo to mono
+    duration, audio = resample(audio, sample_rate, AUDIO_WIDGET_SAMPLE_RATE)
 
     # get input features from the audio
     input_features = asr_processor(audio, sampling_rate=AUDIO_WIDGET_SAMPLE_RATE, return_tensors="pt").input_features
@@ -311,7 +337,7 @@ def run(asr_model_dir: Path, chat_model_dir: Path, public_interface: bool = Fals
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('--asr_model_dir', type=str, default="model/distil-large-v2-FP16", help="Path to the automatic speech recognition model directory")
-    parser.add_argument('--chat_model_dir', type=str, default="model/llama2-7B-INT8", help="Path to the chat model directory")
+    parser.add_argument('--chat_model_dir', type=str, default="model/TinyLlama-1.1B-chat-INT4", help="Path to the chat model directory")
     parser.add_argument('--public_interface', default=False, action="store_true", help="Whether interface should be available publicly")
     args = parser.parse_args()
 
```
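
The patch swaps `librosa.resample` for a plain NumPy linear-interpolation resampler and downmixes the widget audio to mono before it reaches the ASR processor. Below is a minimal standalone sketch of the same technique; the 44.1 kHz stereo input shape is an assumption about what the Gradio audio widget delivers:

```python
import numpy as np

AUDIO_WIDGET_SAMPLE_RATE = 16000  # distil-whisper expects 16 kHz audio


def resample(audio, src_sample_rate, dst_sample_rate):
    """Linearly interpolate audio to dst_sample_rate; return (duration, float32 signal)."""
    duration = audio.shape[0] / src_sample_rate
    if src_sample_rate == dst_sample_rate:
        return duration, audio.astype(np.float32)
    x_old = np.linspace(0, duration, audio.shape[0], dtype=np.float32)
    x_new = np.linspace(0, duration, int(duration * dst_sample_rate), dtype=np.float32)
    return duration, np.interp(x_new, x_old, audio).astype(np.float32)


# hypothetical recording: 1 s of silence at 44.1 kHz, stereo int16
sample_rate, audio = 44100, np.zeros((44100, 2), dtype=np.int16)
audio = audio.mean(axis=1) if audio.ndim > 1 else audio  # downmix stereo to mono
duration, audio = resample(audio, sample_rate, AUDIO_WIDGET_SAMPLE_RATE)
print(duration, audio.shape)  # 1.0 (16000,)
```

Unlike `librosa.resample`, linear interpolation applies no anti-aliasing filter, so high frequencies can alias; for 16 kHz speech transcription that is generally an acceptable trade for dropping the librosa dependency.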
With `--asr_model_dir "model/distil-large-v2-INT8"` and `--chat_model_dir model/TinyLlama-1.1B-chat-INT4`, the models plus the model cache use less than 7 GB of memory.

With `--asr_model_dir "model/distil-large-v2-FP16"` and `--chat_model_dir model/TinyLlama-1.1B-chat-INT4`, the models plus the model cache use about 7.x GB.

