# Voice Identifier
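Notes from several attempts at detecting when Moist Critical is speaking in an audio file and pulling out the matching timestamps, from a first GMM sketch through to a working SpeechBrain pipeline.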
```
import numpy as np
from scipy.io import wavfile
import python_speech_features as psf
from sklearn.mixture import GaussianMixture

def extract_features(audio_file):
    # Load the audio file
    (rate, sig) = wavfile.read(audio_file)
    # Extract MFCCs as features (nfilt must be >= numcep, so raise it to 40 too)
    mfcc_features = psf.mfcc(sig, rate, numcep=40, nfilt=40)
    return mfcc_features
def train_voice_recognition_model(features):
    # Fit a Gaussian Mixture Model to the target speaker's frames.
    # GMMs are unsupervised, so no labels are passed; train only on
    # audio known to be Moist Critical.
    gmm = GaussianMixture(n_components=16)
    gmm.fit(features)
    return gmm
def predict_voice(gmm, features, threshold=-60.0):
    # Score frames under the trained GMM: higher average log-likelihood means
    # the audio sounds more like the training speaker. The threshold here is
    # a placeholder and needs tuning on held-out clips.
    return gmm.score_samples(features).mean() > threshold
def find_moist_critical_voice(audio_files, gmm):
    moist_critical_features = []
    moist_critical_timestamps = []
    # Loop through all audio files
    for audio_file in audio_files:
        features = extract_features(audio_file)
        # Use the trained model to decide whether Moist Critical is speaking
        is_moist_critical_speaking = predict_voice(gmm, features)
        # If he is, keep the features (timestamp extraction was never finished)
        if is_moist_critical_speaking:
            moist_critical_features.append(features)
            # moist_critical_timestamps.append(get_timestamps_for_moist_critical_speech(audio_file))
    return moist_critical_timestamps
# Example usage:
# First, gather MFCC features for clips known to contain Moist Critical's voice
moist_critical_features = ...  # e.g. stack extract_features() output from labelled clips

# Train the voice recognition model (unsupervised, so no labels needed)
gmm = train_voice_recognition_model(moist_critical_features)

# Search for Moist Critical's voice in new audio files
# (note: wavfile.read only handles WAV, so mp3 input must be converted first)
audio_files = ["Alleged.mp3"]
find_moist_critical_voice(audio_files, gmm)
```
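The GMM above only ever sees the target speaker, so on its own it has nothing to compare against. The standard fix is a second, background GMM trained on other voices, with a per-frame log-likelihood ratio deciding between the two. A minimal sketch, where `target_feats` and `background_feats` are placeholder names for MFCC matrices built with `extract_features()`:
```
import numpy as np
from sklearn.mixture import GaussianMixture

# Placeholder feature matrices (n_frames x n_ceps); in practice, build these
# with extract_features() from labelled clips.
target_feats = np.random.randn(500, 40)      # frames known to be Moist Critical
background_feats = np.random.randn(500, 40)  # frames from other speakers

target_gmm = GaussianMixture(n_components=16, covariance_type="diag").fit(target_feats)
background_gmm = GaussianMixture(n_components=16, covariance_type="diag").fit(background_feats)

def frame_llr(feats):
    # Per-frame log-likelihood ratio: positive values lean towards the target speaker.
    return target_gmm.score_samples(feats) - background_gmm.score_samples(feats)

print(frame_llr(target_feats).mean() > 0)  # should trend True on target audio
```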
### Better Model
```
import librosa
import numpy as np

# Load the trained model (any per-frame classifier with a predict() method)
model = ...  # Your trained model
# Load an audio file
filename = 'sample_audio.wav'
samples, sr = librosa.load(filename)
# Extract audio features from the audio file
mfccs = librosa.feature.mfcc(y=samples, sr=sr, n_mfcc=40)
# Compute the number of frames
n_frames = mfccs.shape[1]
# Compute the frame duration in seconds
frame_duration = float(librosa.get_duration(y=samples, sr=sr) / n_frames)
# Initialize the time stamp
timestamp = 0
# Initialize a list to store the time stamps
time_stamps = []
for i in range(n_frames):
    # Reshape the i-th frame's features into a 2D array (1 sample x n_mfcc)
    features = np.reshape(mfccs[:, i], (1, -1))
    # Classify the frame with the trained model
    prediction = model.predict(features)
    # predict() returns an array, so compare its single element
    if prediction[0] == "Moist Critical":
        time_stamps.append(timestamp)
    # Advance the running timestamp by one frame
    timestamp += frame_duration
# Print the time stamps when Moist Critical is speaking
print("Moist Critical is speaking at the following time stamps:")
for t in time_stamps:
    print(t)
```
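One raw timestamp per matching frame is noisy output, since MFCC frames are only ~23 ms apart at librosa's defaults. A small post-processing step can merge runs of consecutive hits into (start, end) spans; in the sketch below, `flags` stands for the per-frame boolean predictions collected in the loop above:
```
def merge_frames(flags, frame_duration):
    # Collapse consecutive True frames into (start, end) spans in seconds.
    spans, start = [], None
    for i, flag in enumerate(flags):
        if flag and start is None:
            start = i * frame_duration                 # a span opens
        elif not flag and start is not None:
            spans.append((start, i * frame_duration))  # the span closes
            start = None
    if start is not None:                              # a span ran to the end
        spans.append((start, len(flags) * frame_duration))
    return spans

# merge_frames([True, True, False, True], 0.023) -> [(0.0, 0.046), (0.069, 0.092)]
```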
### SPEECHBRAIN
```
# NOTE: this block is exploratory pseudocode and never ran; "speech_brain" and
# its load() call are stand-ins rather than SpeechBrain's real API (the
# Working Version below uses the actual interface).
import speech_brain
import torch
from torch.utils.data import DataLoader

# Load the pre-trained model
model = speech_brain.load("en-gb")
# Define the training data
train_data = [
    {
        "text": "Good morning everyone, I am honored to be speaking with you today. Our company has been facing some difficult challenges, but I am confident that we will overcome them.",
        "timestamps": [(0, 5), (20, 25), (60, 70)],
    },
    {
        "text": "Ladies and gentlemen, the state of our economy is not good. We need to take immediate action to turn things around. We cannot afford to wait any longer.",
        "timestamps": [(0, 5), (30, 40), (60, 65)],
    },
    {
        "text": "It is my pleasure to announce that our new product line has been a huge success. Sales have exceeded all expectations and we are now expanding into new markets.",
        "timestamps": [(0, 5), (35, 40), (70, 80)],
    },
]
# Define a custom dataset
class TimestampDataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.data = data

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return len(self.data)
# Wrap the training data in the custom dataset
dataset = TimestampDataset(train_data)
# Define a data loader
data_loader = DataLoader(dataset, batch_size=32, shuffle=True)
# Define the loss function
criterion = torch.nn.MSELoss()
# Define the optimizer
optimizer = torch.optim.Adam(model.parameters())
# Train the model
for epoch in range(10):  # number of training epochs
    for i, sample in enumerate(data_loader):
        text = sample["text"]
        target = sample["timestamps"]
        # Forward pass
        output = model(text)
        loss = criterion(output, target)
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Print the loss every 100 steps
        if (i + 1) % 100 == 0:
            print(f"Epoch: {epoch + 1}/10, Step: {i + 1}/{len(data_loader)}, Loss: {loss.item()}")
# Save the fine-tuned model
model.save("timestamp-extraction-model.pth")
# Load the fine-tuned model
model = speech_brain.load("timestamp-extraction-model.pth")
# Define the input speech transcript
input_text = "This is a speech transcript containing critical talks."
# Generate the timestamps for critical talks
output = model(input_text)
# Convert the output to timestamps
timestamps = []
for i, score in enumerate(output):
    if score > 0.5:  # threshold for detecting a critical talk
        start = i * 5  # assuming a 5 second interval
        end = start + 5
        timestamps.append((start, end))
# Print the extracted timestamps
print(timestamps)
```
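Neither `speech_brain.load("en-gb")` nor the fine-tuning loop above ever executed; that API is invented. The version below sticks to speechbrain's real pretrained speaker-verification interface and does run.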
### Working Version
```
from speechbrain.pretrained import SpeakerRecognition
verification = SpeakerRecognition.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb", savedir="pretrained_models/spkrec-ecapa-voxceleb")
file1 = 'speech_partitions/segment_1.wav'
file2 = 'speech_partitions_at/segment_14.wav'
score, prediction = verification.verify_files(file1, file2)
print(score)
print(prediction)  # True = same speaker, False = different speaker
```
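`verify_files()` only compares two clips, so locating timestamps still means sweeping the reference against every partition. A rough sketch of that sweep, assuming fixed-length segments; the 5-second length and the `segment_N.wav` naming are assumptions, not something the two files above guarantee:
```
import os

reference = 'speech_partitions/segment_1.wav'  # clip known to contain the target voice
segment_dir = 'speech_partitions'
segment_seconds = 5  # assumed fixed partition length

for name in sorted(os.listdir(segment_dir)):
    if not name.endswith('.wav'):
        continue
    score, same = verification.verify_files(reference, os.path.join(segment_dir, name))
    if same:
        # segment_N.wav -> N, then map the index back to a rough offset
        idx = int(name.split('_')[-1].split('.')[0])
        print(f"match at ~{idx * segment_seconds}s: {name} (score {float(score):.3f})")
```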
### Librosa Version (isn't as accurate but is fast)
```
import librosa
import numpy as np
import os
# Load the pre-recorded sample
y1, sr1 = librosa.load("/Users/harshbhatia/Documents/CBD/Charlie_CBD/data_collection/Alleged.wav")
# Extract features for the pre-recorded sample
mfcc1 = librosa.feature.mfcc(y=y1, sr=sr1)
mean1 = np.mean(mfcc1, axis=1)
# Initialize a counter for the failed cases
counter = 0
# Iterate over the files in the folder speech_partitions
for file in os.listdir("/Users/harshbhatia/Documents/CBD/Charlie_CBD/data_collection/speech_partitions"):
    # Load the audio file
    y2, sr2 = librosa.load(os.path.join("/Users/harshbhatia/Documents/CBD/Charlie_CBD/data_collection/speech_partitions", file))
    # Extract features for the audio file
    mfcc2 = librosa.feature.mfcc(y=y2, sr=sr2)
    mean2 = np.mean(mfcc2, axis=1)
    # Calculate the cosine similarity between the mean MFCC vectors
    similarity = np.dot(mean1, mean2) / (np.linalg.norm(mean1) * np.linalg.norm(mean2))
    # Flag the segment if similarity falls below the threshold
    if similarity < 0.99:
        counter += 1
        print(similarity)
        print(f"The speaker is not the same person for file {file}")
# Print the counter
print(f"Number of failed cases: {counter}")
```
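Mean-pooling the MFCCs discards all temporal structure, so two speakers with a similar overall timbre can still score above 0.99; the threshold is worth sanity-checking against segments the SpeechBrain verifier labels confidently before trusting the failure counts.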