# Voice Identifier
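Notes from several attempts at detecting when Moist Critical is speaking in an audio file and pulling out the matching timestamps, from a first GMM sketch through to a working SpeechBrain pipeline.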
```
import numpy as np
from scipy.io import wavfile
import python_speech_features as psf
from sklearn.mixture import GaussianMixture

def extract_features(audio_file):
    # Load the audio file
    (rate, sig) = wavfile.read(audio_file)
    # Extract MFCCs as features (nfilt must be >= numcep, so raise it to 40 too)
    mfcc_features = psf.mfcc(sig, rate, numcep=40, nfilt=40)
    return mfcc_features
def train_voice_recognition_model(features):
    # Fit a Gaussian Mixture Model to the target speaker's frames.
    # GMMs are unsupervised, so no labels are passed; train only on
    # audio known to be Moist Critical.
    gmm = GaussianMixture(n_components=16)
    gmm.fit(features)
    return gmm
def predict_voice(gmm, features, threshold=-60.0):
    # Score frames under the trained GMM: higher average log-likelihood means
    # the audio sounds more like the training speaker. The threshold here is
    # a placeholder and needs tuning on held-out clips.
    return gmm.score_samples(features).mean() > threshold
def find_moist_critical_voice(audio_files, gmm):
    moist_critical_features = []
    moist_critical_timestamps = []
    # Loop through all audio files
    for audio_file in audio_files:
        features = extract_features(audio_file)
        # Use the trained model to decide whether Moist Critical is speaking
        is_moist_critical_speaking = predict_voice(gmm, features)
        # If he is, keep the features (timestamp extraction was never finished)
        if is_moist_critical_speaking:
            moist_critical_features.append(features)
            # moist_critical_timestamps.append(get_timestamps_for_moist_critical_speech(audio_file))
    return moist_critical_timestamps
# Example usage:
# First, gather MFCC features for clips known to contain Moist Critical's voice
moist_critical_features = ...  # e.g. stack extract_features() output from labelled clips

# Train the voice recognition model (unsupervised, so no labels needed)
gmm = train_voice_recognition_model(moist_critical_features)

# Search for Moist Critical's voice in new audio files
# (note: wavfile.read only handles WAV, so mp3 input must be converted first)
audio_files = ["Alleged.mp3"]
find_moist_critical_voice(audio_files, gmm)
```
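The GMM above only ever sees the target speaker, so on its own it has nothing to compare against. The standard fix is a second, background GMM trained on other voices, with a per-frame log-likelihood ratio deciding between the two. A minimal sketch, where `target_feats` and `background_feats` are placeholder names for MFCC matrices built with `extract_features()`:
```
import numpy as np
from sklearn.mixture import GaussianMixture

# Placeholder feature matrices (n_frames x n_ceps); in practice, build these
# with extract_features() from labelled clips.
target_feats = np.random.randn(500, 40)      # frames known to be Moist Critical
background_feats = np.random.randn(500, 40)  # frames from other speakers

target_gmm = GaussianMixture(n_components=16, covariance_type="diag").fit(target_feats)
background_gmm = GaussianMixture(n_components=16, covariance_type="diag").fit(background_feats)

def frame_llr(feats):
    # Per-frame log-likelihood ratio: positive values lean towards the target speaker.
    return target_gmm.score_samples(feats) - background_gmm.score_samples(feats)

print(frame_llr(target_feats).mean() > 0)  # should trend True on target audio
```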
### Better Model
```
import librosa
import numpy as np

# Load the trained model (any per-frame classifier with a predict() method)
model = ...  # Your trained model
# Load an audio file
filename = 'sample_audio.wav'
samples, sr = librosa.load(filename)
# Extract audio features from the audio file
mfccs = librosa.feature.mfcc(y=samples, sr=sr, n_mfcc=40)
# Compute the number of frames
n_frames = mfccs.shape[1]
# Compute the frame duration in seconds
frame_duration = float(librosa.get_duration(y=samples, sr=sr) / n_frames)
# Initialize the time stamp
timestamp = 0
# Initialize a list to store the time stamps
time_stamps = []
for i in range(n_frames):
    # Reshape the i-th frame's features into a 2D array (1 sample x n_mfcc)
    features = np.reshape(mfccs[:, i], (1, -1))
    # Classify the frame with the trained model
    prediction = model.predict(features)
    # predict() returns an array, so compare its single element
    if prediction[0] == "Moist Critical":
        time_stamps.append(timestamp)
    # Advance the running timestamp by one frame
    timestamp += frame_duration
# Print the time stamps when Moist Critical is speaking
print("Moist Critical is speaking at the following time stamps:")
for t in time_stamps:
    print(t)
```
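One raw timestamp per matching frame is noisy output, since MFCC frames are only ~23 ms apart at librosa's defaults. A small post-processing step can merge runs of consecutive hits into (start, end) spans; in the sketch below, `flags` stands for the per-frame boolean predictions collected in the loop above:
```
def merge_frames(flags, frame_duration):
    # Collapse consecutive True frames into (start, end) spans in seconds.
    spans, start = [], None
    for i, flag in enumerate(flags):
        if flag and start is None:
            start = i * frame_duration                 # a span opens
        elif not flag and start is not None:
            spans.append((start, i * frame_duration))  # the span closes
            start = None
    if start is not None:                              # a span ran to the end
        spans.append((start, len(flags) * frame_duration))
    return spans

# merge_frames([True, True, False, True], 0.023) -> [(0.0, 0.046), (0.069, 0.092)]
```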
### SPEECHBRAIN
```
# NOTE: this block is exploratory pseudocode and never ran; "speech_brain" and
# its load() call are stand-ins rather than SpeechBrain's real API (the
# Working Version below uses the actual interface).
import speech_brain
import torch
from torch.utils.data import DataLoader

# Load the pre-trained model
model = speech_brain.load("en-gb")
# Define the training data
train_data = [
    {
        "text": "Good morning everyone, I am honored to be speaking with you today. Our company has been facing some difficult challenges, but I am confident that we will overcome them.",
        "timestamps": [(0, 5), (20, 25), (60, 70)],
    },
    {
        "text": "Ladies and gentlemen, the state of our economy is not good. We need to take immediate action to turn things around. We cannot afford to wait any longer.",
        "timestamps": [(0, 5), (30, 40), (60, 65)],
    },
    {
        "text": "It is my pleasure to announce that our new product line has been a huge success. Sales have exceeded all expectations and we are now expanding into new markets.",
        "timestamps": [(0, 5), (35, 40), (70, 80)],
    },
]
# Define a custom dataset
class TimestampDataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.data = data

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return len(self.data)
# Wrap the training data in the custom dataset
dataset = TimestampDataset(train_data)
# Define a data loader
data_loader = DataLoader(dataset, batch_size=32, shuffle=True)
# Define the loss function
criterion = torch.nn.MSELoss()
# Define the optimizer
optimizer = torch.optim.Adam(model.parameters())
# Train the model
for epoch in range(10):  # number of training epochs
    for i, sample in enumerate(data_loader):
        text = sample["text"]
        target = sample["timestamps"]
        # Forward pass
        output = model(text)
        loss = criterion(output, target)
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Print the loss every 100 steps
        if (i + 1) % 100 == 0:
            print(f"Epoch: {epoch + 1}/10, Step: {i + 1}/{len(data_loader)}, Loss: {loss.item()}")
# Save the fine-tuned model
model.save("timestamp-extraction-model.pth")
# Load the fine-tuned model
model = speech_brain.load("timestamp-extraction-model.pth")
# Define the input speech transcript
input_text = "This is a speech transcript containing critical talks."
# Generate the timestamps for critical talks
output = model(input_text)
# Convert the output to timestamps
timestamps = []
for i, score in enumerate(output):
    if score > 0.5:  # threshold for detecting a critical talk
        start = i * 5  # assuming a 5 second interval
        end = start + 5
        timestamps.append((start, end))
# Print the extracted timestamps
print(timestamps)
```
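Neither `speech_brain.load("en-gb")` nor the fine-tuning loop above ever executed; that API is invented. The version below sticks to speechbrain's real pretrained speaker-verification interface and does run.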
### Working Version
```
from speechbrain.pretrained import SpeakerRecognition
verification = SpeakerRecognition.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb", savedir="pretrained_models/spkrec-ecapa-voxceleb")
file1 = 'speech_partitions/segment_1.wav'
file2 = 'speech_partitions_at/segment_14.wav'
score, prediction = verification.verify_files(file1, file2)
print(score)
print(prediction)  # True = same speaker, False = different speaker
```
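`verify_files()` only compares two clips, so locating timestamps still means sweeping the reference against every partition. A rough sketch of that sweep, assuming fixed-length segments; the 5-second length and the `segment_N.wav` naming are assumptions, not something the two files above guarantee:
```
import os

reference = 'speech_partitions/segment_1.wav'  # clip known to contain the target voice
segment_dir = 'speech_partitions'
segment_seconds = 5  # assumed fixed partition length

for name in sorted(os.listdir(segment_dir)):
    if not name.endswith('.wav'):
        continue
    score, same = verification.verify_files(reference, os.path.join(segment_dir, name))
    if same:
        # segment_N.wav -> N, then map the index back to a rough offset
        idx = int(name.split('_')[-1].split('.')[0])
        print(f"match at ~{idx * segment_seconds}s: {name} (score {float(score):.3f})")
```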
### Librosa Version (isn't as accurate but is fast)
```
import librosa
import numpy as np
import os
# Load the pre-recorded sample
y1, sr1 = librosa.load("/Users/harshbhatia/Documents/CBD/Charlie_CBD/data_collection/Alleged.wav")
# Extract features for the pre-recorded sample
mfcc1 = librosa.feature.mfcc(y=y1, sr=sr1)
mean1 = np.mean(mfcc1, axis=1)
# Initialize a counter for the failed cases
counter = 0
# Iterate over the files in the folder speech_partitions
for file in os.listdir("/Users/harshbhatia/Documents/CBD/Charlie_CBD/data_collection/speech_partitions"):
    # Load the audio file
    y2, sr2 = librosa.load(os.path.join("/Users/harshbhatia/Documents/CBD/Charlie_CBD/data_collection/speech_partitions", file))
    # Extract features for the audio file
    mfcc2 = librosa.feature.mfcc(y=y2, sr=sr2)
    mean2 = np.mean(mfcc2, axis=1)
    # Calculate the cosine similarity between the mean MFCC vectors
    similarity = np.dot(mean1, mean2) / (np.linalg.norm(mean1) * np.linalg.norm(mean2))
    # Flag the segment if similarity falls below the threshold
    if similarity < 0.99:
        counter += 1
        print(similarity)
        print(f"The speaker is not the same person for file {file}")
# Print the counter
print(f"Number of failed cases: {counter}")
```
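Mean-pooling the MFCCs discards all temporal structure, so two speakers with a similar overall timbre can still score above 0.99; the threshold is worth sanity-checking against segments the SpeechBrain verifier labels confidently before trusting the failure counts.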