# Xử lý file audio được ghi âm và trộn nhiễu
- Cài đặt pyAudioAnalysis:
https://github.com/tyiannak/pyAudioAnalysis/wiki/2.-General
- Phát hiện các vùng tiếng nói trong file ghi âm -> cắt nguyên các đoạn tiếng nói được phát hiện và cho vào mảng tạm toàn số 0, dịch trái hoặc phải 1 đoạn ngẫu nhiên. Code sample ở mục dưới [2]

- Thêm âm thanh nhiễu vào các file trên
```
git clone https://github.com/vpeopleonatank/example-transform-block-mix-noise
```
```
# Build the noise-mixing image from the cloned repo (run inside its folder)
docker build -t transform-mix-noise .
# Host folder that gets mounted into the container at /data
DIR=/home/vpoat/Music/unknown/original
# Mix noise into every file under data_set/, writing 2 variants per input
docker run --rm -it -v $DIR:/data transform-mix-noise --in-folder /data/data_set/ --out-directory /data/data_set_out/ --frequency 16000 --out-count 2
```
```
# Run the noise-mixing container once per label folder (3 variants each).
label=("khong" "mot" "hai" "ba" "bon" "nam" "bat" "tat" "truoc" "sau")
# Quote the array expansion so entries are never word-split or glob-expanded.
for l in "${label[@]}"; do
  docker run --rm -it -v $DIR:/data transform-mix-noise --in-folder /data/train/"$l" --out-directory /data/aug_train2/"$l" --frequency 16000 --out-count 3
done
```
- Thêm tiền tố vào file theo thư mục
```python=
import os
import shutil

# Directory holding one sub-folder per label (ba, bon, nam, ...).
BASE_DIR = "/tmp/test_data"
# Output directory that receives the prefixed copies.
OUT_DIR = "/tmp/out_data"


def copy_wavs_with_prefix(base_dir, out_dir):
    """Copy every .wav file found in base_dir's sub-folders into out_dir.

    Each copy is renamed "<folder>.<original name>" so the label (the parent
    folder name) becomes a file-name prefix.
    """
    os.makedirs(out_dir, exist_ok=True)
    for folder in os.listdir(base_dir):
        sub_folder = os.path.join(base_dir, folder)
        if not os.path.isdir(sub_folder):
            continue
        for file in os.listdir(sub_folder):
            file_path = os.path.join(sub_folder, file)
            # Only copy regular files with a .wav extension (the original
            # substring test 'wav' in ext also matched e.g. ".wavx").
            if os.path.isfile(file_path) and os.path.splitext(file)[-1].lower() == ".wav":
                shutil.copy(file_path, os.path.join(out_dir, f"{folder}.{file}"))


if __name__ == "__main__":
    copy_wavs_with_prefix(BASE_DIR, OUT_DIR)
```
- Chuyển nhiều file .m4a sang .wav (câu lệnh cho linux)
```bash
# Convert every .m4a in the current directory to a 16 kHz .wav (same basename)
for f in *.m4a; do ffmpeg -i "$f" -ar 16000 "${f%.m4a}.wav"; done
```
- Chuyển nhiều file stereo .wav sang mono .wav
```bash
# Downmix every .wav to 16 kHz mono; output keeps the name with a "_" prefix
for f in *.wav; do ffmpeg -i "$f" -ar 16000 -ac 1 "_$f"; done
```
- Chuyển nhiều file mp3 (mono và stereo) sang mono wav
```bash
# Convert every .mp3 (mono or stereo) to 16 kHz mono 16-bit PCM .wav
for f in *.mp3; do ffmpeg -i "$f" -acodec pcm_s16le -ac 1 -ar 16000 "${f%.mp3}.wav"; done
```
- [2]
```python=
from pyAudioAnalysis import audioBasicIO as aIO
from pyAudioAnalysis import audioSegmentation as aS
from pydub import AudioSegment
from scipy.io import wavfile
import numpy as np
import random
import uuid
import os
def fix_over_duration(segment, segment_length, total_duration):
    """Clamp [start, end] into [0, total_duration].

    When a bound is clamped, the other bound is moved so the segment keeps
    segment_length seconds whenever the audio is long enough; otherwise the
    segment covers the whole clip. Mutates and returns *segment*.
    """
    if segment[0] < 0:
        # Anchor at 0 and extend forward to the target length (or clip end).
        segment[0] = 0
        if segment_length > total_duration:
            segment[1] = total_duration
        else:
            segment[1] = segment_length
    elif segment[1] > total_duration:
        # Anchor at the clip end and pull the start back by the target length.
        segment[1] = total_duration
        if total_duration < segment_length:
            segment[0] = 0
        else:
            segment[0] = total_duration - segment_length
    return segment


def shift_by_second(segment, total_duration, shift_direction, segment_length):
    """Re-center *segment* to segment_length seconds, then shift it randomly.

    The shift magnitude is uniform in [0.05, 0.3) seconds; shift_direction is
    'left', 'right', or 'both' (random direction). The result is clamped to
    [0, total_duration]. (A leftover ipdb.set_trace() debug trap was removed.)
    """
    half_length = segment_length / 2
    center_pos = (segment[0] + segment[1]) / 2
    res_segment = [center_pos - half_length, center_pos + half_length]
    res_segment = fix_over_duration(res_segment, segment_length, total_duration)
    shift = random.uniform(0.05, 0.3)
    if shift_direction == 'left':
        shift = -shift
    elif shift_direction == 'both':
        # np.random.randint(0, 2): 0 means left, 1 means right
        if np.random.randint(0, 2) == 0:
            shift = -shift
    res_segment = [res_segment[0] + shift, res_segment[1] + shift]
    return fix_over_duration(res_segment, segment_length, total_duration)
SEGMENT_LENGTH = 1.0  # target clip length in seconds

if __name__ == "__main__":
    base_path = "/mnt/Data/Project/speech_command_recognition/code/example_data/khai/Ba1.wav"
    label_name = "ba"
    out_dir = "/home/vpoat/ghq/github.com/vpeopleonatank/example-transform-block-mix-noise/data_set"
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    [Fs, x] = aIO.read_audio_file(base_path)
    # Per-label silence_removal settings that worked during tuning:
    # segments = aS.silence_removal(x, Fs, 0.020, 0.020, smooth_window=0.1, weight=0.2, plot=True)   # Ba
    # segments = aS.silence_removal(x, Fs, 0.020, 0.020, smooth_window=0.3, weight=0.5, plot=False)  # Bat
    print(x.shape)
    segments = aS.silence_removal(x, Fs, 0.020, 0.020, smooth_window=0.4,
                                  weight=0.4, plot=True)  # Bon hai khong
    # segments = aS.silence_removal(x, Fs, 0.020, 0.020, smooth_window=0.3, weight=0.5, plot=True)  # mot nam sau tat truoc
    print(len(segments))
    total_duration = x.shape[0] / Fs
    for iS, segment in enumerate(segments):
        # adjusted_segment = shift_by_second(segment, total_duration, 'both', SEGMENT_LENGTH)
        start_pos = int(Fs * segment[0])
        end_pos = int(Fs * segment[1])
        # x[start_pos:end_pos] has end_pos - start_pos samples; the original
        # "+ 1" over-counted by one sample and under-padded the clip.
        duration = end_pos - start_pos
        segment_data = x[start_pos:end_pos].copy()
        # Zero-pad up to SEGMENT_LENGTH seconds. Bug fix: the padded array was
        # previously assigned to a misspelled name ("secgment_data") and the
        # padding was silently thrown away.
        segment_data = np.pad(segment_data,
                              (0, max(0, int(SEGMENT_LENGTH * Fs) - duration)),
                              'constant', constant_values=(0, 0))
        # Randomly shift the clip and zero the samples that wrapped around.
        direction = np.random.randint(0, 2)  # 0: left, 1: right
        if direction == 0:
            shift = -int(random.uniform(0.0, 0.05) * Fs)
            segment_data = np.roll(segment_data, shift)
            segment_data[shift:] = 0
        else:
            shift = int(random.uniform(0.1, 0.6) * Fs)
            segment_data = np.roll(segment_data, shift)
            segment_data[:shift] = 0
        wavfile.write(f"{out_dir}/{label_name}.{str(uuid.uuid4())[:5]}.wav",
                      Fs, segment_data)
```
# Peak normalization
```bash
INPUT_DIR='/mnt/Data/Project/speech_command_recognition/Data/filtered/train_test_not_mixed_noise/train_set'
OUT_DIR='/mnt/Data/Project/speech_command_recognition/Data/filtered/train_test_not_mixed_noise_normalized/train_set'
# Peak-normalize every file to 0 dBFS, keeping the original file name.
# Quote "$file" everywhere so paths containing spaces do not break the loop.
for file in "$INPUT_DIR"/*
do
  out_file="$(basename -- "$file")"
  ffmpeg-normalize "$file" --normalization-type peak --target-level 0 -f --output "$OUT_DIR/$out_file"
done
```