聲音處理 - HackMD

# 聲音處理 ## 大家都可以在下方留下code互相分享！！ ## 波形圖 #### librosa ```python= from pyhht.visualization import plot_imfs import matplotlib.pyplot as plt import librosa import librosa.display import IPython.display as ipd path = 'C:/Users/User/sabrina/sabrina_T01-1_0625.wav' data , sr = librosa.load(path,sr=16000) data = data/abs(data).max() plt.plot(data) ipd.Audio(data,rate=16000) #播放 ``` #### wave ```python= import wave import matplotlib.pyplot as plt import numpy as np # 讀檔 f = wave.open(r"C:/Users/User/sabrina/sabrina_T01-1_0625.wav", "rb") # 格式資訊 # (nchannels, sampwidth, framerate, nframes, comptype, compname) params = f.getparams() nchannels, sampwidth, framerate, nframes = params[:4] # 波形數據 str_data = f.readframes(nframes) f.close() # bytes to np.short wave_data = np.frombuffer(str_data, dtype=np.short) # 轉換為雙聲道 # wave_data.shape = -1, 2 # wave_data = wave_data.T # 總時間長度 time = np.arange(0, nframes) * (1.0 / framerate) # 音訊資訊 print(nchannels, sampwidth, framerate, nframes) #nchannels:聲道數 ;sampwidth:量化位數（byte） ;framerate(sample rate):採樣頻率 ; nframes:採樣點數 # 畫波形 plt.subplot(211) plt.plot(time, wave_data) # plt.plot(time, wave_data[0]) #雙聲道音檔的畫法 plt.subplot(212) plt.plot(time, wave_data, c="g") # plt.plot(time, wave_data[1], c="g")#雙聲道音檔的畫法 plt.xlabel("time (seconds)") plt.show() ``` <br> ## Spectrogram ```python= import librosa import librosa.display # 如果沒有 import 的話會 module 'librosa' has no attribute 'display' import numpy as np import matplotlib as plt import matplotlib.pyplot as plt # 如果沒有 import 的話會 module 'matplotlib' has no attribute 'title' path='C:/Users/User/sabrina/sabrina_T01-1_0625.wav' y, sr = librosa.load(path, sr=16000, mono=True) D = np.abs(librosa.stft(y)) librosa.display.specshow(librosa.amplitude_to_db(D,ref=np.max),x_axis='time',y_axis='hz',sr=sr) #sr=sr不然預設會是22050 plt.tight_layout() plt.savefig('C:/Users/User/pic/1.png') #儲存圖片 plt.show() ``` ## MFCC ```python= import librosa import librosa.display import matplotlib.pyplot as plt path='C:/Users/User/sabrina/sabrina_T01-1_0625.wav' y, sr = librosa.load(path,sr=16000,mono=True) mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40) librosa.display.specshow(mfccs,y_axis='hz', x_axis='time',sr=sr) plt.colorbar() plt.title('MFCC') plt.tight_layout() ``` <br> ## mix_sound ```python= import librosa as lr import numpy as np import os #--------整理noise data------------------ noise_list = [] for file in os.listdir('/Users/rick/Desktop/noise_16000'): if '.wav' in file: noise_list.append(os.path.join('/Users/rick/Desktop/noise_16000',file)) #---------------------------------------- #---------整理clean data------------------ filename_list = [] for filename in os.listdir('/Users/rick/python_test/noise'): if '.' not in filename: filename_list.append(filename) file_list = [] for filename in filename_list: for file in os.listdir(os.path.join('/Users/rick/python_test/noise',filename)): if '.wav' in file: file_list.append(os.path.join('/Users/rick/python_test/noise',filename,file)) file_list.sort() #------------------------------------------ #-----------隨機混音開始--------------------- if not os.path.isdir('mixed_pos_snr'): os.mkdir('mixed_pos_snr') for file in file_list: for noise_index in range(len(noise_list)): snr = np.random.randint(0,16) #隨機取 snr的值 # noise_index = np.random.randint(0,len(noise_list)) cleanwav = file noisewav = noise_list[noise_index] noise,sr = lr.load(noisewav,sr=16000) # sr設置為None可取原檔案中的sample rate clean,Sr = lr.load(cleanwav,sr=16000) cleanenergy = np.sum(np.power(clean,2)) #np.power(a,2) = np.square(a) # 隨機索引與clean長度相同的noise信號 ind = np.random.randint(1, len(noise) - len(clean) + 1) noiselen=noise[ind:len(clean) + ind] # 噪声语音能量噪音的功率 noiseenergy = np.sum(np.power(noiselen,2)) # 噪声等级系数 noiseratio = np.sqrt((cleanenergy / noiseenergy) / (np.power(10, snr * 0.1))) # 隨機索引與clean長度相同的noise信號 noisyAudio = clean + noise[ind:len(clean)+ind] * noiseratio # write wav noisywavname='./mixed_pos_snr/'+cleanwav.split('/')[-1].rstrip('.wav')+'_'+noisewav.split('/')[-1].rstrip('.wav')+"_snr"+str(snr)+".wav" # sf.write(noisywavname, noisyAudio, 16000) lr.output.write_wav(noisywavname,noisyAudio,16000) ``` ## wav 與頻譜圖相互轉換 #### wav to spectrogram ```python= import scipy.io.wavfile rate,audData=scipy.io.wavfile.read('sabrina_T01-1_0622.wav') #---------------------------------------- #------------以下為可調整參數--------------- # Create FFT import numpy as np import math from PIL import Image import time FFT_LENGTH = 512 # 影響輸出圖片的高度 WINDOW_LENGTH = 256 # 影響輸出圖片的寬度 WINDOW_STEP = int(WINDOW_LENGTH / 2) magnitudeMin = float("inf") magnitudeMax = float("-inf") phaseMin = float("inf") phaseMax = float("-inf") #----------------------------------------- #-----------定義許多 function-------------- def amplifyMagnitudeByLog(d): return 188.301 * math.log10(d + 1) def weakenAmplifiedMagnitude(d): return math.pow(10, d/188.301)-1 def generateLinearScale(magnitudePixels, phasePixels, magnitudeMin, magnitudeMax, phaseMin, phaseMax): height = magnitudePixels.shape[0] width = magnitudePixels.shape[1] magnitudeRange = magnitudeMax - magnitudeMin phaseRange = phaseMax - phaseMin rgbArray = np.zeros((height, width, 3), 'uint8') for w in range(width): for h in range(height): magnitudePixels[h,w] = (magnitudePixels[h,w] - magnitudeMin) / (magnitudeRange) * 255 * 2 magnitudePixels[h,w] = amplifyMagnitudeByLog(magnitudePixels[h,w]) phasePixels[h,w] = (phasePixels[h,w] - phaseMin) / (phaseRange) * 255 red = 255 if magnitudePixels[h,w] > 255 else magnitudePixels[h,w] green = (magnitudePixels[h,w] - 255) if magnitudePixels[h,w] > 255 else 0 blue = phasePixels[h,w] rgbArray[h,w,0] = int(red) rgbArray[h,w,1] = int(green) rgbArray[h,w,2] = int(blue) return rgbArray def recoverLinearScale(rgbArray, magnitudeMin, magnitudeMax, phaseMin, phaseMax): width = rgbArray.shape[1] height = rgbArray.shape[0] magnitudeVals = rgbArray[:,:,0].astype(float) + rgbArray[:,:,1].astype(float) phaseVals = rgbArray[:,:,2].astype(float) phaseRange = phaseMax - phaseMin magnitudeRange = magnitudeMax - magnitudeMin for w in range(width): for h in range(height): phaseVals[h,w] = (phaseVals[h,w] / 255 * phaseRange) + phaseMin magnitudeVals[h,w] = weakenAmplifiedMagnitude(magnitudeVals[h,w]) magnitudeVals[h,w] = (magnitudeVals[h,w] / (255*2) * magnitudeRange) + magnitudeMin return magnitudeVals, phaseVals def generateSpectrogramForWave(signal): start_time = time.time() global magnitudeMin global magnitudeMax global phaseMin global phaseMax buffer = np.zeros(int(signal.size + WINDOW_STEP - (signal.size % WINDOW_STEP))) buffer[0:len(signal)] = signal height = int(FFT_LENGTH / 2 + 1) width = int(len(buffer) / (WINDOW_STEP) - 1) magnitudePixels = np.zeros((height, width)) phasePixels = np.zeros((height, width)) for w in range(width): buff = np.zeros(FFT_LENGTH) stepBuff = buffer[w*WINDOW_STEP:w*WINDOW_STEP + WINDOW_LENGTH] # apply hanning window stepBuff = stepBuff * np.hanning(WINDOW_LENGTH) buff[0:len(stepBuff)] = stepBuff #buff now contains windowed signal with step length and padded with zeroes to the end fft = np.fft.rfft(buff) for h in range(len(fft)): #print(buff.shape) #return magnitude = math.sqrt(fft[h].real**2 + fft[h].imag**2) if magnitude > magnitudeMax: magnitudeMax = magnitude if magnitude < magnitudeMin: magnitudeMin = magnitude phase = math.atan2(fft[h].imag, fft[h].real) if phase > phaseMax: phaseMax = phase if phase < phaseMin: phaseMin = phase magnitudePixels[height-h-1,w] = magnitude phasePixels[height-h-1,w] = phase rgbArray = generateLinearScale(magnitudePixels, phasePixels, magnitudeMin, magnitudeMax, phaseMin, phaseMax) elapsed_time = time.time() - start_time print('%.2f' % elapsed_time, 's', sep='') img = Image.fromarray(rgbArray, 'RGB') return img #----------------------------------------- #-----------wav轉為頻譜圖並存檔-------------- img = generateSpectrogramForWave(audData) img.save("../../denoised/spectrogram2.png","PNG") ``` #### spectrogram return to wav ```python= from PIL import Image import scipy.io.wavfile def recoverSignalFromSpectrogram(filePath): img = Image.open(filePath) data = np.array( img, dtype='uint8' ) width = data.shape[1] height = data.shape[0] magnitudeVals, phaseVals \ = recoverLinearScale(data, magnitudeMin, magnitudeMax, phaseMin, phaseMax) recovered = np.zeros(WINDOW_LENGTH * width // 2 + WINDOW_STEP, dtype=np.int16) for w in range(width): toInverse = np.zeros(height, dtype=np.complex_) for h in range(height): magnitude = magnitudeVals[height-h-1,w] phase = phaseVals[height-h-1,w] toInverse[h] = magnitude * math.cos(phase) + (1j * magnitude * math.sin(phase)) signal = np.fft.irfft(toInverse) recovered[w*WINDOW_STEP:w*WINDOW_STEP + WINDOW_LENGTH] += signal[:WINDOW_LENGTH].astype(np.int16) scipy.io.wavfile.write("../../denoised/recovered2.wav", 16000, recovered) recoverSignalFromSpectrogram("../../denoised/spectrogram2.png") ``` ##### 分批畫圖 ```python= for ind , i in enumerate(mixed_file[:5]): plt.clf() data , sr = librosa.load(i,sr=16000) data = data/abs(data).max() plt.plot(data) plt.savefig(f'./pic/{ind}.png') ```