# 機器學習-除噪
## 需要用到的模組
```python=
import os
import numpy as np
from scipy.io import wavfile
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn import metrics
import lightgbm
import time
import pickle
```
## 定義function取得音檔路徑
```python=
def get_filepaths(directory, extension='.wav'):
    """Return the sorted full paths of matching files under *directory*.

    The tree rooted at ``directory`` is walked recursively with ``os.walk``
    and every file whose name ends with ``extension`` is collected.

    Args:
        directory: Root directory to search (sub-directories are included).
        extension: File-name suffix to keep; the match is case-sensitive.
            Defaults to ``'.wav'``.

    Returns:
        A list of paths built with ``os.path.join(root, filename)``, sorted
        in ascending order. The list is empty when nothing matches or the
        directory does not exist.
    """
    file_paths = [
        os.path.join(root, filename)
        for root, _dirs, files in os.walk(directory)
        for filename in files
        if filename.endswith(extension)
    ]
    # Sort so the result does not depend on the file system's listing order;
    # callers pair two such lists by position.
    file_paths.sort()
    return file_paths
```
```python=
# Directories holding the noisy mixtures (model input) and the matching
# clean recordings (model target).
MIXED_DIR = 'mixed_pos_snr/'
CLEAN_DIR = 'clean/'

# Both lists come back sorted, so entries are paired by position.
mix_list = get_filepaths(MIXED_DIR)
clean_list = get_filepaths(CLEAN_DIR)
```
## 定義function讀取音檔
```python=
def _pcm_to_float(wave):
    """Scale integer PCM samples to floats in [-1, 1); pass floats through."""
    dtype = wave.dtype
    if not np.issubdtype(dtype, np.integer):
        # Floating-point WAV data is already normalised; do not rescale it.
        return wave
    if dtype == np.uint8:
        # 8-bit WAV is unsigned with the zero level at 128.
        return (wave.astype(np.float64) - 128.0) / 128.0
    # Signed PCM: divide by the magnitude of the most negative value
    # (2**15 for int16, 2**31 for int32).
    return wave / float(2 ** (8 * dtype.itemsize - 1))


def load_files(lst):
    """Load WAV files as float arrays truncated to a common length.

    Every file in ``lst`` is read once. Integer PCM data is scaled to
    floating point according to its bit depth, floating-point data is kept
    unchanged, and each waveform is cut to the length of the shortest file
    so the result can be stacked into a single 2-D array.

    Args:
        lst: Iterable of paths to WAV files.

    Returns:
        A list with one NumPy array per input file, all with the same
        number of samples. Empty when ``lst`` is empty.
    """
    # Read each file a single time; the sample rate is not needed here.
    waves = [wavfile.read(path)[1] for path in lst]
    min_length = min((len(wave) for wave in waves), default=0)
    # Truncate before scaling so only the kept samples are converted.
    return [_pcm_to_float(wave[:min_length]) for wave in waves]
```
```python=
train_data = load_files(mix_list)
train_label = load_files(clean_list)
```
## 切分train-test data
```python=
# train_data / train_label are plain Python lists; convert them to NumPy
# arrays so they can be split and sliced by column later on.
X, Y = np.array(train_data), np.array(train_label)

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=1212,
)
```
## 開始train model
```python=
# One LightGBM regressor per output column, wrapped by MultiOutputRegressor.
# The hyper-parameters are only a starting point; feel free to experiment.
lgbm = MultiOutputRegressor(
    lightgbm.LGBMRegressor(n_jobs=8, n_estimators=64, max_depth=5)
)

# Make sure the checkpoint directory exists before the first save.
os.makedirs('./sav/', exist_ok=True)

for n in range(100, 4101, 100):  # rows n-100 .. n-1 of the training set
    for i in range(36):  # 36 column slices of 1000 samples each
        lo, hi = i * 1000, i * 1000 + 1000
        start_time = time.time()
        # NOTE(review): scikit-learn's fit() refits from scratch on every
        # call, so after this loop `lgbm` only reflects the last slice of the
        # last batch. Training one model per slice (or using an incremental
        # learner) would be needed to keep earlier fits - confirm intent.
        # NOTE(review): recent LightGBM releases no longer accept `verbose`
        # as a fit() keyword - confirm against the installed version.
        lgbm.fit(X_train[n - 100:n, lo:hi], Y_train[n - 100:n, lo:hi], verbose=True)
        end_time = time.time()
        print(f'[{i+1}/36] {round(end_time-start_time,2)}')
    # Save a checkpoint after every batch of 100 rows.
    filename = './sav/' + str(n) + '_model.sav'
    # Use a context manager so the file handle is closed (and flushed)
    # right after the dump instead of being left open.
    with open(filename, 'wb') as fh:
        pickle.dump(lgbm, fh)
```
```python=
#lgbm = pickle.load(open(filename, 'rb'))
#可用這個讀取model
```
## 開始predict結果並存成音檔
```python=
# Training used 1000-feature slices, so prediction is also done slice by
# slice and the pieces are joined afterwards.
# NOTE(review): the same `lgbm` object predicts every slice - confirm this is
# intended, since it was last fitted on a single slice.
results = []
for offset in range(0, 36 * 1000, 1000):
    segment = X_test[:1, offset:offset + 1000]  # first test row only
    results.append(lgbm.predict(segment))

# Join the 36 predicted pieces side by side and take the single row as the
# output waveform, written at a 16000 Hz sample rate.
waveform = np.hstack(results)[0]
wavfile.write('result.wav', 16000, waveform)
```