自然語言期末實驗

# 自然語言期末實驗 ## 實驗題目 ![截圖 2023-12-21 13.30.49](https://hackmd.io/_uploads/SJH9aH-Dp.png) ## 實驗要求 ![截圖 2023-12-21 13.31.15](https://hackmd.io/_uploads/B10iaSbwp.png) ![截圖 2023-12-21 13.31.23](https://hackmd.io/_uploads/Hkv3pB-DT.png) ![截圖 2023-12-21 13.31.37](https://hackmd.io/_uploads/SJB6pHZv6.png) ![截圖 2023-12-21 13.31.46](https://hackmd.io/_uploads/rJTaaBbPa.png) ![截圖 2023-12-21 13.31.55](https://hackmd.io/_uploads/Sk80pHWwp.png) ![截圖 2023-12-21 13.32.03](https://hackmd.io/_uploads/Byly0rZv6.png) ## 實驗代碼解釋 ### 將json文件轉換成txt文件 ```python3= import json def data_synthesis(source, target, new): src_obj = open(source, 'r') tar_obj = open(target, 'r') synthesis_obj = open(new, 'w') source_lines = [] target_lines = [] for src_line in src_obj.readlines(): source_lines.append(src_line) for tar_line in tar_obj.readlines(): target_lines.append(tar_line) for i in range(0, len(source_lines)): source_data = json.loads(source_lines[i]) target_data = json.loads(target_lines[i]) synthesis_obj.write(source_data['text'] + "\t" + target_data['text'] + '\n') src_obj.close() tar_obj.close() synthesis_obj.close() data_synthesis("data/train/src.jsonl", "data/train/target.jsonl", "data/train.txt") data_synthesis("data/test/src.jsonl", "data/test/target.jsonl", "data/test.txt") data_synthesis("data/validation/src.jsonl", "data/validation/target.jsonl", "data/validation.txt") ``` ### 導入庫 ```python3= import os import sys import math from collections import Counter import numpy as np import random import torch import torch.nn as nn import torch.nn.functional as F import nltk import jieba # nltk.download('punkt') # 下載 punkt ``` ### 1. 数据预处理 #### 1.1 读入中英文数据 * 英文使用nltk的 word_tokenizer 来分词，并且使用小写字母 * 中文直接使用单个汉字作为基本单元 ```python3= def load_data(in_file): en = [] cn = [] num_examples = 0 with open(in_file, 'r', encoding='utf-8') as lines: for line in lines: line = line.strip().split('\t') en.append(['BOS'] + nltk.word_tokenize(line[0].lower()) + ['EOS']) cn.append(['BOS'] + jieba.lcut(line[1]) + ['EOS']) return en, cn train_file = 'data/train.txt' dev_file = 'data/test.txt' train_en, train_cn = load_data(train_file) dev_en, dev_cn = load_data(dev_file) ``` ### 查看返回的数据内容： ```python3= print(dev_en[:2]) print(dev_cn[:2]) ``` ![截圖 2023-12-21 14.05.28](https://hackmd.io/_uploads/Sk7nSIbvT.png) ### 1.2 构建单词表 ```python3= # 构建词典:利用分词后的结果构建统计词典，可以过滤掉出现频次较低的词语防止词典规模过大 UNK_IDX = 0 PAD_IDX = 1 def build_dict(sentences, max_words=50000): word_count = Counter() for sentence in sentences: for word in sentence: word_count[word] += 1 ls = word_count.most_common(max_words) # 词频前max_words个单词(降序) total_words = len(ls) + 2 word_dict = {w[0] : index + 2 for index, w in enumerate(ls)} # {单词:索引}, w[0]:单词, w[1]:词频 word_dict['UNK'] = UNK_IDX word_dict["PAD"] = PAD_IDX return word_dict, total_words en_dict, en_total_words = build_dict(train_en) cn_dict, cn_total_words = build_dict(train_cn) inv_en_dict = {v:k for k, v in en_dict.items()} inv_cn_dict = {v:k for k, v in cn_dict.items()} ``` ### 1.3 把单词全部转变成数字 ```python3= def encode(en_sentences, cn_sentences, en_dict, cn_dict, sort_by_len=True): length = len(en_sentences) out_en_sentences = [[en_dict.get(w, 0) for w in sent] for sent in en_sentences] out_cn_sentences = [[cn_dict.get(w, 0) for w in sent] for sent in cn_sentences] # sort sentences by word def len_argsort(seq): return sorted(range(len(seq)), key=lambda x: len(seq[x])) # 把中文和英文按照同样的顺序排序 if sort_by_len: sorted_index = len_argsort(out_en_sentences) out_en_sentences = [out_en_sentences[i] for i in sorted_index] out_cn_sentences = [out_cn_sentences[i] for i in sorted_index] return out_en_sentences, out_cn_sentences train_en, train_cn = encode(train_en, train_cn, en_dict, cn_dict) dev_en, dev_cn = encode(dev_en, dev_cn, en_dict, cn_dict) # [[2, 168, 201, 4, 3], [], ...., [2, 5, 14, 13, 22, 9, 149, 17, 107, 24, 121, 16, 20, 267, 7, 181, 23, 15, 6, 422, 25, 220, 4, 3]] ``` ### 查看返回的数据内容： ```python3= print(train_cn[2]) print([inv_cn_dict[i] for i in train_cn[2]]) print([inv_en_dict[i] for i in train_en[2]]) ``` ![截圖 2023-12-21 14.09.04](https://hackmd.io/_uploads/HyTYI8-Da.png) ![截圖 2023-12-21 14.09.22](https://hackmd.io/_uploads/S1fi88bPp.png) ### 1.4 把全部句子分成batch ```python3= def get_minibatches(n, minibatch_size, shuffle=True): # n是传进来的句子数 idx_list = np.arange(0, n, minibatch_size) # [0, 1, ..., n-1] 按minibatch_size大小分割 if shuffle: np.random.shuffle(idx_list) minibatches = [] for idx in idx_list: minibatches.append(np.arange(idx, min(idx + minibatch_size, n))) return minibatches ``` ### 查看上面函数的功能： ```python3= get_minibatches(100, 15) ``` ![截圖 2023-12-21 14.11.29](https://hackmd.io/_uploads/BJJQw8-vT.png) ```python3= def prepare_data(seqs): lengths = [len(seq) for seq in seqs] n_samples = len(seqs) # n_samples个句子 max_len = np.max(lengths) # batch_size个句子中最长句子长度 x = np.zeros((n_samples, max_len)).astype('int32') x_lengths = np.array(lengths).astype('int32') # batch中原始句子长度 for idx, seq in enumerate(seqs): x[idx, :lengths[idx]] = seq # lengths[idx]: 每个句子的索引, 长度不够补0 return x, x_lengths def gen_examples(en_sentences, cn_sentences, batch_size): minibatches = get_minibatches(len(en_sentences), batch_size) all_ex = [] for minibatch in minibatches: mb_en_sentences = [en_sentences[t] for t in minibatch] # 一个batch中每个句子的对应编码,[[[2, 982, 8], [14,5,6],...] mb_cn_sentences = [cn_sentences[t] for t in minibatch] mb_x, mb_x_len = prepare_data(mb_en_sentences) # 一个batch中每个句子的对应编码，长度不够补0; 一个batch中每个句子长度 mb_y, mb_y_len = prepare_data(mb_cn_sentences) all_ex.append((mb_x, mb_x_len, mb_y, mb_y_len)) # 返回内容依次是 n / batch_size 个 (batch个句子编码，batch个英文句子长度，batch个中文句子编码，batch个中文句子长度) return all_ex batch_size = 64 train_data = gen_examples(train_en, train_cn, batch_size) dev_data = gen_examples(dev_en, dev_cn, batch_size) ``` ## 2. 定义计算损失的函数 ```python3= # masked cross entropy loss class LanguageModelCriterion(nn.Module): def __init__(self): super(LanguageModelCriterion, self).__init__() def forward(self, input, target, mask): # input: [64, 12, 3195] target: [64, 12] mask: [64, 12] # input: (batch_size * seq_len) * vocab_size input = input.contiguous().view(-1, input.size(2)) # target: batch_size * seq_len target = target.contiguous().view(-1, 1) mask = mask.contiguous().view(-1, 1) output = -input.gather(1, target) * mask # 将input在1维，把target当索引进行取值 #这里算得就是交叉熵损失，前面已经算了F.log_softmax #output.shape=torch.Size([768, 1]) #因为input.gather时，target为0的地方不是零了，mask作用是把padding为0的地方重置为零， #因为在volab里0代表的也是一个单词，但是我们这里target尾部的0代表的不是单词 output = torch.sum(output) / torch.sum(mask) # 均值损失，output前已经加了负号，所以这里还是最小化 return output ``` ## 3. 评估模型 ```python3= def evaluate(model, data): model.eval() total_num_words = total_loss = 0. with torch.no_grad(): for it, (mb_x, mb_x_len, mb_y, mb_y_len) in enumerate(data): mb_x = torch.from_numpy(mb_x).to(device).long() mb_x_len = torch.from_numpy(mb_x_len).to(device).long() mb_input = torch.from_numpy(mb_y[:, :-1]).to(device).long() mb_output = torch.from_numpy(mb_y[:, 1:]).to(device).long() mb_y_len = torch.from_numpy(mb_y_len-1).to(device).long() mb_y_len[mb_y_len<=0] = 1 mb_pred, attn = model(mb_x, mb_x_len, mb_input, mb_y_len) mb_out_mask = torch.arange(mb_y_len.max().item(), device=device)[None, :] < mb_y_len[:, None] mb_out_mask = mb_out_mask.float() loss = loss_fn(mb_pred, mb_output, mb_out_mask) num_words = torch.sum(mb_y_len).item() total_loss += loss.item() * num_words total_num_words += num_words print("Evaluation loss", total_loss / total_num_words) ``` ## 4. Encoder Decoder模型（含Attention版本） ### 4.1 Encoder > Encoder模型的任务是把输入文字传入embedding层和GRU层，转换成一些hidden states作为后续的context vectors； ```python3= class Encoder(nn.Module): def __init__(self, vocab_size, embed_size, enc_hidden_size, dec_hidden_size, dropout=0.2): super(Encoder, self).__init__() self.embed = nn.Embedding(vocab_size, embed_size) self.rnn = nn.GRU(embed_size, enc_hidden_size, batch_first=True, bidirectional=True) self.dropout = nn.Dropout(dropout) self.fc = nn.Linear(enc_hidden_size * 2, dec_hidden_size) def forward(self, x, lengths): sorted_len, sorted_idx = lengths.sort(0, descending=True) x_sorted = x[sorted_idx.long()] embedded = self.dropout(self.embed(x_sorted)) packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, sorted_len.long().cpu().data.numpy(), batch_first=True) packed_out, hid = self.rnn(packed_embedded) out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True) _, original_idx = sorted_idx.sort(0, descending=False) out = out[original_idx.long()].contiguous() hid = hid[:, original_idx.long()].contiguous() # hid: [2, batch_size, enc_hidden_size] hid = torch.cat([hid[-2], hid[-1]], dim=1) # 将最后一层的hid的双向拼接 # hid: [batch_size, 2*enc_hidden_size] hid = torch.tanh(self.fc(hid)).unsqueeze(0) # hid: [1, batch_size, dec_hidden_size] # out: [batch_size, seq_len, 2*enc_hidden_size] return out, hid ``` ### 4.2 Luong Attention ![Luong_Attention](https://hackmd.io/_uploads/BJ1CPIZwp.png) ![Luong_Attention參數說明](https://hackmd.io/_uploads/S1bAvUbwp.png) ```python3= class Attention(nn.Module): def __init__(self, enc_hidden_size, dec_hidden_size): # enc_hidden_size跟Encoder的一样 super(Attention, self).__init__() self.enc_hidden_size = enc_hidden_size self.dec_hidden_size = dec_hidden_size self.linear_in = nn.Linear(enc_hidden_size*2, dec_hidden_size, bias=False) self.linear_out = nn.Linear(enc_hidden_size*2 + dec_hidden_size, dec_hidden_size) def forward(self, output, context, mask): # mask = batch_size, output_len, context_len # mask在Decoder中创建好了 # output: batch_size, output_len, dec_hidden_size，就是Decoder的output # context: batch_size, context_len, 2*enc_hidden_size，就是Encoder的output # 这里Encoder网络是双向的，Decoder是单向的 batch_size = output.size(0) output_len = output.size(1) input_len = context.size(1) # input_len = context_len # 通过decoder的hidden states加上encoder的hidden states来计算一个分数，用于计算权重 # batch_size, context_len, dec_hidden_size # 第一步，公式里的Wa先与hs做点乘，把Encoder output的enc_hidden_size换成dec_hidden_size。 # Q: W·context context_in = self.linear_in(context.view(batch_size*input_len, -1)).view( batch_size, input_len, -1) # Q·K # context_in.transpose(1,2): batch_size, dec_hidden_size, context_len # output: batch_size, output_len, dec_hidden_size attn = torch.bmm(output, context_in.transpose(1,2)) # batch_size, output_len, context_len # 第二步，ht与上一步结果点乘，得到score attn.data.masked_fill(mask.bool(), -1e6) # .masked_fill作用请看这个链接：https://blog.csdn.net/candy134834/article/details/84594754 # mask的维度必须和attn维度相同，mask为1的位置对应attn的位置的值替换成-1e6， # mask为1的意义需要看Decoder函数里面的定义 attn = F.softmax(attn, dim=2) # batch_size, output_len, context_len # 这个dim=2到底是怎么softmax的看下下面单元格例子 # 第三步，计算每一个encoder的hidden states对应的权重。 # context: batch_size, context_len, 2*enc_hidden_size， context = torch.bmm(attn, context) # batch_size, output_len, 2*enc_hidden_size # 第四步，得出context vector是一个对于encoder输出的hidden states的一个加权平均 # output: batch_size, output_len, dec_hidden_size output = torch.cat((context, output), dim=2) # output：batch_size, output_len, 2*enc_hidden_size+dec_hidden_size # 第五步，将context vector和 decoder的hidden states 串起来。 output = output.view(batch_size*output_len, -1) # output.shape = (batch_size*output_len, 2*enc_hidden_size+dec_hidden_size) output = torch.tanh(self.linear_out(output)) # output.shape=(batch_size*output_len, dec_hidden_size) output = output.view(batch_size, output_len, -1) # output.shape=(batch_size, output_len, dec_hidden_size) # attn.shape = batch_size, output_len, context_len return output, attn ``` ### 4.3 Decoder > Decoder会根据已经翻译的句子内容和context vectors，来决定下一个输出的单词； ```python3= class Decoder(nn.Module): def __init__(self, vocab_size, embed_size, enc_hidden_size, dec_hidden_size, dropout=0.2): super(Decoder, self).__init__() self.embed = nn.Embedding(vocab_size, embed_size) self.attention = Attention(enc_hidden_size, dec_hidden_size) self.rnn = nn.GRU(embed_size, hidden_size, batch_first=True) self.out = nn.Linear(dec_hidden_size, vocab_size) self.dropout = nn.Dropout(dropout) def create_mask(self, x_len, y_len): # x_len 是一个batch中文句子的长度列表 # y_len 是一个batch英文句子的长度列表 # a mask of shape x_len * y_len device = x_len.device max_x_len = x_len.max() max_y_len = y_len.max() x_mask = torch.arange(max_x_len, device=device)[None, :] < x_len[:, None] # print(x_mask.shape) = (batch_size, output_len) # 中文句子的mask y_mask = torch.arange(max_y_len, device=device)[None, :] < y_len[:, None] # print(y_mask.shape) = (batch_size, context_len) # 英文句子的mask mask = ( ~ x_mask[:, :, None] * y_mask[:, None, :]).byte() # mask = (1 - x_mask[:, :, None] * y_mask[:, None, :]).byte() # 1-说明取反 # x_mask[:, :, None] = (batch_size, output_len, 1) # y_mask[:, None, :] = (batch_size, 1, context_len) # print(mask.shape) = (batch_size, output_len, context_len) # 注意这个例子的*相乘不是torch.bmm矩阵点乘，只是用到了广播机制而已。 return mask def forward(self, encoder_out, x_lengths, y, y_lengths, hid): sorted_len, sorted_idx = y_lengths.sort(0, descending=True) y_sorted = y[sorted_idx.long()] hid = hid[:, sorted_idx.long()] y_sorted = self.dropout(self.embed(y_sorted)) # batch_size, output_length, embed_size packed_seq = nn.utils.rnn.pack_padded_sequence(y_sorted, sorted_len.long().cpu().data.numpy(), batch_first=True) out, hid = self.rnn(packed_seq, hid) unpacked, _ = nn.utils.rnn.pad_packed_sequence(out, batch_first=True) _, original_idx = sorted_idx.sort(0, descending=False) output_seq = unpacked[original_idx.long()].contiguous() hid = hid[:, original_idx.long()].contiguous() mask = self.create_mask(y_lengths, x_lengths) # 这里真是坑，第一个参数位置是中文句子的长度列表 output, attn = self.attention(output_seq, encoder_out, mask) # output.shape=(batch_size, output_len, dec_hidden_size) # attn.shape = batch_size, output_len, context_len # self.out = nn.Linear(dec_hidden_size, vocab_size) output = F.log_softmax(self.out(output), -1) # 计算最后的输出概率 # output =(batch_size, output_len, vocab_size) # 最后一个vocab_size维度 log_softmax # hid.shape = (1, batch_size, dec_hidden_size) return output, hid, attn ``` ### 4.4 Seq2Seq > 最后我们构建Seq2Seq模型把encoder, attention, decoder串到一起 ```python3= class Seq2Seq(nn.Module): def __init__(self, encoder, decoder): super(Seq2Seq, self).__init__() self.encoder = encoder self.decoder = decoder def forward(self, x, x_lengths, y, y_lengths): encoder_out, hid = self.encoder(x, x_lengths) # print(hid.shape)=torch.Size([1, batch_size, dec_hidden_size]) # print(out.shape)=torch.Size([batch_size, seq_len, 2*enc_hidden_size]) output, hid, attn = self.decoder(encoder_out=encoder_out, x_lengths=x_lengths, y=y, y_lengths=y_lengths, hid=hid) # output =(batch_size, output_len, vocab_size) # hid.shape = (1, batch_size, dec_hidden_size) # attn.shape = (batch_size, output_len, context_len) return output, attn def translate(self, x, x_lengths, y, max_length=100): encoder_out, hid = self.encoder(x, x_lengths) preds = [] batch_size = x.shape[0] attns = [] for i in range(max_length): output, hid, attn = self.decoder(encoder_out, x_lengths, y, torch.ones(batch_size).long().to(y.device), hid) y = output.max(2)[1].view(batch_size, 1) preds.append(y) attns.append(attn) return torch.cat(preds, 1), torch.cat(attns, 1) ``` ## 5.1 训练模型 ```python3= def train(model, data, num_epochs=20): for epoch in range(num_epochs): model.train() # 训练模式 total_num_words = total_loss = 0. for it, (mb_x, mb_x_len, mb_y, mb_y_len) in enumerate(data): mb_x = torch.from_numpy(mb_x).to(device).long() mb_x_len = torch.from_numpy(mb_x_len).to(device).long() mb_input = torch.from_numpy(mb_y[:, :-1]).to(device).long() # EOS之前 mb_output = torch.from_numpy(mb_y[:, 1:]).to(device).long() # BOS之后 mb_y_len = torch.from_numpy(mb_y_len - 1).to(device).long() mb_y_len[mb_y_len <= 0] = 1 mb_pred, attn = model(mb_x, mb_x_len, mb_input, mb_y_len) # [mb_y_len.max()]->[1, mb_y_len.max()] mb_out_mask = torch.arange(mb_y_len.max().item(), device=device)[None, :] < mb_y_len[:, None] mb_out_mask = mb_out_mask.float() # (pre, target, mask) # mb_output是句子单词的索引 loss = loss_fn(mb_pred, mb_output, mb_out_mask) num_words = torch.sum(mb_y_len).item() total_loss += loss.item() * num_words total_num_words += num_words # 更新模型 optimizer.zero_grad() loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), 5.) optimizer.step() if it % 100 == 0: print("Epoch: ", epoch, 'iteration', it, 'loss:', loss.item()) print("Epoch", epoch, "Training loss", total_loss / total_num_words) if epoch % 5 == 0: evaluate(model, dev_data) torch.save(model.state_dict(), 'translate_model2.pt') ``` ### 5.2 训练函数并调用上面的train函数 ```python3= device = 'cpu' dropout = 0.2 embed_size = hidden_size = 100 encoder = Encoder(vocab_size=en_total_words, embed_size=embed_size, enc_hidden_size=hidden_size, dec_hidden_size=hidden_size, dropout=dropout) decoder = Decoder(vocab_size=cn_total_words, embed_size=embed_size, enc_hidden_size=hidden_size, dec_hidden_size=hidden_size, dropout=dropout) model = Seq2Seq(encoder, decoder) model = model.to(device) loss_fn = LanguageModelCriterion().to(device) optimizer = torch.optim.Adam(model.parameters()) train(model, train_data, num_epochs=100) ``` ![截圖 2023-12-21 14.16.56](https://hackmd.io/_uploads/HyIwOUbwT.png) ![截圖 2023-12-21 14.17.06](https://hackmd.io/_uploads/H1CPuU-DT.png) ## 6. 翻譯 ```python3= def translate_dev(i): en_sent = " ".join([inv_en_dict[w] for w in dev_en[i]]) #原来的英文 print(en_sent) cn_sent = "".join([inv_cn_dict[w] for w in dev_cn[i]]) #原来的中文 print("".join(cn_sent)) # 一条句子 mb_x = torch.from_numpy(np.array(dev_en[i]).reshape(1, -1)).long().to(device) mb_x_len = torch.from_numpy(np.array([len(dev_en[i])])).long().to(device) bos = torch.Tensor([[cn_dict["BOS"]]]).long().to(device) # shape:[1,1], [[2]] # y_lengths: [[2]], 一个句子 translation, attn = model.translate(mb_x, mb_x_len, bos) # [1, 10] # 映射成中文 translation = [inv_cn_dict[i] for i in translation.data.cpu().numpy().reshape(-1)] trans = [] for word in translation: if word != "EOS": trans.append(word) else: break result = "".join(trans) print(result) #翻译后的中文 return cn_sent, result ``` ```python3= from nltk.translate.bleu_score import sentence_bleu def BLEU_evaluation(cn_sent, result): reference = cn_sent candidate = result score = sentence_bleu(reference, candidate, weights=(0, 0, 0, 1)) print('Individual 4-gram: %f' % score) ``` ```python3= model.load_state_dict(torch.load('translate_model2.pt', map_location=device)) for i in range(100, 120): (cn_sent, result) = translate_dev(i) score = BLEU_evaluation(cn_sent, result) ``` ### 6.2 結果輸出 ![截圖 2023-12-21 14.18.30](https://hackmd.io/_uploads/Hy7TuIZwp.png)