2021-03-16
===

###### tags: `Chatbot`

- POS preprocessing with a morphological analyzer (keep only Noun/Verb/Adjective tokens from the question and answer sentences)

```python=
# okt is a KoNLPy Okt morphological analyzer; query_sentences and answer_sentences
# are assumed to hold the question and answer sentences of the chatbot data.
from konlpy.tag import Okt
okt = Okt()

query_NVA_token_sentences = list()
answer_NVA_token_sentences = list()

for s in query_sentences:
    for token, tag in okt.pos(s.replace(' ', '')):
        if tag == 'Noun' or tag == 'Verb' or tag == 'Adjective':
            query_NVA_token_sentences.append(token)

for s in answer_sentences:
    for token, tag in okt.pos(s.replace(' ', '')):
        if tag == 'Noun' or tag == 'Verb' or tag == 'Adjective':
            answer_NVA_token_sentences.append(token)

query_NVA_token_sentences = ' '.join(query_NVA_token_sentences)
answer_NVA_token_sentences = ' '.join(answer_NVA_token_sentences)
```

- How to build the char2idx and idx2char dictionaries

```python=
def make_vocabulary(vocabulary_list):
    # dictionary whose keys are words and whose values are indices
    char2idx = {char: idx for idx, char in enumerate(vocabulary_list)}
    # dictionary whose keys are indices and whose values are words
    idx2char = {idx: char for idx, char in enumerate(vocabulary_list)}
    return char2idx, idx2char
```

- encoder processing

```python=
def enc_processing(value, dictionary, tokenize_as_morph=False):
    # CHANGE_FILTER, MAX_SEQUENCE and the special tokens PAD / UNK are module-level
    # constants; re and numpy (np) are imported at the top of the file.
    sequences_input_index = []
    if tokenize_as_morph:
        value = prepro_like_morphlized(value)
    # process one sentence at a time
    for sequence in value:
        # FILTERS = "([~.,!?\"':;)(])"
        sequence = re.sub(CHANGE_FILTER, "", sequence)
        sequence_index = []
        # split the sentence on whitespace
        for word in sequence.split():
            # if the word exists in the dictionary, append its index to sequence_index
            if dictionary.get(word) is not None:
                sequence_index.extend([dictionary[word]])
            # otherwise the word is out of vocabulary, so append UNK (2)
            else:
                sequence_index.extend([dictionary[UNK]])
        # if the sentence exceeds the maximum length, truncate the trailing tokens
        if len(sequence_index) > MAX_SEQUENCE:
            sequence_index = sequence_index[:MAX_SEQUENCE]
        # if the sentence is shorter than MAX_SEQUENCE, fill the remainder with PAD (0)
        sequence_index += (MAX_SEQUENCE - len(sequence_index)) * [dictionary[PAD]]
        # append the index-encoded sentence to sequences_input_index
        sequences_input_index.append(sequence_index)
    return np.asarray(sequences_input_index)
```

- decoder processing (a sketch of the corresponding function follows below)
    - decoder input: `<SOS>, 그래, 오랜만이야, <PAD>`
    - decoder target: `그래, 오랜만이야, <END>, <PAD>`
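The note only lists the decoder input/target layout, so here is a minimal sketch of the matching preprocessing step, assuming the same `CHANGE_FILTER`, `MAX_SEQUENCE`, and special tokens (`STD` for `<SOS>`, plus `END`, `PAD`, `UNK`) used by `enc_processing`; the function name `dec_processing` and its single-pass structure are illustrative, not the original code.

```python=
# Illustrative sketch only: builds decoder inputs (<SOS> + tokens) and decoder
# targets (tokens + <END>), each padded to MAX_SEQUENCE, mirroring enc_processing.
# STD, END, PAD, UNK, MAX_SEQUENCE, CHANGE_FILTER are assumed to be defined as above.
def dec_processing(value, dictionary):
    sequences_input_index = []
    sequences_target_index = []
    for sequence in value:
        sequence = re.sub(CHANGE_FILTER, "", sequence)
        tokens = [dictionary.get(word, dictionary[UNK]) for word in sequence.split()]
        # decoder input starts with <SOS>; decoder target ends with <END>
        dec_input = [dictionary[STD]] + tokens[:MAX_SEQUENCE - 1]
        dec_target = tokens[:MAX_SEQUENCE - 1] + [dictionary[END]]
        # pad both sequences with PAD (0) up to MAX_SEQUENCE
        dec_input += (MAX_SEQUENCE - len(dec_input)) * [dictionary[PAD]]
        dec_target += (MAX_SEQUENCE - len(dec_target)) * [dictionary[PAD]]
        sequences_input_index.append(dec_input)
        sequences_target_index.append(dec_target)
    return np.asarray(sequences_input_index), np.asarray(sequences_target_index)
```

For the example above with `MAX_SEQUENCE = 4`, the answer "그래 오랜만이야" would become the input `<SOS>, 그래, 오랜만이야, <PAD>` and the target `그래, 오랜만이야, <END>, <PAD>`.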
- encoder

```python=
class Encoder(tf.keras.layers.Layer):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.embedding = tf.keras.layers.Embedding(self.vocab_size, self.embedding_dim)
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')

    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state=hidden)
        return output, state

    def initialize_hidden_state(self, inp):
        return tf.zeros((tf.shape(inp)[0], self.enc_units))
```

- decoder

```python=
class Decoder(tf.keras.layers.Layer):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.embedding = tf.keras.layers.Embedding(self.vocab_size, self.embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(self.vocab_size)
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
        # attend over the encoder outputs using the previous decoder hidden state
        context_vector, attention_weights = self.attention(hidden, enc_output)
        x = self.embedding(x)
        # concatenate the context vector with the embedded input token
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        output, state = self.gru(x)
        output = tf.reshape(output, (-1, output.shape[2]))
        # project to vocabulary-sized logits
        x = self.fc(output)
        return x, state, attention_weights
```

- attention

```python=
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        # query: decoder hidden state, values: encoder outputs
        hidden_with_time_axis = tf.expand_dims(query, 1)
        # additive (Bahdanau) score for every encoder time step
        score = self.V(tf.nn.tanh(
            self.W1(values) + self.W2(hidden_with_time_axis)))
        attention_weights = tf.nn.softmax(score, axis=1)
        # context vector: attention-weighted sum of the encoder outputs
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights
```
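Not part of the original note, but a quick way to see how the three layers fit together is to push dummy tensors through them; the hyperparameter values below are arbitrary placeholders chosen only for this shape check.

```python=
# Minimal shape check with made-up hyperparameters.
import tensorflow as tf

vocab_size, embedding_dim, units, batch_sz, max_len = 100, 64, 128, 2, 10
sample_inp = tf.zeros((batch_sz, max_len), dtype=tf.int32)

encoder = Encoder(vocab_size, embedding_dim, units, batch_sz)
enc_hidden = encoder.initialize_hidden_state(sample_inp)      # (2, 128)
enc_output, enc_hidden = encoder(sample_inp, enc_hidden)      # (2, 10, 128), (2, 128)

attention = BahdanauAttention(units)
context_vector, attention_weights = attention(enc_hidden, enc_output)
# context_vector: (2, 128), attention_weights: (2, 10, 1)

decoder = Decoder(vocab_size, embedding_dim, units, batch_sz)
# one token index per example (the value does not matter for the shape check)
dec_input = tf.zeros((batch_sz, 1), dtype=tf.int32)
predictions, dec_hidden, _ = decoder(dec_input, enc_hidden, enc_output)
# predictions: (2, 100) logits over the vocabulary, dec_hidden: (2, 128)
```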
- seq2seq

```python=
class seq2seq(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, dec_units, batch_sz,
                 end_token_idx=2):
        super(seq2seq, self).__init__()
        self.end_token_idx = end_token_idx
        self.encoder = Encoder(vocab_size, embedding_dim, enc_units, batch_sz)
        self.decoder = Decoder(vocab_size, embedding_dim, dec_units, batch_sz)

    def call(self, x):
        # training: teacher forcing with the decoder input sequence tar
        inp, tar = x
        enc_hidden = self.encoder.initialize_hidden_state(inp)
        enc_output, enc_hidden = self.encoder(inp, enc_hidden)
        dec_hidden = enc_hidden
        predict_tokens = list()
        for t in range(0, tar.shape[1]):
            dec_input = tf.dtypes.cast(tf.expand_dims(tar[:, t], 1), tf.float32)
            predictions, dec_hidden, _ = self.decoder(dec_input, dec_hidden, enc_output)
            predict_tokens.append(tf.dtypes.cast(predictions, tf.float32))
        return tf.stack(predict_tokens, axis=1)

    def inference(self, x):
        # greedy decoding: feed the previous prediction back in until <END> or MAX_SEQUENCE
        inp = x
        enc_hidden = self.encoder.initialize_hidden_state(inp)
        enc_output, enc_hidden = self.encoder(inp, enc_hidden)
        dec_hidden = enc_hidden
        dec_input = tf.expand_dims([char2idx[std_index]], 1)
        predict_tokens = list()
        for t in range(0, MAX_SEQUENCE):
            predictions, dec_hidden, _ = self.decoder(dec_input, dec_hidden, enc_output)
            predict_token = tf.argmax(predictions[0])
            if predict_token == self.end_token_idx:
                break
            predict_tokens.append(predict_token)
            dec_input = tf.dtypes.cast(tf.expand_dims([predict_token], 0), tf.float32)
        return tf.stack(predict_tokens, axis=0).numpy()
```
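The note stops at the model definition. One plausible way to train it (not from the original note) is a masked sparse categorical cross-entropy that ignores `PAD` positions, compiled onto the model; the names `index_inputs`, `index_dec_inputs`, `index_targets`, `EPOCHS`, and the hyperparameters are placeholders for whatever the preprocessing steps above produced.

```python=
# Hypothetical training setup: index_inputs / index_dec_inputs / index_targets
# and the hyperparameters are assumed to come from the preprocessing steps above.
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss(real, pred):
    # mask out PAD (index 0) positions so padding does not contribute to the loss
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    return tf.reduce_mean(loss_ * mask)

# char2idx[END] is the index of the <END> token in the vocabulary built above
model = seq2seq(vocab_size, embedding_dim, enc_units, dec_units, batch_sz,
                end_token_idx=char2idx[END])
model.compile(loss=loss, optimizer=optimizer)
model.fit([index_inputs, index_dec_inputs], index_targets,
          batch_size=batch_sz, epochs=EPOCHS)
```

Using `from_logits=True` matches the decoder's final `Dense` layer, which outputs raw logits rather than softmax probabilities.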