2021-03-16
===
###### tags: `Chatbot`
- Part-of-speech preprocessing with a morphological analyzer
```python=
from konlpy.tag import Okt

okt = Okt()

query_NVA_token_sentences = list()
answer_NVA_token_sentences = list()

# Keep only the nouns, verbs, and adjectives from each question/answer sentence.
for s in query_sentences:
    for token, tag in okt.pos(s.replace(' ', '')):
        if tag == 'Noun' or tag == 'Verb' or tag == 'Adjective':
            query_NVA_token_sentences.append(token)

for s in answer_sentences:
    for token, tag in okt.pos(s.replace(' ', '')):
        if tag == 'Noun' or tag == 'Verb' or tag == 'Adjective':
            answer_NVA_token_sentences.append(token)

query_NVA_token_sentences = ' '.join(query_NVA_token_sentences)
answer_NVA_token_sentences = ' '.join(answer_NVA_token_sentences)
```
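- A quick sanity check of the same filtering on a single made-up sentence (the sentence below is only an illustration):
```python=
sample = '3박 4일 놀러가고 싶다'
print(okt.pos(sample.replace(' ', '')))  # list of (token, POS tag) pairs
print([t for t, tag in okt.pos(sample.replace(' ', ''))
       if tag in ('Noun', 'Verb', 'Adjective')])  # only N/V/A tokens survive the filter
```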
- Building the idx2char and char2idx dictionaries
```python=
def make_vocabulary(vocabulary_list):
    # Build a dictionary whose keys are words and whose values are their indices.
    char2idx = {char: idx for idx, char in enumerate(vocabulary_list)}
    # Build the reverse dictionary whose keys are indices and whose values are words.
    idx2char = {idx: char for idx, char in enumerate(vocabulary_list)}
    return char2idx, idx2char
```
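- Usage sketch, assuming the vocabulary list starts with the special markers (the marker strings and example words here are illustrative):
```python=
MARKER = ['<PAD>', '<SOS>', '<END>', '<UNK>']
vocabulary_list = MARKER + ['그래', '오랜만이야']

char2idx, idx2char = make_vocabulary(vocabulary_list)
print(char2idx)     # {'<PAD>': 0, '<SOS>': 1, '<END>': 2, '<UNK>': 3, '그래': 4, '오랜만이야': 5}
print(idx2char[4])  # 그래
```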
- encoder processing
```python=
import re
import numpy as np

# Assumed module-level constants (defined elsewhere in the preprocessing code), e.g.:
# PAD, STD, END, UNK = '<PAD>', '<SOS>', '<END>', '<UNK>'
# CHANGE_FILTER = re.compile("([~.,!?\"':;)(])")
# MAX_SEQUENCE = 25

def enc_processing(value, dictionary, tokenize_as_morph=False):
    sequences_input_index = []
    sequences_length = []
    # Optionally tokenize with the morphological analyzer first
    # (prepro_like_morphlized is a helper defined elsewhere).
    if tokenize_as_morph:
        value = prepro_like_morphlized(value)
    # Process one sentence at a time.
    for sequence in value:
        # Strip punctuation; FILTERS = "([~.,!?\"':;)(])"
        sequence = re.sub(CHANGE_FILTER, "", sequence)
        sequence_index = []
        # Split the sentence on whitespace.
        for word in sequence.split():
            # If the word exists in the dictionary, append its index to sequence_index.
            if dictionary.get(word) is not None:
                sequence_index.extend([dictionary[word]])
            # Otherwise the word is out of vocabulary, so append the UNK index.
            else:
                sequence_index.extend([dictionary[UNK]])
        # If the sentence exceeds the length limit, drop the trailing tokens.
        if len(sequence_index) > MAX_SEQUENCE:
            sequence_index = sequence_index[:MAX_SEQUENCE]
        # Record the real (unpadded) sentence length.
        sequences_length.append(len(sequence_index))
        # If the sentence is shorter than MAX_SEQUENCE, fill the rest with PAD(0).
        sequence_index += (MAX_SEQUENCE - len(sequence_index)) * [dictionary[PAD]]
        # Append the indexed sentence to sequences_input_index.
        sequences_input_index.append(sequence_index)
    return np.asarray(sequences_input_index), sequences_length
```
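- Toy usage, assuming the constants sketched above are defined (the dictionary and sentence are made up for illustration):
```python=
char2idx = {'<PAD>': 0, '<SOS>': 1, '<END>': 2, '<UNK>': 3, '그래': 4, '오랜만이야': 5}

index_inputs, input_seq_len = enc_processing(['그래 오랜만이야 반가워'], char2idx)
print(index_inputs)   # [[4 5 3 0 0 ...]] -> known words, UNK for '반가워', PAD(0) up to MAX_SEQUENCE
print(input_seq_len)  # [3]
```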
- decoder processing
    - Decoder input: `<SOS>, 그래, 오랜만이야, <PAD>`
    - Decoder target: `그래, 오랜만이야, <END>, <PAD>`
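- A rough sketch of the matching decoder-side preprocessing (not the note's original code): the decoder input prepends `<SOS>`, the decoder target appends `<END>`, and both are padded to `MAX_SEQUENCE`. `STD`/`END` are the assumed marker constants from above.
```python=
def dec_output_processing(value, dictionary):
    # Decoder input: <SOS> + tokens, truncated and padded to MAX_SEQUENCE.
    sequences_output_index = []
    for sequence in value:
        sequence = re.sub(CHANGE_FILTER, "", sequence)
        sequence_index = [dictionary[STD]] + [dictionary.get(w, dictionary[UNK])
                                              for w in sequence.split()]
        sequence_index = sequence_index[:MAX_SEQUENCE]
        sequence_index += (MAX_SEQUENCE - len(sequence_index)) * [dictionary[PAD]]
        sequences_output_index.append(sequence_index)
    return np.asarray(sequences_output_index)

def dec_target_processing(value, dictionary):
    # Decoder target: tokens + <END>, truncated and padded to MAX_SEQUENCE.
    sequences_target_index = []
    for sequence in value:
        sequence = re.sub(CHANGE_FILTER, "", sequence)
        sequence_index = [dictionary.get(w, dictionary[UNK]) for w in sequence.split()]
        # Truncate so that <END> always fits inside MAX_SEQUENCE.
        sequence_index = sequence_index[:MAX_SEQUENCE - 1] + [dictionary[END]]
        sequence_index += (MAX_SEQUENCE - len(sequence_index)) * [dictionary[PAD]]
        sequences_target_index.append(sequence_index)
    return np.asarray(sequences_target_index)
```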
- encoder
```python=
import tensorflow as tf

class Encoder(tf.keras.layers.Layer):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.embedding = tf.keras.layers.Embedding(self.vocab_size, self.embedding_dim)
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')

    def call(self, x, hidden):
        # (batch, seq_len) token indices -> (batch, seq_len, embedding_dim)
        x = self.embedding(x)
        # output: (batch, seq_len, enc_units), state: (batch, enc_units)
        output, state = self.gru(x, initial_state=hidden)
        return output, state

    def initialize_hidden_state(self, inp):
        # Zero initial hidden state sized by the actual batch dimension of inp.
        return tf.zeros((tf.shape(inp)[0], self.enc_units))
```
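- Quick shape check with assumed hyperparameters (the numbers are arbitrary):
```python=
vocab_size, embedding_dim, units, batch_size = 100, 64, 128, 2

encoder = Encoder(vocab_size, embedding_dim, units, batch_size)
sample_input = tf.zeros((batch_size, 25), dtype=tf.int32)       # (batch, MAX_SEQUENCE)
sample_hidden = encoder.initialize_hidden_state(sample_input)   # (batch, units)
enc_output, enc_state = encoder(sample_input, sample_hidden)
print(enc_output.shape)  # (2, 25, 128)
print(enc_state.shape)   # (2, 128)
```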
- decoder
```python=
class Decoder(tf.keras.layers.Layer):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.embedding = tf.keras.layers.Embedding(self.vocab_size, self.embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(self.vocab_size)
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
        # Attend over the encoder outputs, using the previous decoder state as the query.
        context_vector, attention_weights = self.attention(hidden, enc_output)
        # x: (batch, 1) token index -> (batch, 1, embedding_dim)
        x = self.embedding(x)
        # Concatenate the context vector with the embedded input token.
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        output, state = self.gru(x)
        # (batch, 1, dec_units) -> (batch, dec_units)
        output = tf.reshape(output, (-1, output.shape[2]))
        # Project to vocabulary-size logits.
        x = self.fc(output)
        return x, state, attention_weights
```
- attention
```python=
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        # query: (batch, hidden) -> (batch, 1, hidden) so it broadcasts over the time axis.
        hidden_with_time_axis = tf.expand_dims(query, 1)
        # score: (batch, seq_len, 1)
        score = self.V(tf.nn.tanh(
            self.W1(values) + self.W2(hidden_with_time_axis)))
        # Normalize the scores over the time axis.
        attention_weights = tf.nn.softmax(score, axis=1)
        # Weighted sum of the encoder outputs -> context vector (batch, enc_units).
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights
```
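- Continuing the shape check from the encoder example above (same assumed variables), one decoder step through attention and the decoder:
```python=
attention = BahdanauAttention(units)
context, weights = attention(sample_hidden, enc_output)
print(context.shape)   # (2, 128)
print(weights.shape)   # (2, 25, 1)

decoder = Decoder(vocab_size, embedding_dim, units, batch_size)
dec_input = tf.zeros((batch_size, 1), dtype=tf.int32)   # one start-token-like step per example
logits, dec_state, _ = decoder(dec_input, sample_hidden, enc_output)
print(logits.shape)    # (2, 100) -> vocabulary-size logits for the next token
```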
- seq2seq
```python=
class seq2seq(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, dec_units, batch_sz, end_token_idx=2):
        super(seq2seq, self).__init__()
        self.end_token_idx = end_token_idx
        self.encoder = Encoder(vocab_size, embedding_dim, enc_units, batch_sz)
        self.decoder = Decoder(vocab_size, embedding_dim, dec_units, batch_sz)

    def call(self, x):
        inp, tar = x
        enc_hidden = self.encoder.initialize_hidden_state(inp)
        enc_output, enc_hidden = self.encoder(inp, enc_hidden)
        dec_hidden = enc_hidden
        predict_tokens = list()
        # Teacher forcing: feed the ground-truth token at every step of the target sequence.
        for t in range(0, tar.shape[1]):
            dec_input = tf.dtypes.cast(tf.expand_dims(tar[:, t], 1), tf.float32)
            predictions, dec_hidden, _ = self.decoder(dec_input, dec_hidden, enc_output)
            predict_tokens.append(tf.dtypes.cast(predictions, tf.float32))
        # (batch, target_len, vocab_size) logits
        return tf.stack(predict_tokens, axis=1)

    def inference(self, x):
        inp = x
        enc_hidden = self.encoder.initialize_hidden_state(inp)
        enc_output, enc_hidden = self.encoder(inp, enc_hidden)
        dec_hidden = enc_hidden
        # Start decoding from <SOS> (std_index is the start-marker string defined elsewhere).
        dec_input = tf.expand_dims([char2idx[std_index]], 1)
        predict_tokens = list()
        for t in range(0, MAX_SEQUENCE):
            predictions, dec_hidden, _ = self.decoder(dec_input, dec_hidden, enc_output)
            predict_token = tf.argmax(predictions[0])
            # Stop once the <END> token is generated.
            if predict_token == self.end_token_idx:
                break
            predict_tokens.append(predict_token)
            # Feed the predicted token back in as the next decoder input.
            dec_input = tf.dtypes.cast(tf.expand_dims([predict_token], 0), tf.float32)
        return tf.stack(predict_tokens, axis=0).numpy()
```
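- For training, a hedged sketch of an optimizer/loss setup that masks out PAD(0) positions (the names `index_inputs`, `index_outputs`, `index_targets` and the hyperparameters are assumptions, not part of this note):
```python=
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss(real, pred):
    # Ignore positions where the target is PAD(0) when averaging the loss.
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    loss_ *= tf.cast(mask, dtype=loss_.dtype)
    return tf.reduce_mean(loss_)

model = seq2seq(vocab_size, embedding_dim, units, units, batch_size, end_token_idx=char2idx['<END>'])
model.compile(loss=loss, optimizer=optimizer)
# model.fit((index_inputs, index_outputs), index_targets, batch_size=batch_size, epochs=30)
```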