Reference:
語言模型這 4 個字看似很博大精深,但它指的其實就是「用來估計一個句子(字詞序列)出現機率的模型」。
from collections import Counter, namedtuple
import json
import re
DATASET_DIR = './WebNews.json'

# Load the dataset and keep only the 'detailcontent' field of every record.
with open(DATASET_DIR, encoding = 'utf8') as f:
    dataset = json.load(f)
seg_list = [d['detailcontent'] for d in dataset]

# Matches any character outside U+4E00-U+9FA5 (the Han character range);
# substituting matches with '' leaves only Han characters.
rule = re.compile('[^\u4e00-\u9fa5]')

# Clean every article exactly once. Running a second, character-by-character
# cleaning pass that appends to this same list would store each article
# twice and double every raw count computed from NEW_seg_list afterwards.
NEW_seg_list = [rule.sub('', seg) for seg in seg_list]

print(NEW_seg_list[2])
def ngram(documents, N=2):
    """Build a next-token prediction table from N-gram counts.

    Each document is split into its individual items (characters, when the
    document is a string) and wrapped with the '<s>' / '</s>' boundary
    markers before counting, so N-grams never span two documents.

    Args:
        documents: Iterable of documents; each document is passed to
            ``list()`` to obtain its tokens.
        N: Order of the model (2 = bigram, 3 = trigram, ...). Must be >= 1.

    Returns:
        A dict mapping each context (the first N-1 tokens joined into one
        string; the empty string when N == 1) to a set of
        ``Word(word, prob)`` named tuples, where ``word`` is a token seen
        after that context and ``prob`` is
        count(context + word) / count(context) formatted with '{:.3g}'
        (i.e. a string, not a float).

    Raises:
        ValueError: If N is smaller than 1.
    """
    if N < 1:
        raise ValueError('N must be >= 1, got {!r}'.format(N))

    Word = namedtuple('Word', ['word', 'prob'])  # (next token, formatted probability)

    gram_counts = Counter()     # numerator: occurrences of each N-gram
    context_counts = Counter()  # denominator: occurrences of each (N-1)-token context
    for doc in documents:
        tokens = ['<s>'] + list(doc) + ['</s>']
        # Documents shorter than N tokens yield an empty range and are skipped.
        for start in range(len(tokens) - N + 1):
            gram = tuple(tokens[start:start + N])
            gram_counts[gram] += 1
            # Count the context from the N-gram itself so the denominator is
            # exactly the number of N-grams that start with it (this keeps
            # unigram probabilities, whose context is empty, summing to 1).
            context_counts[gram[:-1]] += 1

    ngram_prediction = {}
    for gram, count in gram_counts.items():
        context = gram[:-1]
        probability = count / context_counts[context]
        candidate = Word(gram[-1], '{:.3g}'.format(probability))
        ngram_prediction.setdefault(''.join(context), set()).add(candidate)
    return ngram_prediction
# Build the trigram table, then rank the candidates of every context from
# most to least probable.
tri_prediction = ngram(NEW_seg_list, N=3)
# 'prob' is a formatted string, so it is converted to float for ranking;
# comparing the strings directly would order them lexicographically
# (e.g. '1e-05' above '0.179'). The word is a secondary key so that ties
# come out in the same order on every run.
tri_prediction = {
    context: sorted(
        candidates,
        key=lambda cand: (float(cand.prob), cand.word),
        reverse=True,
    )
    for context, candidates in tri_prediction.items()
}

# Show the five most probable characters following the two-character context.
text = '韓國'
next_words = tri_prediction[text][:5]
for next_word in next_words:
    print('next word: {}, probability: {}'.format(next_word.word, next_word.prob))
在前面的'韓國'出現過的情況下,後面的'隊'也一起出現的機率為何?也就是條件機率 P(隊 | 韓國) = count(韓國隊) / count(韓國)。
# Cross-check one trigram probability by counting raw occurrences.
# Articles are joined with a newline so that a phrase cannot be matched
# across the boundary between two articles (the cleaned text contains only
# Han characters, so '\n' never occurs inside an article).
string = '\n'.join(NEW_seg_list)

pattern1 = re.compile('韓國隊')
r1 = pattern1.findall(string)
print('\'韓國隊\':共有 {} 個。\n'.format(len(r1)), r1, '\n')

pattern2 = re.compile('韓國')
r2 = pattern2.findall(string)
print('\'韓國\':共有 {} 個。\n'.format(len(r2)), r2)

# Guard against a corpus in which the context never occurs.
ratio = round(len(r1) / len(r2), 3) if r2 else float('nan')
print('\n{} / {} = {}'.format(len(r1), len(r2), ratio), '\n---\n')
'韓國隊':共有 5 個。
['韓國隊', '韓國隊', '韓國隊', '韓國隊', '韓國隊']
'韓國':共有 28 個。
['韓國', '韓國', '韓國', '韓國', '韓國', '韓國', '韓國', '韓國', '韓國', '韓國', '韓國', '韓國', '韓國', '韓國', '韓國', '韓國', '韓國', '韓國', '韓國', '韓國', '韓國', '韓國', '韓國', '韓國', '韓國', '韓國', '韓國', '韓國']
5 / 28 = 0.179
---
next word: 隊, probability: 0.179
next word: 及, probability: 0.179
next word: 明, probability: 0.0714
next word: 日, probability: 0.0714
next word: 音, probability: 0.0357