## Libraries
```
import pysrt
import pandas as pd
import textacy.preprocessing as tprep
import re
import faiss
from sentence_transformers import SentenceTransformer
import time
import gensim
import nltk  # needed for nltk.word_tokenize in the topic modeling step (may require nltk.download("punkt"))
from nltk.corpus import stopwords
from gensim.models import LdaModel
from gensim.corpora import Dictionary
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
from rouge_score import rouge_scorer
```
## Import Data
```
subs = pysrt.open("frozen.srt")
DF = pd.DataFrame([
    {
        "Text": sub.text,
        "Start": sub.start.seconds,
        "End": sub.end.seconds
    }
    for sub in subs
])
DF.head()
```
## Clean Up Text
```
def normalize(text):
    text = tprep.normalize.hyphenated_words(text)
    text = tprep.normalize.quotation_marks(text)
    text = tprep.normalize.unicode(text)
    text = tprep.remove.accents(text)
    text = re.sub("\\n", " ", text)
    return text
DF["Normalized"] = DF["Text"].apply(normalize)
DF["Normalized"]
```
## Create Search Engine
```
# Load a pre-trained model
model = SentenceTransformer('msmarco-MiniLM-L-12-v3')
search_data = DF["Normalized"].tolist()
# embed frozen
search_data_embds = model.encode(search_data)
# Create an index using FAISS
index = faiss.IndexFlatL2(search_data_embds.shape[1])
index.add(search_data_embds)
faiss.write_index(index, 'index_search_data')
index = faiss.read_index('index_search_data')
# define a search
def search(query, k):
    t = time.time()
    query_vector = model.encode([query])
    top_k = index.search(query_vector, k)
    print('totaltime: {}'.format(time.time() - t))
    return [search_data[_id] for _id in top_k[1].tolist()[0]]
```
## Search
```
print(search("machine", 5))
print(search("what is this movie about", 5))
# a couple more on your own
# be sure to answer the question from the assignment
```
## Summarization
- Create a human summary of the text.
- Create text summaries using LSA, TextRank, and Topic Modeling.
- Assess those summaries using the Rouge-N analyzer.
- Which summary was the best when compared to the human summary?
### Topic Modeling
```
# Create a dictionary representation of the documents.
# load the list of documents
processed_sentences = [nltk.word_tokenize(row) for row in DF["Normalized"].to_list()]
dictionary = Dictionary(processed_sentences)
# Filter infrequent or too frequent words.
dictionary.filter_extremes(no_below=10, no_above=0.5)
corpus = [dictionary.doc2bow(summary) for summary in processed_sentences]
```
```
# Train the topic model
LDAmodel = LdaModel(corpus = corpus,
id2word = dictionary,
iterations = 400,
num_topics = 10,
random_state = 100,
update_every = 1,
chunksize = 100,
passes = 10,
alpha = 'auto',
per_word_topics = True)
top_topics = list(LDAmodel.top_topics(corpus))
print(top_topics)
```
```
probs = [LDAmodel.get_document_topics(sentence) for sentence in corpus]
save_probs = []
i = 0
for document in probs:
    for (topic, prob) in document:
        if topic == 0: # topic 0 is the best representation
            save_probs.append((DF["Normalized"].to_list()[i], prob))
    i = i + 1
DF_two = pd.DataFrame(save_probs, columns = ["sentence", "prob"])
# join the three sentences with the highest probability for topic 0
TMsummary = " ".join(DF_two.sort_values(by = ["prob"], ascending = False)[0:3]['sentence'])
TMsummary
```
### Text Rank
```
# set up
LANGUAGE = "english"
stemmer = Stemmer(LANGUAGE)
wholetext = " ".join(DF['Normalized'].to_list())
num_summary_sentence = 3
# make summary
parser = PlaintextParser.from_string(wholetext, Tokenizer(LANGUAGE))
summarizer = TextRankSummarizer(stemmer)
summarizer.stop_words = get_stop_words(LANGUAGE)
for sentence in summarizer(parser.document, num_summary_sentence):
    print(str(sentence))
TRsummary = ' '.join(str(sentence) for sentence in summarizer(parser.document, num_summary_sentence))
```
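### LSA
The task list above also asks for an LSA summary, which is not coded elsewhere in this section. Here is a minimal sketch using sumy's LsaSummarizer, reusing the parser, stemmer, and sentence count from the Text Rank setup; the variable name LSAsummary is an assumption (not in the original code), and the result can be scored with the same ROUGE scorer used in the Assessment below.
```
# LSA summary sketch -- reuses parser, stemmer, LANGUAGE, and num_summary_sentence from the Text Rank setup
from sumy.summarizers.lsa import LsaSummarizer

lsa_summarizer = LsaSummarizer(stemmer)
lsa_summarizer.stop_words = get_stop_words(LANGUAGE)
for sentence in lsa_summarizer(parser.document, num_summary_sentence):
    print(str(sentence))
LSAsummary = ' '.join(str(sentence) for sentence in lsa_summarizer(parser.document, num_summary_sentence))
```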
### Assessment
```
# human "gold standard" summary -- this placeholder describes a different movie, so replace it with your own summary of the movie you loaded
gold_standard = "Armed with every weapon they can get their hands on, the Expendables are the world's last line of defense and the team that gets called when all other options are off the table."

def print_rouge_score(rouge_score):
    for k, v in rouge_score.items():
        print(k, 'Precision:', "{:.2f}".format(v.precision), 'Recall:', "{:.2f}".format(v.recall), 'fmeasure:', "{:.2f}".format(v.fmeasure))
```
```
scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
scores = scorer.score(gold_standard, TRsummary)
print_rouge_score(scores)
scores = scorer.score(gold_standard, TMsummary)
print_rouge_score(scores)
```
# Classification
## Libraries
```
# feature extraction section
import pandas as pd
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
nltk.download("punkt")
mystopwords = set(stopwords.words("english"))
from string import punctuation
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
# machine learning part
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
```
## Functions
```
import gensim
from gensim.models import Word2Vec
#create flattening function
import numpy as np
def document_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index_to_key)

    def average_word_vectors(words, model, vocabulary, num_features):
        feature_vector = np.zeros((num_features,), dtype="float64")
        nwords = 0.
        for word in words:
            if word in vocabulary:
                nwords = nwords + 1.
                feature_vector = np.add(feature_vector, model.wv[word])
        if nwords:
            feature_vector = np.divide(feature_vector, nwords)
        return feature_vector

    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                for tokenized_sentence in corpus]
    return np.array(features)
```
### Algorithm Functions
```
# algorithm functions
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import eli5
from lime.lime_text import LimeTextExplainer
import matplotlib.pyplot as plt
```
## Set up Text
- Do preprocessing on your text to prepare it for the final machine learning model.
- What items do you think will be important in your preprocessing to clean?
```
# be sure to update
DF = pd.read_csv("small_data.csv")
DF.head()
```
```
def text_clean(text):
    # lower case
    text = text.lower()
    # remove numbers and punctuation tokens
    text = " ".join([word for word in nltk.word_tokenize(text) if not word.isdigit() and word not in punctuation])
    # stop words
    text = " ".join([word for word in nltk.word_tokenize(text) if word not in mystopwords])
    # stemming (stem each token rather than the whole string)
    text = " ".join([stemmer.stem(word) for word in nltk.word_tokenize(text)])
    return text.strip()
```
```
DF['Normalized'] = DF['text'].apply(text_clean)
```
## Create Feature Extractions
- Create a “one-hot” encoding using the count vectorizer and binary options.
- Create the bag of words encoding using the count vectorizer.
- Create the TF-IDF normalization using the tfidf vectorizer.
- Create two word2vec models:
  - Using 100 dimensions.
  - Using cbow and skipgram embeddings.
  - Using a 5 window size.
## Split the Data
```
# split the data
x_train, x_test, y_train, y_test = train_test_split(DF['Normalized'], DF['humor'],
test_size = .20, random_state = 4320)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)
```
### One Hot
```
# one hot matrix
onehot_vectorizer = CountVectorizer(binary=True)
# convert the training data into words by documents matrix
# only words seen during training will be useable
oh_train = onehot_vectorizer.fit_transform(x_train)
# makes this the same vocabulary and the same size
oh_test = onehot_vectorizer.transform(x_test)
print(oh_train.shape)
print(oh_test.shape)
print(oh_train[0:10])
```
### Bag of Words
```
# bag of words
bow_vectorizer = CountVectorizer()
# convert the training data into words by documents matrix
# only words seen during training will be useable
bow_train = bow_vectorizer.fit_transform(x_train)
# makes this the same vocabulary and the same size
bow_test = bow_vectorizer.transform(x_test)
print(bow_train.shape)
print(bow_test.shape)
print(bow_train[0:10])
```
### TF-IDF
```
# tf-idf
tf_vectorizer = TfidfVectorizer()
# convert the training data into words by documents matrix
# only words seen during training will be useable
tf_train = tf_vectorizer.fit_transform(x_train)
# makes this the same vocabulary and the same size
tf_test = tf_vectorizer.transform(x_test)
print(tf_train.shape)
print(tf_test.shape)
print(tf_train[0:10])
```
### Word2vec
```
# word2vec expects tokenized sentences, so tokenize the text first
x_train_tokens = [nltk.word_tokenize(doc) for doc in x_train]
x_test_tokens = [nltk.word_tokenize(doc) for doc in x_test]
# train the model on the training data only
our_model = Word2Vec(x_train_tokens,
                     vector_size = 1000, #dimensions
                     window = 5, #window size
                     sg = 0, #cbow
                     min_count = 1,
                     workers = 4)
# generate averaged word vector features from word2vec model
avg_wv_train_features = document_vectorizer(corpus = x_train_tokens,
                                            model = our_model,
                                            num_features = 1000)
avg_wv_test_features = document_vectorizer(corpus = x_test_tokens,
                                           model = our_model,
                                           num_features = 1000)
```
```
# train the skip-gram model on the training data only (uses the tokenized text from above)
our_model_sg = Word2Vec(x_train_tokens,
                        vector_size = 1000, #dimensions
                        window = 5, #window size
                        sg = 1, #skip-gram
                        min_count = 1,
                        workers = 4)
# generate averaged word vector features from word2vec model
avg_wv_train_features_sg = document_vectorizer(corpus = x_train_tokens,
                                               model = our_model_sg,
                                               num_features = 1000)
avg_wv_test_features_sg = document_vectorizer(corpus = x_test_tokens,
                                              model = our_model_sg,
                                              num_features = 1000)
```
## Classification Models
- Classify: use at least two classification algorithms to predict the outcome of the data.
### SVM
```
oh_svm = LinearSVC()
oh_svm.fit(oh_train, y_train)
oh_y_svm = oh_svm.predict(oh_test)
bow_svm = LinearSVC()
bow_svm.fit(bow_train, y_train)
bow_y_svm = bow_svm.predict(bow_test)
tf_svm = LinearSVC()
tf_svm.fit(tf_train, y_train)
tf_y_svm = tf_svm.predict(tf_test)
wv_c_svm = LinearSVC()
wv_c_svm.fit(avg_wv_train_features, y_train)
wv_c_y_svm = wv_c_svm.predict(avg_wv_test_features)
wv_s_svm = LinearSVC()
wv_s_svm.fit(avg_wv_train_features_sg, y_train)
wv_s_y_svm = wv_s_svm.predict(avg_wv_test_features_sg)
```
### Bayes
```
#bayes
oh_bayes = MultinomialNB()
oh_bayes.fit(oh_train, y_train)
oh_y_bayes = oh_bayes.predict(oh_test)
bow_bayes = MultinomialNB()
bow_bayes.fit(bow_train, y_train)
bow_y_bayes = bow_bayes.predict(bow_test)
tf_bayes = MultinomialNB()
tf_bayes.fit(tf_train, y_train)
tf_y_bayes = tf_bayes.predict(tf_test)
```
- Deal with negative values in the word2vec output.
- Check the minimum of each matrix, then add a constant to every matrix so all values are non-negative (MultinomialNB cannot handle negative features).
```
avg_wv_train_features.min()
avg_wv_test_features.min()
avg_wv_train_features_sg.min()
avg_wv_test_features_sg.min()
avg_wv_train_features = avg_wv_train_features + 1
avg_wv_test_features = avg_wv_test_features + 1
avg_wv_train_features_sg = avg_wv_train_features_sg + 1
avg_wv_test_features_sg = avg_wv_test_features_sg + 1
```
- Then run the algorithm. This shift is only needed for Naive Bayes; the SVM models can handle negative features.
```
wv_c_bayes = MultinomialNB()
wv_c_bayes.fit(avg_wv_train_features, y_train)
wv_c_y_bayes = wv_c_bayes.predict(avg_wv_test_features)
wv_s_bayes = MultinomialNB()
wv_s_bayes.fit(avg_wv_train_features_sg, y_train)
wv_s_y_bayes = wv_s_bayes.predict(avg_wv_test_features_sg)
```
Include the model assessment of these predictions for all models.
```
print(classification_report(y_test, oh_y_svm))
print(classification_report(y_test, bow_y_svm))
print(classification_report(y_test, tf_y_svm))
print(classification_report(y_test, wv_c_y_svm))
print(classification_report(y_test, wv_s_y_svm))
print(classification_report(y_test, oh_y_bayes))
print(classification_report(y_test, bow_y_bayes))
print(classification_report(y_test, tf_y_bayes))
print(classification_report(y_test, wv_c_y_bayes))
print(classification_report(y_test, wv_s_y_bayes))
```
Write a paragraph summarizing the results from your comparisons. What models are best? Are there any general differences/similarities in prediction you see? How well is each category label classified? What might you do to make the model better?
- Consider accuracy first.
- Precision and recall second.
- Simpler / computationally faster models are preferred.
- If all else is equal, pick your favorite.
If you want, you can switch out one of the classification algorithms with a deep learning model.
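To support the written comparison, one optional approach (not part of the original code) is to collect accuracy for every feature set / algorithm pair into a single table. A minimal sketch, assuming the prediction variables created above; the `results` name is an arbitrary choice:
```
# optional: gather accuracy for each feature set / algorithm pair into one table
from sklearn.metrics import accuracy_score

results = pd.DataFrame([
    {"features": "one-hot",            "algorithm": "SVM",   "accuracy": accuracy_score(y_test, oh_y_svm)},
    {"features": "bag of words",       "algorithm": "SVM",   "accuracy": accuracy_score(y_test, bow_y_svm)},
    {"features": "tf-idf",             "algorithm": "SVM",   "accuracy": accuracy_score(y_test, tf_y_svm)},
    {"features": "word2vec cbow",      "algorithm": "SVM",   "accuracy": accuracy_score(y_test, wv_c_y_svm)},
    {"features": "word2vec skip-gram", "algorithm": "SVM",   "accuracy": accuracy_score(y_test, wv_s_y_svm)},
    {"features": "one-hot",            "algorithm": "Bayes", "accuracy": accuracy_score(y_test, oh_y_bayes)},
    {"features": "bag of words",       "algorithm": "Bayes", "accuracy": accuracy_score(y_test, bow_y_bayes)},
    {"features": "tf-idf",             "algorithm": "Bayes", "accuracy": accuracy_score(y_test, tf_y_bayes)},
    {"features": "word2vec cbow",      "algorithm": "Bayes", "accuracy": accuracy_score(y_test, wv_c_y_bayes)},
    {"features": "word2vec skip-gram", "algorithm": "Bayes", "accuracy": accuracy_score(y_test, wv_s_y_bayes)},
])
print(results.sort_values("accuracy", ascending=False))
```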
### Undersampling
```
# undersample example
DF_false = DF[DF['humor'] == False]
DF_false.count()
DF_true = DF[DF['humor'] == True]
DF_true.count()
# let's say DF_true is the larger class
DF_true_small = DF_true.sample(n = 10000)
# DF_true_small = DF_true.sample(n = len(DF_false))
DF_true_small.count()
DF_small = pd.concat([DF_false, DF_true_small])
DF_small.count()
# faster alternative: sample the same number of rows from each class in one step
DF_test = DF_small.groupby('humor').sample(10000)
DF_test['humor'].value_counts()
```
## Interpretation
- Use eli5 to determine what predicts each category label.
- Interpret the results by writing a paragraph explaining the output from this package.
- estimator is the fitted model (whatever object you called MODEL.fit(DATA) on)
- feature_names come from the vectorizer used to create the training/testing data (the fitted vectorizer object, not the saved matrix), e.g. VECTORIZER.get_feature_names_out()
- limitation: the Naive Bayes models are not supported
```
eli5.show_weights(estimator = tf_svm, top = 10, feature_names = tf_vectorizer.get_feature_names_out())
```
```
explainer = LimeTextExplainer(class_names = y_train.sort_values().unique())
explainer
from sklearn.pipeline import make_pipeline
# import new data
# use text_clean
# use this pipeline to predict the new instance
# into pipeline goes the functions to transform and model the data
pipeline = make_pipeline(tf_vectorizer, tf_bayes)
pipeline.predict_proba(["this is funny"])
#pipeline.predict(["this is funny"])
exp = explainer.explain_instance(DF['Normalized'][5],
pipeline.predict_proba, num_features=10)
exp.as_pyplot_figure()
plt.show()
exp.save_to_file('example.html')
```
# Chatbots
- OpenSubtitles: https://opus.nlpl.eu/OpenSubtitles-v2018.php
## Find Text
- As a class, let’s train a chatbot on a movie. Pick a set of subtitles for the chatbot.
We are going to use Titanic.
## Training
- After training the chatbot on basic English, use the movie and the list trainer to train the chatbot.
### Libraries
- Be sure to install chatterbot-corpus.
- Use chatterbot version 1.0.0.
- SQLAlchemy error workaround: https://www.reddit.com/r/learnpython/comments/14o9tbw/sqlalchemy_update_error/?rdt=62748
- Update python-dateutil to 2.8.2 in Datalore.
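A minimal install cell matching the version notes above, assuming a notebook environment where shell commands can be run with `!`:
```
# pin the versions noted above (notebook-style shell commands)
!pip install chatterbot==1.0.0 chatterbot-corpus
!pip install python-dateutil==2.8.2
```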
```
# packages
from chatterbot import ChatBot
from chatterbot.trainers import ChatterBotCorpusTrainer
import pysrt
import pandas as pd
from chatterbot.trainers import ListTrainer
```
### Training
```
# start a chatbot
chatbot = ChatBot('Sad Leo')
# Create a new trainer for the chatbot
trainer = ChatterBotCorpusTrainer(chatbot)
```
```
# Train the chatbot based on the english corpus
trainer.train("chatterbot.corpus.english")
# Train based on english greetings corpus
trainer.train("chatterbot.corpus.english.greetings")
# Train based on the english conversations corpus
trainer.train("chatterbot.corpus.english.conversations")
```
### Load Data
```
subs = pysrt.open("titanic.srt")
DF = pd.DataFrame([
    {
        "Text": sub.text,
        "Start": sub.start.seconds,
        "End": sub.end.seconds
    }
    for sub in subs
])
DF.head()
```
### Use data as training
```
conversation = [
    "Hello",
    "Hi there!",
    "How are you doing?",
    "I'm doing great.",
    "That is good to hear",
    "Thank you.",
    "You're welcome."
]
trainer = ListTrainer(chatbot)
trainer.train(conversation)
chatbot.get_response("Hello")
chatbot.get_response("How are you doing?")
```
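The conversation above is only a toy list showing how the ListTrainer works. To actually train on the movie as the assignment asks, a minimal sketch is to feed the subtitle lines from the DF loaded above to the same trainer; the `movie_lines` name is an assumption:
```
# train the list trainer on the subtitle lines from the movie
movie_lines = DF["Text"].to_list()
trainer.train(movie_lines)
```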
## Test the Chatbot
- Using lines from the movie, test the chatbot responses.
ChatterBot is unfortunately flaky on newer Python versions, so if it will not run, just submit what you have.
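A minimal sketch for the test, assuming DF still holds the Titanic subtitles: loop over a few lines and print the bot's reply.
```
# test the chatbot with a few lines from the movie
for line in DF["Text"].head(5):
    print("Line:", line)
    print("Bot: ", chatbot.get_response(line))
```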
## Last Week Random Stuff
```
# imports needed for this example
from urllib.request import urlopen
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
import time

urllist = [
    "https://www.booking.com/articles/japan-summer-festivals.xu.html?label=gen173nr-1FCAEoggI46AdIM1gEaJcCiAEBmAExuAEXyAEP2AEB6AEB-AECiAIBqAIDuAKWoN-nBsACAdICJDJjOTA4MzNiLTllNmItNDNlZS1iM2I0LTE4NGZkMWM3N2VmNtgCBeACAQ&from_articles_widget=1&force_lang=en-us",
    "https://www.booking.com/articles/best-hotels-los-angeles.xu.html?label=gen173nr-1FCAEoggI46AdIM1gEaJcCiAEBmAExuAEXyAEP2AEB6AEB-AECiAIBqAIDuAKWoN-nBsACAdICJDJjOTA4MzNiLTllNmItNDNlZS1iM2I0LTE4NGZkMWM3N2VmNtgCBeACAQ&force_lang=en-us&"
]
sentences = []
# loop over the list
for url in urllist:
    html = urlopen(url).read()
    # clean up the webpage
    soupified = BeautifulSoup(html, "html.parser")
    # start with .get_text()
    clean_text = soupified.get_text()
    sentences.append(sent_tokenize(clean_text))
    # include a pause between requests
    time.sleep(2)
```
If the website requires clicking or expanding content with the mouse, check out Selenium (see the sketch below).
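A minimal Selenium sketch (not part of the original code), assuming Chrome; the URL and the button.expand selector are placeholders, so inspect the real page to find the element you actually need to click:
```
# Selenium sketch for pages that need clicking/expanding before scraping
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time

driver = webdriver.Chrome()  # assumes a Chrome driver is available
driver.get("https://www.example.com")  # placeholder URL
time.sleep(2)  # give the page time to load
# hypothetical: click an "expand" button before grabbing the text
driver.find_element(By.CSS_SELECTOR, "button.expand").click()
clean_text = BeautifulSoup(driver.page_source, "html.parser").get_text()
driver.quit()
```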