## Libraries

```
import pysrt
import pandas as pd
import textacy.preprocessing as tprep
import re
import faiss
from sentence_transformers import SentenceTransformer
import time
import gensim
import nltk  # needed for nltk.word_tokenize in the topic modeling cell
from nltk.corpus import stopwords
from gensim.models import LdaModel
from gensim.corpora import Dictionary
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
from rouge_score import rouge_scorer
```

## Import Data

```
subs = pysrt.open("frozen.srt")

DF = pd.DataFrame([
    {
        "Text": sub.text,
        "Start": sub.start.seconds,
        "End": sub.end.seconds
    }
    for sub in subs])

DF.head()
```

## Clean Up Text

```
def normalize(text):
    text = tprep.normalize.hyphenated_words(text)
    text = tprep.normalize.quotation_marks(text)
    text = tprep.normalize.unicode(text)
    text = tprep.remove.accents(text)
    text = re.sub("\\n", " ", text)
    return text

DF["Normalized"] = DF["Text"].apply(normalize)
DF["Normalized"]
```

## Create Search Engine

```
# Load a pre-trained model
model = SentenceTransformer('msmarco-MiniLM-L-12-v3')

search_data = DF["Normalized"].tolist()

# embed frozen
search_data_embds = model.encode(search_data)

# Create an index using FAISS
index = faiss.IndexFlatL2(search_data_embds.shape[1])
index.add(search_data_embds)
faiss.write_index(index, 'index_search_data')
index = faiss.read_index('index_search_data')

# define a search
def search(query, k):
    t = time.time()
    query_vector = model.encode([query])
    top_k = index.search(query_vector, k)
    print('totaltime: {}'.format(time.time() - t))
    return [search_data[_id] for _id in top_k[1].tolist()[0]]
```

## Search

```
print(search("machine", 5))
print(search("what is this movie about", 5))

# a couple more on your own
# be sure to answer the question from the assignment
```

## Summarization

- Create a human summary of the text.
- Create text summaries using LSA, TextRank, and Topic Modeling (an LSA sketch follows right after this list).
- Assess those summaries using the ROUGE-N analyzer.
- Which summary was the best when compared to the human summary?
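### LSA

The bullet list above asks for an LSA summary as well, but only TextRank and Topic Modeling summaries are built below. Here is a minimal sketch using sumy's `LsaSummarizer`, mirroring the TextRank cell further down; `LSAsummary` is a new variable and can be scored in the Assessment cell with `scorer.score(gold_standard, LSAsummary)`.

```
# LSA summary sketch -- mirrors the TextRank cell below
from sumy.summarizers.lsa import LsaSummarizer

LANGUAGE = "english"
num_summary_sentence = 3

wholetext = " ".join(DF["Normalized"].to_list())
parser = PlaintextParser.from_string(wholetext, Tokenizer(LANGUAGE))

lsa_summarizer = LsaSummarizer(Stemmer(LANGUAGE))
lsa_summarizer.stop_words = get_stop_words(LANGUAGE)

LSAsummary = " ".join(str(sentence) for sentence in lsa_summarizer(parser.document, num_summary_sentence))
print(LSAsummary)
```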
### Topic Modeling

```
# Create a dictionary representation of the documents.
# load the list of documents
processed_sentences = [nltk.word_tokenize(row) for row in DF["Normalized"].to_list()]
dictionary = Dictionary(processed_sentences)

# Filter infrequent or too frequent words.
dictionary.filter_extremes(no_below=10, no_above=0.5)
corpus = [dictionary.doc2bow(summary) for summary in processed_sentences]
```

```
# Train the topic model
LDAmodel = LdaModel(corpus = corpus,
                    id2word = dictionary,
                    iterations = 400,
                    num_topics = 10,
                    random_state = 100,
                    update_every = 1,
                    chunksize = 100,
                    passes = 10,
                    alpha = 'auto',
                    per_word_topics = True)

top_topics = list(LDAmodel.top_topics(corpus))
print(top_topics)
```

```
probs = [LDAmodel.get_document_topics(sentence) for sentence in corpus]

save_probs = []
i = 0
for document in probs:
    for (topic, prob) in document:
        if topic == 0:  # topic 0 is the best representation
            save_probs.append((DF["Normalized"].to_list()[i], prob))
    i = i + 1

DF_two = pd.DataFrame(save_probs, columns = ["sentence", "prob"])

# take the three sentences with the highest probability for topic 0
TMsummary = " ".join(DF_two.sort_values(by = ["prob"], ascending = False)[0:3]['sentence'])
TMsummary
```

### Text Rank

```
# set up
LANGUAGE = "english"
stemmer = Stemmer(LANGUAGE)
wholetext = " ".join(DF['Normalized'].to_list())
num_summary_sentence = 3

# make summary
parser = PlaintextParser.from_string(wholetext, Tokenizer(LANGUAGE))
summarizer = TextRankSummarizer(stemmer)
summarizer.stop_words = get_stop_words(LANGUAGE)

for sentence in summarizer(parser.document, num_summary_sentence):
    print(str(sentence))

TRsummary = ' '.join(str(sentence) for sentence in summarizer(parser.document, num_summary_sentence))
```

### Assessment

```
# human "gold standard" summary
# note: this example text describes The Expendables -- replace it with your own
# human summary of the movie you actually summarized
gold_standard = "Armed with every weapon they can get their hands on, the Expendables are the world's last line of defense and the team that gets called when all other options are off the table."

def print_rouge_score(rouge_score):
    for k, v in rouge_score.items():
        print(k,
              'Precision:', "{:.2f}".format(v.precision),
              'Recall:', "{:.2f}".format(v.recall),
              'fmeasure:', "{:.2f}".format(v.fmeasure))
```

```
scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)

scores = scorer.score(gold_standard, TRsummary)
print_rouge_score(scores)

scores = scorer.score(gold_standard, TMsummary)
print_rouge_score(scores)
```

# Classification

## Libraries

```
# feature extraction section
import pandas as pd
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
nltk.download("punkt")
mystopwords = set(stopwords.words("english"))
from string import punctuation
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

# machine learning part
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
```
## Functions

```
import gensim
from gensim.models import Word2Vec
import numpy as np

# create flattening function: average the word vectors in each document
def document_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index_to_key)

    def average_word_vectors(words, model, vocabulary, num_features):
        feature_vector = np.zeros((num_features,), dtype="float64")
        nwords = 0.
        for word in words:
            if word in vocabulary:
                nwords = nwords + 1.
                feature_vector = np.add(feature_vector, model.wv[word])
        if nwords:
            feature_vector = np.divide(feature_vector, nwords)
        return feature_vector

    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                for tokenized_sentence in corpus]
    return np.array(features)
```

### Algorithm Functions

```
# algorithm functions
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import eli5
from lime.lime_text import LimeTextExplainer
import matplotlib.pyplot as plt
```

## Set up Text

- Do preprocessing on your text to prepare it for the final machine learning model.
- What items do you think will be important to clean in your preprocessing?

```
# be sure to update
DF = pd.read_csv("small_data.csv")
DF.head()
```

```
def text_clean(text):
    # lower case
    text = text.lower()
    # numbers and punctuation
    text = " ".join([word for word in nltk.word_tokenize(text)
                     if not word.isdigit() and word not in punctuation])
    # stop words
    text = " ".join([word for word in nltk.word_tokenize(text)
                     if word not in mystopwords])
    # stemming (applied word by word)
    text = " ".join([stemmer.stem(word) for word in nltk.word_tokenize(text)])
    return text.strip()
```

```
DF['Normalized'] = DF['text'].apply(text_clean)
```

## Create Feature Extractions

- Create a "one-hot" encoding using the count vectorizer and the binary option.
- Create the bag of words encoding using the count vectorizer.
- Create the TF-IDF normalization using the tfidf vectorizer.
- Create two word2vec models:
  - Using 100 dimensions.
  - Using cbow and skipgram embeddings.
  - Using a 5 window size.

## Split the Data

```
# split the data
x_train, x_test, y_train, y_test = train_test_split(DF['Normalized'],
                                                    DF['humor'],
                                                    test_size = .20,
                                                    random_state = 4320)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)
```

### One Hot

```
# one hot matrix
onehot_vectorizer = CountVectorizer(binary=True)

# convert the training data into a words by documents matrix
# only words seen during training will be usable
oh_train = onehot_vectorizer.fit_transform(x_train)

# makes this the same vocabulary and the same size
oh_test = onehot_vectorizer.transform(x_test)

print(oh_train.shape)
print(oh_test.shape)
print(oh_train[0:10])
```

### Bag of Words

```
# bag of words
bow_vectorizer = CountVectorizer()

# convert the training data into a words by documents matrix
# only words seen during training will be usable
bow_train = bow_vectorizer.fit_transform(x_train)

# makes this the same vocabulary and the same size
bow_test = bow_vectorizer.transform(x_test)

print(bow_train.shape)
print(bow_test.shape)
print(bow_train[0:10])
```

### TF-IDF

```
# tf-idf
tf_vectorizer = TfidfVectorizer()

# convert the training data into a words by documents matrix
# only words seen during training will be usable
tf_train = tf_vectorizer.fit_transform(x_train)

# makes this the same vocabulary and the same size
tf_test = tf_vectorizer.transform(x_test)

print(tf_train.shape)
print(tf_test.shape)
print(tf_train[0:10])
```

### Word2vec

```
# word2vec expects tokenized documents (lists of words), not raw strings
x_train_tokens = [nltk.word_tokenize(doc) for doc in x_train]
x_test_tokens = [nltk.word_tokenize(doc) for doc in x_test]

# train the cbow model on the training data only
our_model = Word2Vec(x_train_tokens,
                     vector_size = 100,  # dimensions (100 per the assignment)
                     window = 5,         # window size
                     sg = 0,             # cbow
                     min_count = 1,
                     workers = 4)

# generate averaged word vector features from the word2vec model
avg_wv_train_features = document_vectorizer(corpus = x_train_tokens,
                                            model = our_model,
                                            num_features = 100)

avg_wv_test_features = document_vectorizer(corpus = x_test_tokens,
                                           model = our_model,
                                           num_features = 100)
```
```
# train the skip-gram model on the training data only
our_model_sg = Word2Vec(x_train_tokens,
                        vector_size = 100,  # dimensions (100 per the assignment)
                        window = 5,         # window size
                        sg = 1,             # skip-gram
                        min_count = 1,
                        workers = 4)

# generate averaged word vector features from the word2vec model
avg_wv_train_features_sg = document_vectorizer(corpus = x_train_tokens,
                                               model = our_model_sg,
                                               num_features = 100)

avg_wv_test_features_sg = document_vectorizer(corpus = x_test_tokens,
                                              model = our_model_sg,
                                              num_features = 100)
```

## Classification Models

Classify: Use at least two classification algorithms to predict the outcome of the data.

### SVM

```
oh_svm = LinearSVC()
oh_svm.fit(oh_train, y_train)
oh_y_svm = oh_svm.predict(oh_test)

bow_svm = LinearSVC()
bow_svm.fit(bow_train, y_train)
bow_y_svm = bow_svm.predict(bow_test)

tf_svm = LinearSVC()
tf_svm.fit(tf_train, y_train)
tf_y_svm = tf_svm.predict(tf_test)

wv_c_svm = LinearSVC()
wv_c_svm.fit(avg_wv_train_features, y_train)
wv_c_y_svm = wv_c_svm.predict(avg_wv_test_features)

wv_s_svm = LinearSVC()
wv_s_svm.fit(avg_wv_train_features_sg, y_train)
wv_s_y_svm = wv_s_svm.predict(avg_wv_test_features_sg)
```

### Bayes

```
# bayes
oh_bayes = MultinomialNB()
oh_bayes.fit(oh_train, y_train)
oh_y_bayes = oh_bayes.predict(oh_test)

bow_bayes = MultinomialNB()
bow_bayes.fit(bow_train, y_train)
bow_y_bayes = bow_bayes.predict(bow_test)

tf_bayes = MultinomialNB()
tf_bayes.fit(tf_train, y_train)
tf_y_bayes = tf_bayes.predict(tf_test)
```

- Deal with the negative values in the word2vec output (MultinomialNB cannot take negative features).
- Check the minimum of each matrix, then add a constant to all matrices so the values are non-negative.

```
avg_wv_train_features.min()
avg_wv_test_features.min()
avg_wv_train_features_sg.min()
avg_wv_test_features_sg.min()

avg_wv_train_features = avg_wv_train_features + 1
avg_wv_test_features = avg_wv_test_features + 1
avg_wv_train_features_sg = avg_wv_train_features_sg + 1
avg_wv_test_features_sg = avg_wv_test_features_sg + 1
```

- Then run the algorithm. This is ONLY a Bayes problem (LinearSVC handles negative features fine).

```
wv_c_bayes = MultinomialNB()
wv_c_bayes.fit(avg_wv_train_features, y_train)
wv_c_y_bayes = wv_c_bayes.predict(avg_wv_test_features)

wv_s_bayes = MultinomialNB()
wv_s_bayes.fit(avg_wv_train_features_sg, y_train)
wv_s_y_bayes = wv_s_bayes.predict(avg_wv_test_features_sg)
```

Include the model assessment of these predictions for all models.

```
print(classification_report(y_test, oh_y_svm))
print(classification_report(y_test, bow_y_svm))
print(classification_report(y_test, tf_y_svm))
print(classification_report(y_test, wv_c_y_svm))
print(classification_report(y_test, wv_s_y_svm))

print(classification_report(y_test, oh_y_bayes))
print(classification_report(y_test, bow_y_bayes))
print(classification_report(y_test, tf_y_bayes))
print(classification_report(y_test, wv_c_y_bayes))
print(classification_report(y_test, wv_s_y_bayes))
```

Write a paragraph summarizing the results from your comparisons. What models are best? Are there any general differences/similarities in prediction you see? How well is each category label classified? What might you do to make the model better? When deciding which model is "best", weigh things in this order (a sketch for tabulating the scores side by side appears below this list):

- accuracy first
- precision and recall second
- simpler / computationally faster models are preferred
- if all else is equal, pick your favorite

If you want, you can switch out one of the classification algorithms with a deep learning model.
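To make the write-up easier, here is a hedged sketch that collects the accuracy of every feature set / algorithm pair into one table. It assumes `accuracy_score` from scikit-learn and simply reuses the prediction variables defined above.

```
# collect accuracy for each feature set / algorithm pair into one table
from sklearn.metrics import accuracy_score

predictions = {
    ("one-hot", "SVM"): oh_y_svm,
    ("bag of words", "SVM"): bow_y_svm,
    ("tf-idf", "SVM"): tf_y_svm,
    ("word2vec cbow", "SVM"): wv_c_y_svm,
    ("word2vec skip-gram", "SVM"): wv_s_y_svm,
    ("one-hot", "Bayes"): oh_y_bayes,
    ("bag of words", "Bayes"): bow_y_bayes,
    ("tf-idf", "Bayes"): tf_y_bayes,
    ("word2vec cbow", "Bayes"): wv_c_y_bayes,
    ("word2vec skip-gram", "Bayes"): wv_s_y_bayes,
}

results = pd.DataFrame(
    [(features, algo, accuracy_score(y_test, y_pred))
     for (features, algo), y_pred in predictions.items()],
    columns = ["features", "algorithm", "accuracy"])

print(results.sort_values(by = "accuracy", ascending = False))
```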
### Undersampling

```
# undersample example
DF_false = DF[DF['humor'] == False]
DF_false.count()

DF_true = DF[DF['humor'] == True]
DF_true.count()

# let's say df_true is the "big one"
DF_true_small = DF_true.sample(n = 10000)
# DF_true_small = DF_true.sample(n = len(DF_false))
DF_true_small.count()

DF_small = pd.concat([DF_false, DF_true_small])
DF_small.count()

# faster! use only this code
DF_test = DF_small.groupby('humor').sample(10000)
DF_test['humor'].value_counts()
```

## Interpretation

- Use eli5 to determine what predicts each category label.
- Interpret the results by writing a paragraph explaining the output from this package.
- `estimator` is the fitted model (whatever model you called MODEL.fit(DATA) on).
- `feature_names` come from the vectorizer you used to create the training/testing data (NOT the saved output matrix), i.e. the object you called FUNCTION.fit_transform() on.
- Limitation: Bayes is not supported.

```
eli5.show_weights(estimator = tf_svm,
                  top = 10,
                  feature_names = tf_vectorizer.get_feature_names_out())
```

```
explainer = LimeTextExplainer(class_names = y_train.sort_values().unique())
explainer

from sklearn.pipeline import make_pipeline

# import new data
# use text_clean
# use this pipeline to predict the new instance
# into the pipeline go the functions that transform and model the data
pipeline = make_pipeline(tf_vectorizer, tf_bayes)
pipeline.predict_proba(["this is funny"])
# pipeline.predict(["this is funny"])

exp = explainer.explain_instance(DF['Normalized'][5], pipeline.predict_proba, num_features=10)
exp.as_pyplot_figure()
plt.show()
exp.save_to_file('example.html')
```

# Chatbots

- OpenSubtitles: https://opus.nlpl.eu/OpenSubtitles-v2018.php

## Find Text

- As a class, let's train a chatbot on a movie. Pick a set of subtitles for the chatbot. We are going to use Titanic.

## Training

- After training the chatbot on basic English, use the movie and the list trainer to train the chatbot.

### Libraries

- Be sure to install chatterbot-corpus.
- Use chatterbot version 1.0.0.
- SQLAlchemy error workaround: https://www.reddit.com/r/learnpython/comments/14o9tbw/sqlalchemy_update_error/?rdt=62748
- Update python-dateutil to 2.8.2 in Datalore.

```
# packages
from chatterbot import ChatBot
from chatterbot.trainers import ChatterBotCorpusTrainer
from chatterbot.trainers import ListTrainer
import pysrt
import pandas as pd
```

### Training

```
# start a chatbot
chatbot = ChatBot('Sad Leo')

# Create a new trainer for the chatbot
trainer = ChatterBotCorpusTrainer(chatbot)
```

```
# Train the chatbot based on the english corpus
trainer.train("chatterbot.corpus.english")

# Train based on english greetings corpus
trainer.train("chatterbot.corpus.english.greetings")

# Train based on the english conversations corpus
trainer.train("chatterbot.corpus.english.conversations")
```

### Load Data

```
subs = pysrt.open("titanic.srt")

DF = pd.DataFrame([
    {
        "Text": sub.text,
        "Start": sub.start.seconds,
        "End": sub.end.seconds
    }
    for sub in subs])

DF.head()
```

### Use data as training

```
conversation = [
    "Hello",
    "Hi there!",
    "How are you doing?",
    "I'm doing great.",
    "That is good to hear",
    "Thank you.",
    "You're welcome."
]

trainer = ListTrainer(chatbot)
trainer.train(conversation)

chatbot.get_response("Hello")
chatbot.get_response("How are you doing?")
```

## Test the Chatbot

- Using lines from the movie, test the chatbot responses (a sketch follows below). ChatterBot is grumpy on newer Python versions ... so just submit whatever you get.
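Here is a hedged sketch of one way to do both steps: feed the movie's subtitle lines, in order, to the ListTrainer (ChatterBot treats consecutive list items as a conversation), then probe the bot with lines you remember from the movie. The quotes below are just example prompts.

```
# train on the movie itself: subtitle lines, in order, as one long conversation
movie_lines = DF["Text"].to_list()

trainer = ListTrainer(chatbot)
trainer.train(movie_lines)

# test with lines from the movie
print(chatbot.get_response("I'm the king of the world!"))
print(chatbot.get_response("I want you to draw me like one of your French girls."))
```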
## Last Week Random Stuff

```
from urllib.request import urlopen
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
import time

urllist = [
    "https://www.booking.com/articles/japan-summer-festivals.xu.html?label=gen173nr-1FCAEoggI46AdIM1gEaJcCiAEBmAExuAEXyAEP2AEB6AEB-AECiAIBqAIDuAKWoN-nBsACAdICJDJjOTA4MzNiLTllNmItNDNlZS1iM2I0LTE4NGZkMWM3N2VmNtgCBeACAQ&from_articles_widget=1&force_lang=en-us",
    "https://www.booking.com/articles/best-hotels-los-angeles.xu.html?label=gen173nr-1FCAEoggI46AdIM1gEaJcCiAEBmAExuAEXyAEP2AEB6AEB-AECiAIBqAIDuAKWoN-nBsACAdICJDJjOTA4MzNiLTllNmItNDNlZS1iM2I0LTE4NGZkMWM3N2VmNtgCBeACAQ&force_lang=en-us&"
]

sentences = []

# loop over the list
for url in urllist:
    html = urlopen(url).read()

    # clean up the webpage
    soupified = BeautifulSoup(html, "html.parser")

    # start with .get_text()
    clean_text = soupified.get_text()
    sentences.append(sent_tokenize(clean_text))

    # include a pause between requests (seconds; any small number works)
    time.sleep(2)
```

If the website requires clicking or expanding content "using the mouse", check out Selenium.
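Below is a minimal Selenium sketch for that case. It assumes Chrome with a matching driver is available, and the URL and CSS selector are made-up placeholders; inspect the real page to find the element you actually need to click.

```
# Selenium sketch: render the page in a real browser, click, then scrape as before
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

driver = webdriver.Chrome()
driver.get("https://www.example.com")  # placeholder URL
time.sleep(2)  # give the page time to load

# click something that only expands with the mouse (placeholder selector)
driver.find_element(By.CSS_SELECTOR, "button.show-more").click()
time.sleep(2)

# hand the rendered HTML to BeautifulSoup as before
soupified = BeautifulSoup(driver.page_source, "html.parser")
print(soupified.get_text()[:500])

driver.quit()
```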