# Processing Text

## Libraries
```
from urllib.request import urlopen
from bs4 import BeautifulSoup
import nltk
import pandas as pd
import re

# if using datalore, put this at the top of the notebook
# if using your own computer, just run it in the console once
nltk.download('punkt')

import textacy.preprocessing as tprep

def normalize(text):
    text = tprep.normalize.hyphenated_words(text)
    text = tprep.normalize.quotation_marks(text)
    text = tprep.normalize.unicode(text)
    text = tprep.remove.accents(text)
    text = tprep.replace.phone_numbers(text)
    text = tprep.replace.urls(text)
    text = tprep.replace.emails(text)
    text = tprep.replace.user_handles(text)
    text = tprep.replace.emojis(text)
    return text

from spellchecker import SpellChecker # remember this is the package pyspellchecker
import textacy

# using your own computer
import spacy
nlp = spacy.load("en_core_web_sm")

# datalore
# if you want to use spacy in datalore
import spacy
#%%
import subprocess
#%%
print(subprocess.getoutput("python -m spacy download en_core_web_sm"))

# if you are using your own computer, start here
# import spacy
nlp = spacy.load("en_core_web_sm")

from itertools import chain
from collections import Counter
import networkx as nx
from matplotlib import pyplot as plt
```
## Find Text
As a class, we will find a text source to analyze. This text source will usually be a webpage or other dataset to examine and clean. Import the text into your report.
```
myurl = "https://www.npr.org/2023/11/08/1211487348/ivanka-trump-testimony-fraud-trial"
html = urlopen(myurl).read()
soupified = BeautifulSoup(html, "html.parser")
#soupified
text = soupified.get_text()
text
```
```
only_story = soupified.find("div", {"class": "storytext"})
text = only_story.get_text().strip()
print(text)
```
```
# count the tokens
len(nltk.word_tokenize(text))
```
If the text is one big, long string, first break it into sentence segments and store it in a Pandas DataFrame.
```
DF = pd.DataFrame(nltk.sent_tokenize(text), columns = ["text"])
DF
```
## Fix Errors
Examine the text for errors or problems by inspecting it directly.
- look at the dataset using `View()` or in the notebook output.

Use the “impurity” function from class to examine the text for potential issues.
```
RE_SUSPICIOUS = re.compile(r'[&#<>{}\[\]\\]')
# [] means match any one of the characters inside (e.g., [A-Z] would match any capital letter A through Z)
# here: &, #, <, >, {, }, [, ], \
# why \[ and \]? brackets are special regex syntax, so \ means "no, literally this character"

# proportion of the characters that are suspicious
def impurity(dogs_bark, min_len=10):
    """returns the share of suspicious characters in a text"""
    # the parameter name is arbitrary; what matters is that we use it inside the function
    if dogs_bark is None or len(dogs_bark) < min_len:
        return 0
    else:
        return len(RE_SUSPICIOUS.findall(dogs_bark))/len(dogs_bark)

DF['score'] = DF['text'].apply(impurity)
DF
```
Remove the noise with the regex function. Re-examine the impurity to determine if the data has been mostly cleaned.
- N/A: this text does not have any.

Normalize the rest of the text by using textacy.
```
DF['clean'] = DF['text'].apply(normalize)
DF.head()
```
Examine spelling errors in at least one row of the dataset.
- If you wanted to do the whole dataframe:
  - Create a complete list of unique tokens from the text (paste together all the rows in the DF and then use `set()` to get a unique list)
  - Spell check all of those
  - Create a dictionary `{}` of key-value pairs where the keys are the misspelled words and the values are the corrected words
  - Replace using that dictionary and `re`
```
# create a spellchecker
spell = SpellChecker()

# find the words that may be misspelled
# look at one row of the data (row 8 here)
misspelled = spell.unknown(nltk.word_tokenize(DF['clean'][8]))

for word in misspelled:
    # print the misspelled word
    print(word)
    # get the one `most likely` answer
    print(spell.correction(word))
    # get a list of `likely` options
    print(spell.candidates(word))
```
## Pre-Processing
Using spacy and textacy, pre-process the text to end up with a list of tokenized lists.
```
# create a place to save the data
output = []

# only the tagger and lemmatizer are kept
# nlp is the spacy english language model
for doc in nlp.pipe(DF['clean'].to_list(), disable=["tok2vec", "ner", "parser"]):
    tokens = textacy.extract.words(doc,
                                   filter_stops = True,   # default True, no stopwords
                                   filter_punct = True,   # default True, no punctuation
                                   filter_nums = True,    # default False, no numbers
                                   include_pos = None,    # default None = include all
                                   exclude_pos = None,    # default None = exclude none
                                   min_freq = 1)          # minimum frequency of words
    output.append([str(word) for word in tokens])

output[0:1]
```
Create a frequency table of each of the tokens returned in this output. Below is some example code to get us started.
```
type(output[0][0])

Counter(chain.from_iterable(output))
```
Summary (on your own)

Write a paragraph explaining the process of cleaning data for an NLP pipeline. You should explain the errors you found in the dataset and how you fixed them. Explain the information that is gathered by using spacy and textacy and the final output. What did you learn from your frequency table? What is the text document about?

# Information Extraction

## Libraries
```
# libraries
import pysrt
import pandas as pd
import re

# only datalore
import spacy
#%%
import subprocess
#%%
print(subprocess.getoutput("python -m spacy download en_core_web_sm"))

# everyone
import spacy
nlp = spacy.load('en_core_web_sm')

import textacy
```
## Snorkel Libraries
```
from snorkel.preprocess import preprocessor
from snorkel.types import DataPoint
from snorkel.labeling import labeling_function
from snorkel.labeling import PandasLFApplier
from itertools import combinations
```
## Proposed Text
- Import the text into your report from your proposal.
- If the text is one big, long string, first break it into sentence segments and store it in a Pandas DataFrame.
```
subs = pysrt.open("diehard.srt")

DF = pd.DataFrame([
    {
        "Text": sub.text,
        "Start": sub.start.seconds,
        "End": sub.end.seconds
    } for sub in subs])

DF.head()
```
## Fix Errors
- Examine the text for errors or problems by inspecting it directly.
- Clean the data with examples from class (either impurity or the example provided in the NER section).
```
def remove_html(text):
    # strip any <...> markup tag and the music-note symbol used in the subtitles
    text = re.sub(r'<[^>]*>', ' ', text)
    text = re.sub(r'♪', "", text)
    return(text)

DF['clean'] = DF['Text'].apply(remove_html)
DF.head()
```
## Processing Text Summary
- Write a paragraph explaining the process of cleaning data for your NLP pipeline. You should explain the errors you found in the dataset and how you fixed them. Why did you think these things were important to fix for this project?
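To back up that summary with a number, one option is to re-run the impurity idea from the Processing Text section on the subtitle data, before and after cleaning. A minimal sketch (the check is redefined here because this notebook does not import it; `DF['Text']` and `DF['clean']` come from the code above):
```
import re

# same idea as the impurity() check from the Processing Text section
RE_SUSPICIOUS = re.compile(r'[&#<>{}\[\]\\]')

def impurity(text, min_len=10):
    """Share of suspicious characters in a text."""
    if text is None or len(text) < min_len:
        return 0
    return len(RE_SUSPICIOUS.findall(text)) / len(text)

# average impurity before and after the remove_html cleaning step
print(DF['Text'].apply(impurity).mean())
print(DF['clean'].apply(impurity).mean())
```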
## Part of Speech Tagging
- Tag your data with spacy's part of speech tagger.
- Convert this data into a Pandas DataFrame.
- Use the dataframe to calculate the most common parts of speech.

Since we have a dataframe of sentences, we have to:
- Loop over each row
- Then, on each row, loop over each word
- Use this approach when the structure of the data matters and you want to retain the information about each row
```
spacy_tags = []
for row in DF['clean'].to_list():
    for word in nlp(row):
        spacy_tags.append((str(word), str(word.tag_), str(word.pos_)))

DF_tags = pd.DataFrame(spacy_tags, columns=['word', 'specific', 'universal'])
DF_tags.head()
```
You could also paste everything together into one big text and use one loop:
- Use this approach when the overall big picture is more important
- And the text isn't so large that it crashes your system
```
all_text = " ".join(DF['clean'].to_list())

spacy_pos_tagged = [(str(word), str(word.tag_), str(word.pos_)) for word in nlp(all_text)]
DF_tags = pd.DataFrame(spacy_pos_tagged, columns=['word', 'specific', 'universal'])
DF_tags.head()
```
- What is the most common part of speech? https://github.com/explosion/spaCy/blob/master/spacy/glossary.py
```
DF_tags['universal'].value_counts()
```
- Do you see words that are multiple parts of speech?
```
pd.crosstab(DF_tags['word'], DF_tags['universal'])
```
Remember you can test that things are what you expect them to be:
```
type(DF_tags['word'][0])
```
- What can you learn about the text from examining the most common nouns and verbs?

## KPE
- Use textacy to find the key phrases in your text.
```
# build an english language pipe for textacy
# note the trailing comma so that disable is a tuple, not a plain string
en = textacy.load_spacy_lang("en_core_web_sm", disable=("parser",))

# build a processor for textacy using spacy and process the text
doc = textacy.make_spacy_doc(all_text, lang = en)

# text rank algorithm
[kps for kps, weights in textacy.extract.keyterms.textrank(doc, normalize = "lemma", topn = 5)]
```
- If you get `AttributeError: module 'networkx' has no attribute 'pagerank_scipy'`
  - Install a networkx version below 3.0 (e.g., 2.8.8): `py_install("networkx<3.0")`, or `pip install "networkx<3"` from a terminal
- Using textacy utilities, combine like key phrases.
```
terms = set([term for term, weight in textacy.extract.keyterms.textrank(doc)])
print(textacy.extract.utils.aggregate_term_variants(terms))
```
- What did you learn about your text by using keyphrase analysis?

## NER
- Use spacy to extract named entities.
```
spacy_ner_tagged = [(ent, ent.label_) for ent in nlp(all_text).ents]
DF_tags = pd.DataFrame(spacy_ner_tagged, columns=['word', 'entity'])
DF_tags.head()
```
- Create a summary of your named entities.
```
DF_tags['entity'].value_counts()
```
- Apply Snorkel to your data to show any relationship between names.

### Getting Data into Format
- We need two objects at a time to define a relationship
- So where do two objects appear together?
- First, we should process the dataframe to find every pairwise combination of entities that we are interested in.
```
# PERSON, GPE
# put in the spacy codes for the entity types that you want
ent_types = ["PERSON", "GPE", "NORP", "ORG"]

# empty list to store the pairwise combinations
stored_entities = []

# first get the entities; there must be two for relationship matches
# make a function that grabs them and puts them into a nice format
def get_entities(x):
    """
    Grabs the names using spacy's entity labeler
    """
    # get all the entities in this row
    processed = nlp(x)
    # get the tokens for each sentence
    # list of all the words in the row of data
    tokens = [word.text for word in processed]
    # get all the entities - notice this is only for entity types that we defined
    temp = [(ent.text, ent.label_) for ent in processed.ents if ent.label_ in ent_types]
    # only move on if this row has at least two
    if len(temp) > 1:
        # find all the combinations of pairs
        temp2 = list(combinations(temp, 2))
        # for each pair combination
        for (person1, person2) in temp2:
            # find the names in person 1
            person1_words = [word.text for word in nlp(person1[0])]
            # find the token numbers for person 1
            person1_ids = [i for i, val in enumerate(tokens) if val in person1_words]
            # output in (start, stop) token tuple format
            if len(person1_words) > 1:
                person1_ids2 = tuple(idx for idx in person1_ids)
            else:
                id_1 = [idx for idx in person1_ids]
                person1_ids2 = (id_1[0], id_1[0])
            # do the same thing with person 2
            person2_words = [word.text for word in nlp(person2[0])]
            person2_ids = [i for i, val in enumerate(tokens) if val in person2_words]
            if len(person2_words) > 1:
                person2_ids2 = tuple(idx for idx in person2_ids)
            else:
                id_2 = [idx for idx in person2_ids]
                person2_ids2 = (id_2[0], id_2[0])
            # store all this in a list
            stored_entities.append(
                [x,             # original text
                 tokens,        # tokens
                 person1[0],    # person 1 name
                 person2[0],    # person 2 name
                 person1_ids2,  # person 1 id token tuple
                 person2_ids2   # person 2 id token tuple
                 ])

# run this once - each call appends to stored_entities
DF['clean'].apply(get_entities)

# create dataframe in snorkel structure
DF_dev = pd.DataFrame(stored_entities,
                      columns = ["sentence", "tokens", "person1", "person2",
                                 "person1_word_idx", "person2_word_idx"])
```
### An example of how this works
```
ent_types = ["PERSON", "GPE", "NORP", "ORG"]

processed = nlp("New York is a city, while French is a language, and Germany is a country.")
tokens = [word.text for word in processed]
tokens

temp = [(ent.text, ent.label_) for ent in processed.ents if ent.label_ in ent_types]
#print(temp)
temp2 = list(combinations(temp, 2))
temp2

# for each pair combination
for (person1, person2) in temp2:
    print(person1)
    print(person2)
    # find the names in person 1
    person1_words = [word.text for word in nlp(person1[0])]
    print(person1_words)
    # find the token numbers for person 1
    person1_ids = [i for i, val in enumerate(tokens) if val in person1_words]
    print(person1_ids)
    # output in (start, stop) token tuple format
    if len(person1_words) > 1:
        person1_ids2 = tuple(idx for idx in person1_ids)
        print(person1_ids2)
    else:
        id_1 = [idx for idx in person1_ids]
        person1_ids2 = (id_1[0], id_1[0])
        print(person1_ids2)
    # do the same thing with person 2
    person2_words = [word.text for word in nlp(person2[0])]
    print(person2_words)
    person2_ids = [i for i, val in enumerate(tokens) if val in person2_words]
    print(person2_ids)
    if len(person2_words) > 1:
        person2_ids2 = tuple(idx for idx in person2_ids)
        print(person2_ids2)
    else:
        id_2 = [idx for idx in person2_ids]
        person2_ids2 = (id_2[0], id_2[0])
        print(person2_ids2)

print('end example')
```
### Grab the words in the middle and to the left
- Figure out what the tokens are between the two objects
```
@preprocessor()
def get_text_between(cand: DataPoint) -> DataPoint:
    """
    Returns the text between the two person mentions in the sentence
    """
    start = cand.person1_word_idx[1] + 1
    end = cand.person2_word_idx[0]
    cand.between_tokens = " ".join(cand.tokens[start:end])
    return cand
```
- Figure out the words to the left of each entity
```
@preprocessor()
def get_left_tokens(cand: DataPoint) -> DataPoint:
    """
    Returns tokens in the length 3 window to the left of the person mentions
    """
    # TODO: need to pass window as input params
    window = 3

    end = cand.person1_word_idx[0]
    cand.person1_left_tokens = cand.tokens[0:end][-1 - window : -1]

    end = cand.person2_word_idx[0]
    cand.person2_left_tokens = cand.tokens[0:end][-1 - window : -1]
    return cand
```
### Now Start Using These
- Get the dataframe into the right format to use these functions
```
# check the between words for our objects
characters = {"marry", "married", "divorce", "brother", "sister", "father",
              "brothers", "sisters", "mother", "uncle", "aunt", "grandmother",
              "grandfather", "cousin", "kin"}
# a comma between two entities is treated as a location-style relationship
places = {","}

pos_character = 1
neg_location = -1
no_relation = 0

@labeling_function(resources=dict(characters=characters), pre=[get_text_between])
def between_characters(x, characters):
    # split the between-text into tokens; set() on a raw string would give single characters
    return pos_character if len(characters.intersection(set(x.between_tokens.lower().split()))) > 0 else no_relation

@labeling_function(resources=dict(places=places), pre=[get_text_between])
def between_places(x, places):
    return neg_location if len(places.intersection(set(x.between_tokens.lower().split()))) > 0 else no_relation

@labeling_function(resources=dict(characters=characters), pre=[get_left_tokens])
def left_characters(x, characters):
    if len(set(characters).intersection(set(x.person1_left_tokens))) > 0:
        return pos_character
    elif len(set(characters).intersection(set(x.person2_left_tokens))) > 0:
        return pos_character
    else:
        return no_relation

@labeling_function(resources=dict(places=places), pre=[get_left_tokens])
def left_places(x, places):
    if len(set(places).intersection(set(x.person1_left_tokens))) > 0:
        return neg_location
    elif len(set(places).intersection(set(x.person2_left_tokens))) > 0:
        return neg_location
    else:
        return no_relation
```
```
# list all your labeling functions
lfs = [
    between_characters,
    between_places,
    left_characters,
    left_places
]

# create the apply pipeline
applier = PandasLFApplier(lfs)

# actually apply the labeling functions
L_dev = applier.apply(DF_dev)

L_dev = pd.DataFrame(L_dev, columns=["char1", "place1", "char2", "place2"])
L_dev

DF_combo = pd.concat([DF_dev, L_dev], axis = 1)
DF_combo['character'] = DF_combo['char1'] + DF_combo['char2']
DF_combo['place'] = DF_combo['place1'] + DF_combo['place2']
DF_combo
```
- What kinds of relationships did you explore? Did you find any?

## Knowledge Graphs
- Create a co-occurrence graph of the entities linked together in your text. UPDATE: make this from snorkel!
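Before building the graph, it can also help to check how often each labeling function actually fired. A minimal sketch using snorkel's `LFAnalysis` (assuming the `applier`, `lfs`, and `DF_dev` objects defined above; it re-applies the functions to get the raw label matrix, since `L_dev` was converted to a DataFrame):
```
from snorkel.labeling import LFAnalysis

# raw label matrix: one row per candidate pair, one column per labeling function
L_raw = applier.apply(DF_dev)

# coverage = share of pairs each function labeled; overlaps/conflicts compare the functions
print(LFAnalysis(L=L_raw, lfs=lfs).lf_summary())
```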
``` DF_characters = DF_combo[ DF_combo['character'] > 0 ] DF_locations = DF_combo[ DF_combo['place'] < 0 ] ``` ``` DF_characters = DF_characters[["person1", "person2", "character"]] co_character = DF_characters.groupby(['person1', 'person2']).agg('count').reset_index().rename(columns = {"character": "weight"}) co_character ``` ``` graph = nx.from_pandas_edgelist( co_character[['person1', 'person2', 'weight']], source='person1', target='person2', edge_attr=True) pos = nx.kamada_kawai_layout(graph, weight='weight') _ = plt.figure(figsize=(10, 10)) nx.draw(graph, pos, node_size=1000, node_color='skyblue', alpha=0.8, with_labels = True) plt.title('Graph Visualization', size=15) for (node1,node2,data) in graph.edges(data=True): width = data['weight'] _ = nx.draw_networkx_edges(graph,pos, edgelist=[(node1, node2)], width=width, edge_color='#505050', alpha=0.5) plt.show() plt.close() ``` Summary IE Write a summary of the results from your information extraction. What did you learn about your text? What sort of relationships and entities did you find in the text? What might you consider adding? # Text Summarization ## Libraries ``` from urllib.request import urlopen from bs4 import BeautifulSoup import nltk from nltk.tokenize import BlanklineTokenizer import pandas as pd from sentence_transformers import SentenceTransformer # install faiss-cpu import faiss import time # for summarization from sumy.parsers.plaintext import PlaintextParser from sumy.nlp.tokenizers import Tokenizer from sumy.nlp.stemmers import Stemmer from sumy.utils import get_stop_words from sumy.summarizers.lsa import LsaSummarizer from sumy.summarizers.text_rank import TextRankSummarizer # to work with topic models from nltk.tokenize import word_tokenize from nltk.corpus import stopwords from gensim.models import LdaModel from gensim.corpora import Dictionary import nltk from rouge_score import rouge_scorer def print_rouge_score(rouge_score): for k,v in rouge_score.items(): print (k, 'Precision:', "{:.2f}".format(v.precision), 'Recall:', "{:.2f}".format(v.recall), 'fmeasure:', "{:.2f}".format(v.fmeasure)) #tokenize, remove stopwords, non-alphabetic words, lowercase def preprocess(textstring): stops = set(stopwords.words('english')) tokens = word_tokenize(textstring) return [token.lower() for token in tokens if token.isalpha() and token not in stops] import pyLDAvis import pyLDAvis.gensim_models #don't skip this import matplotlib.pyplot as plt ``` ## Find Text ``` myurl = "https://www.gutenberg.org/files/46/46-h/46-h.htm" html = urlopen(myurl).read() soupified = BeautifulSoup(html, "html.parser") book = soupified.get_text() book[0:200] ``` ## Break Down Text ``` # break down the text paras = BlanklineTokenizer().tokenize(book) # put in DF DF = pd.DataFrame(paras, columns = ['paragraphs']) ``` ## Build the Search Engine ``` # creates the embeddings - this is converting your text into numbers based on the model that you pick for the sentence transformer model = SentenceTransformer('msmarco-MiniLM-L-12-v3') model_embedded = model.encode(paras) #model.save("saved_model") # Create an index using FAISS # figure out which sentence is the best match # you get the number back, so this index tells you which original sentence it was index = faiss.IndexFlatL2(model_embedded.shape[1]) index.add(model_embedded) faiss.write_index(index, 'index_christmas_carol') # you would do the above to set this up and run only once # unless you got new documents # you would also need paragraphs loaded for this to work (paras) # THEN to put the model into 
production, you would only need to load this part
index = faiss.read_index('index_christmas_carol')
#model.load("saved_model")
```
## Define Search Function
```
# define a search
def search(query, k = 5):
    # record the start time
    t = time.time()
    # puts the search into the same vector numbers that the
    # results to show are in (embeds the search in the
    # same way as the original text)
    query_vector = model.encode([query])
    # return the top number of matches
    top_k = index.search(query_vector, k)
    # also show how long this took: time.time() is the current time, t is the start time
    print('totaltime: {}'.format(time.time()-t))
    # return the original text here
    return [paras[_id] for _id in top_k[1].tolist()[0]]
```
## Use the Search Function
```
search("christmas feelings", k = 3)

search("bad time", k = 3)

# add your own
```
- Be sure to answer the question!

## Human Summary
```
# store the human-written reference summary so it can be used for ROUGE scoring later
human_cc = """A Christmas Carol by Charles Dickens is a classic novella that tells the story of Ebenezer Scrooge, a miserly and selfish old man. The narrative unfolds on Christmas Eve when Scrooge is visited by the ghost of his former business partner, Jacob Marley, who warns him of the consequences of his greed and selfishness in the afterlife. Throughout the night, Scrooge is visited by three more ghosts – the Ghost of Christmas Past, the Ghost of Christmas Present, and the Ghost of Christmas Yet to Come (or the Ghost of Christmas Future). These spectral visits take Scrooge on a journey through his own life, revealing the events that shaped his bitter personality and the impact of his actions on those around him. Witnessing scenes of joy and sorrow, Scrooge begins to understand the true meaning of Christmas and the importance of compassion, generosity, and love. Overwhelmed with remorse, he vows to change his ways and embrace the spirit of the holiday. On Christmas morning, Scrooge wakes up a transformed man, filled with newfound kindness and generosity. He becomes a benefactor to those in need, rekindles relationships with his estranged family, and becomes a beloved member of the community. The story serves as a powerful allegory for redemption, emphasizing the transformative power of love and the capacity for change, even in the most hardened hearts."""
```
## LSA Summary
```
num_summary_sentence = 10
LANGUAGE = "english"

stemmer = Stemmer(LANGUAGE)
parser = PlaintextParser.from_string(book, Tokenizer(LANGUAGE))

summarizer = LsaSummarizer(stemmer)
summarizer.stop_words = get_stop_words(LANGUAGE)

lsa_sents = []
for sentence in summarizer(parser.document, num_summary_sentence):
    lsa_sents.append(str(sentence))

lsa_summary = " ".join(lsa_sents)
lsa_summary
```
## Text Rank
```
# parser is the same as lsa
summarizer = TextRankSummarizer(stemmer)
summarizer.stop_words = get_stop_words(LANGUAGE)

tr_sents = []
for sentence in summarizer(parser.document, num_summary_sentence):
    tr_sents.append(str(sentence))

tr_summary = " ".join(tr_sents)
tr_summary
```
## Topic Model
```
# Create a dictionary representation of the documents.
# first create sentences import nltk sentences = nltk.sent_tokenize(book) # preprocess those sentences processed_sentences = [preprocess(sent) for sent in sentences] # create a list of ids to words (0 is cat, 1 is cheese) dictionary = Dictionary(processed_sentences) # create the term by document bag of words corpus = [dictionary.doc2bow(sent) for sent in processed_sentences] # Train the topic model LDAmodel = LdaModel(corpus = corpus, id2word = dictionary, iterations = 400, num_topics = 10, random_state = 5494, update_every = 1, chunksize = 100, passes = 10, alpha = 'auto', per_word_topics = True) # get the probability of each sentence's topics probs = [LDAmodel.get_document_topics(sentence) for sentence in corpus] save_probs = [] i = 0 # this assumes topic 0 is the one you want # get only topic zero probabilities for each sentence for document in probs: for (topic, prob) in document: if topic == 0: save_probs.append((sentences[i], prob)) i = i + 1 # put that into a data frame DF = pd.DataFrame(save_probs, columns = ["sentence", "prob"]) # take the top X probability sentences topic_summary = " ".join(DF.sort_values(by = ["prob"], ascending = False)[0:num_summary_sentence].sentence) topic_summary ``` ## Rouge ``` scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True) lsa_scores = scorer.score(human_cc, lsa_summary) print_rouge_score(lsa_scores) tr_scores = scorer.score(human_cc, tr_summary) print_rouge_score(tr_scores) topic_scores = scorer.score(human_cc, topic_summary) print_rouge_score(topic_scores) ``` ## Topic Visualization ``` vis = pyLDAvis.gensim_models.prepare(LDAmodel, corpus, dictionary, n_jobs = 1) pyLDAvis.save_html(vis, 'LDA_Visualization.html') ##saves the file ``` # Classification ## Libraries ``` import pandas as pd import nltk nltk.download("stopwords") # only once for your own computer, leave it for datalore nltk.download('punkt') from nltk.corpus import stopwords mystopwords = set(stopwords.words("english")) from nltk.stem.porter import PorterStemmer stemmer = PorterStemmer() from string import punctuation import re from sklearn.model_selection import train_test_split def clean_up(text): #text = text.encode("utf-8") text = str(text).lower() text = [str(word) for word in nltk.word_tokenize(str(text)) if word not in mystopwords and word not in punctuation] text = [stemmer.stem(str(word)) for word in text] text = " ".join(text) text = re.sub("'[a-z]", "", text) text = re.sub("'", "", text) text = re.sub("’[a-z]", "", text) text = re.sub("’", "", text) return(text) from sklearn.feature_extraction.text import CountVectorizer from sklearn.feature_extraction.text import TfidfVectorizer import gensim from gensim.models import Word2Vec from sklearn.linear_model import LogisticRegression from sklearn.naive_bayes import MultinomialNB from sklearn.metrics import classification_report from lime.lime_text import LimeTextExplainer import eli5 from matplotlib import pyplot as plt from sklearn.pipeline import make_pipeline import numpy as np def document_vectorizer(corpus, model, num_features): vocabulary = set(model.wv.index_to_key) def average_word_vectors(words, model, vocabulary, num_features): feature_vector = np.zeros((num_features,), dtype="float64") nwords = 0. for word in words: if word in vocabulary: nwords = nwords + 1. 
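# add this word's embedding to the running total (divided by nwords below to get the average)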
                feature_vector = np.add(feature_vector, model.wv[word])
        if nwords:
            feature_vector = np.divide(feature_vector, nwords)
        return feature_vector

    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                for tokenized_sentence in corpus]
    return np.array(features)
```
## Import Text
```
# import the data
DF = pd.read_csv("amazon_alexa.tsv", sep='\t')
DF

# do not want empty or small cells in the DV
DF['rating'].value_counts()
```
### Create a "balanced" dataset
```
# create a balanced dataset
temp5_DF = DF[DF['rating'] == 5].sample(n = 455, random_state=8548)
temp5_DF['rating'].value_counts()

balanced_DF = pd.concat([temp5_DF, DF[DF['rating'] != 5]])
balanced_DF['rating'].value_counts()
```
### Set up Text
- Do preprocessing on your text to prepare it for the final machine learning model.
- What items do you think will be important in your preprocessing to clean?
  1) Unicode
  2) Lower case the data
  3) Remove stop words
  4) Stemming
```
DF = DF[DF['verified_reviews'] != ""]
DF['clean'] = DF['verified_reviews'].apply(clean_up)
DF = DF[DF['clean'] != ""]
DF
```
```
balanced_DF = balanced_DF[balanced_DF['verified_reviews'] != ""]
balanced_DF['clean'] = balanced_DF['verified_reviews'].apply(clean_up)
balanced_DF = balanced_DF[balanced_DF['clean'] != ""]
balanced_DF
```
### Split the data up
```
# on the left hand side of the equals:
# predictor data is X and the classification labels are Y
X_train, X_test, Y_train, Y_test = train_test_split(DF['clean'],            # X values
                                                    DF['rating'],           # Y values
                                                    test_size = 0.2,        # test size
                                                    random_state = 895302,  # random shuffle
                                                    stratify = DF['rating'])

print('Size of Training Data ', X_train.shape[0])
print('Size of Test Data ', X_test.shape[0])
```
```
X_train_b, X_test_b, Y_train_b, Y_test_b = train_test_split(balanced_DF['clean'],   # X values
                                                            balanced_DF['rating'],  # Y values
                                                            test_size = 0.2,        # test size
                                                            random_state = 543543,  # random shuffle
                                                            stratify = balanced_DF['rating'])

print('Size of Training Data ', X_train_b.shape[0])
print('Size of Test Data ', X_test_b.shape[0])
```
## Create Feature Extractions
- Create a “one-hot” encoding using the count vectorizer and binary options.
```
oh_un = CountVectorizer(binary=True)
oh_b = CountVectorizer(binary=True)

# fit on the training text only, then transform both train and test
oh_train_un = oh_un.fit_transform(X_train)
oh_test_un = oh_un.transform(X_test)

oh_train_b = oh_b.fit_transform(X_train_b)
oh_test_b = oh_b.transform(X_test_b)
```
- Create the bag of words encoding using the count vectorizer.
```
bow_un = CountVectorizer()
bow_b = CountVectorizer()

bow_train_un = bow_un.fit_transform(X_train)
bow_test_un = bow_un.transform(X_test)
print(bow_train_un.shape)
print(bow_test_un.shape)

bow_train_b = bow_b.fit_transform(X_train_b)
bow_test_b = bow_b.transform(X_test_b)
print(bow_train_b.shape)
print(bow_test_b.shape)
```
- Create the TF-IDF normalization using the tfidf vectorizer.
```
tf_un = TfidfVectorizer()
tf_b = TfidfVectorizer()

tf_train_un = tf_un.fit_transform(X_train)
tf_test_un = tf_un.transform(X_test)
print(tf_train_un.shape)
print(tf_test_un.shape)

tf_train_b = tf_b.fit_transform(X_train_b)
tf_test_b = tf_b.transform(X_test_b)
print(tf_train_b.shape)
print(tf_test_b.shape)
```
- Create two word2vec models:
  - Using cbow and skipgram embeddings.
  - Using a 5 window size, with a large number of dimensions close to the size of the bag of words.
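One preparatory step before the word2vec blocks below: gensim's `Word2Vec` (and the `document_vectorizer` helper above) expect each document as a list of tokens; if you pass raw strings, every character gets treated as a "word". A small sketch that splits the cleaned reviews on whitespace (the `*_tok` names are my own and are reused in the next code blocks):
```
# tokenize the cleaned text once and reuse the lists for training and for document vectors
X_train_tok = [doc.split() for doc in X_train]
X_test_tok = [doc.split() for doc in X_test]

X_train_b_tok = [doc.split() for doc in X_train_b]
X_test_b_tok = [doc.split() for doc in X_test_b]
```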
```
# word2vec is trained on the tokenized lists created above
model_un_cbow = Word2Vec(X_train_tok,
                         vector_size = 2500, #dimensions
                         window = 5, #window size
                         sg = 0, #cbow
                         min_count = 1,
                         workers = 4)

wv_train_un_cbow = document_vectorizer(X_train_tok, model_un_cbow, 2500)
wv_test_un_cbow = document_vectorizer(X_test_tok, model_un_cbow, 2500)
print(wv_train_un_cbow.shape)
print(wv_test_un_cbow.shape)
```
```
# word 2 vec
model_b_cbow = Word2Vec(X_train_b_tok,
                        vector_size = 2500, #dimensions
                        window = 5, #window size
                        sg = 0, #cbow
                        min_count = 1,
                        workers = 4)

wv_train_b_cbow = document_vectorizer(X_train_b_tok, model_b_cbow, 2500)
wv_test_b_cbow = document_vectorizer(X_test_b_tok, model_b_cbow, 2500)
print(wv_train_b_cbow.shape)
print(wv_test_b_cbow.shape)
```
```
model_un_sg = Word2Vec(X_train_tok,
                       vector_size = 2500, #dimensions
                       window = 5, #window size
                       sg = 1, #sg
                       min_count = 1,
                       workers = 4)

wv_train_un_sg = document_vectorizer(X_train_tok, model_un_sg, 2500)
wv_test_un_sg = document_vectorizer(X_test_tok, model_un_sg, 2500)
print(wv_train_un_sg.shape)
print(wv_test_un_sg.shape)
```
```
model_b_sg = Word2Vec(X_train_b_tok,
                      vector_size = 2500, #dimensions
                      window = 5, #window size
                      sg = 1, #sg
                      min_count = 1,
                      workers = 4)

wv_train_b_sg = document_vectorizer(X_train_b_tok, model_b_sg, 2500)
wv_test_b_sg = document_vectorizer(X_test_b_tok, model_b_sg, 2500)
print(wv_train_b_sg.shape)
print(wv_test_b_sg.shape)
```
## Summary of Pipeline
1) Import the data
2) Clean the data (the predictor text)
3) Split into test-train
4a) Build a blank feature extractor
4b) Extract the feature matrix (`fit_transform`, `transform`)
   a) one hot
   b) bag of words
   c) tf-idf
   d) word2vec (obviously more set up here to get the matrix out)
5a) Build a blank predictor (algorithm)
5b) Fit the training data to the model
6) Assess the model on the test data

# Classification Part 2
- Use at least two classification algorithms to predict the outcome of the data.
- Include the model assessment of these predictions for all models.
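Every block in the next two sections repeats the same fit/predict/report pattern from steps 5a-6 of the pipeline summary. As a reference point, here is a condensed sketch of steps 4-6 for one combination (tf-idf plus logistic regression), assuming `X_train`, `X_test`, `Y_train`, and `Y_test` from the split above:
```
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

vec = TfidfVectorizer()                # 4a blank feature extractor
X_tr = vec.fit_transform(X_train)      # 4b learn the vocabulary from the training text only
X_te = vec.transform(X_test)           #    reuse that vocabulary on the test text

clf = LogisticRegression(max_iter = 1000)   # 5a blank predictor
clf.fit(X_tr, Y_train)                      # 5b fit to the training data
print(classification_report(y_true = Y_test, y_pred = clf.predict(X_te)))  # 6 assess
```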
## Log Unbalanced ``` # logistic regression unbalanced data # one hot logreg = LogisticRegression() # blank model logreg.fit(oh_train_un, Y_train) # fit to the data (X, Y) y_log = logreg.predict(oh_test_un) # predict test data (X) print(classification_report(y_true=Y_test, y_pred=y_log)) # print the report ``` ``` # bow logreg = LogisticRegression() # blank model logreg.fit(bow_train_un, Y_train) # fit to the data (X, Y) y_log = logreg.predict(bow_test_un) # predict test data (X) print(classification_report(y_true=Y_test, y_pred=y_log)) # print the report ``` ``` # tfidf logreg = LogisticRegression() # blank model logreg.fit(tf_train_un, Y_train) # fit to the data (X, Y) y_log = logreg.predict(tf_test_un) # predict test data (X) print(classification_report(y_true=Y_test, y_pred=y_log)) # print the report ``` - Models that NEVER predict one of the categories are not good ``` # w2v cbow logreg = LogisticRegression() # blank model logreg.fit(wv_train_un_cbow, Y_train) # fit to the data (X, Y) y_log = logreg.predict(wv_test_un_cbow) # predict test data (X) print(classification_report(y_true=Y_test, y_pred=y_log)) # print the report ``` ``` # w2v skip gram logreg = LogisticRegression() # blank model logreg.fit(wv_train_un_sg, Y_train) # fit to the data (X, Y) y_log = logreg.predict(wv_test_un_sg) # predict test data (X) print(classification_report(y_true=Y_test, y_pred=y_log)) # print the report ``` ## Log Balances ``` # log balanced data # one hot logreg = LogisticRegression() # blank model logreg.fit(oh_train_b, Y_train_b) # fit to the data (X, Y) y_log = logreg.predict(oh_test_b) # predict test data (X) print(classification_report(y_true=Y_test_b, y_pred=y_log)) # print the report ``` ``` # bow logreg = LogisticRegression() # blank model logreg.fit(bow_train_b, Y_train_b) # fit to the data (X, Y) y_log = logreg.predict(bow_test_b) # predict test data (X) print(classification_report(y_true=Y_test_b, y_pred=y_log)) # print the report ``` ``` # tfidf logreg = LogisticRegression() # blank model logreg.fit(tf_train_b, Y_train_b) # fit to the data (X, Y) y_log = logreg.predict(tf_test_b) # predict test data (X) print(classification_report(y_true=Y_test_b, y_pred=y_log)) # print the report ``` ``` # wv cbow logreg = LogisticRegression() # blank model logreg.fit(wv_train_b_cbow, Y_train_b) # fit to the data (X, Y) y_log = logreg.predict(wv_test_b_cbow) # predict test data (X) print(classification_report(y_true=Y_test_b, y_pred=y_log)) # print the report ``` ``` # wv sg logreg = LogisticRegression() # blank model logreg.fit(wv_train_b_sg, Y_train_b) # fit to the data (X, Y) y_log = logreg.predict(wv_test_b_sg) # predict test data (X) print(classification_report(y_true=Y_test_b, y_pred=y_log)) # print the report ``` ## Bayes Unbalanced ``` # bayes unbalanced data # one hot nb = MultinomialNB() # blank model nb.fit(oh_train_un, Y_train) # fit to the data (X, Y) y_nb = nb.predict(oh_test_un) # predict test data (X) print(classification_report(y_true=Y_test, y_pred=y_nb)) # print the report ``` ``` # bow nb = MultinomialNB() # blank model nb.fit(bow_train_un, Y_train) # fit to the data (X, Y) y_nb = nb.predict(bow_test_un) # predict test data (X) print(classification_report(y_true=Y_test, y_pred=y_nb)) # print the report ``` ``` # tfidf nb = MultinomialNB() # blank model nb.fit(tf_train_un, Y_train) # fit to the data (X, Y) y_nb = nb.predict(tf_test_un) # predict test data (X) print(classification_report(y_true=Y_test, y_pred=y_nb)) # print the report ``` ## Issues with Bayes - ValueError: Negative 
values in data passed to MultinomialNB (input X)
- TF-IDF, BOW, and one-hot matrices only contain non-negative values
- W2V does not; it can have negative values
- Multinomial Bayes will not accept negative values
- Simple solution: add a constant (big enough to cover the minimum) to the matrix
```
print(wv_test_b_cbow.min())
print(wv_test_b_sg.min())
print(wv_train_b_cbow.min())
print(wv_train_b_sg.min())

print(wv_test_un_cbow.min())
print(wv_test_un_sg.min())
print(wv_train_un_cbow.min())
print(wv_train_un_sg.min())
```
```
wv_test_b_cbow = wv_test_b_cbow + 1
wv_test_un_cbow = wv_test_un_cbow + 1
wv_train_b_cbow = wv_train_b_cbow + 1
wv_train_un_cbow = wv_train_un_cbow + 1

wv_test_b_sg = wv_test_b_sg + 1
wv_test_un_sg = wv_test_un_sg + 1
wv_train_b_sg = wv_train_b_sg + 1
wv_train_un_sg = wv_train_un_sg + 1
```
```
# w2v cbow
nb = MultinomialNB() # blank model
nb.fit(wv_train_un_cbow, Y_train) # fit to the data (X, Y)
y_nb = nb.predict(wv_test_un_cbow) # predict test data (X)
print(classification_report(y_true=Y_test, y_pred=y_nb)) # print the report
```
```
# w2v skip gram
nb = MultinomialNB() # blank model
nb.fit(wv_train_un_sg, Y_train) # fit to the data (X, Y)
y_nb = nb.predict(wv_test_un_sg) # predict test data (X)
print(classification_report(y_true=Y_test, y_pred=y_nb)) # print the report
```
### Bayes Balanced Data
```
# Bayes balanced data
# one hot
nb = MultinomialNB() # blank model
nb.fit(oh_train_b, Y_train_b) # fit to the data (X, Y)
y_nb = nb.predict(oh_test_b) # predict test data (X)
print(classification_report(y_true=Y_test_b, y_pred=y_nb)) # print the report
```
```
# bow
nb = MultinomialNB() # blank model
nb.fit(bow_train_b, Y_train_b) # fit to the data (X, Y)
y_nb = nb.predict(bow_test_b) # predict test data (X)
print(classification_report(y_true=Y_test_b, y_pred=y_nb)) # print the report
```
```
# tfidf
nb = MultinomialNB() # blank model
nb.fit(tf_train_b, Y_train_b) # fit to the data (X, Y)
y_nb = nb.predict(tf_test_b) # predict test data (X)
print(classification_report(y_true=Y_test_b, y_pred=y_nb)) # print the report
```
```
# wv cbow
nb = MultinomialNB() # blank model
nb.fit(wv_train_b_cbow, Y_train_b) # fit to the data (X, Y)
y_nb = nb.predict(wv_test_b_cbow) # predict test data (X)
print(classification_report(y_true=Y_test_b, y_pred=y_nb)) # print the report
```
```
# wv sg
nb = MultinomialNB() # blank model
nb.fit(wv_train_b_sg, Y_train_b) # fit to the data (X, Y)
y_nb = nb.predict(wv_test_b_sg) # predict test data (X)
print(classification_report(y_true=Y_test_b, y_pred=y_nb)) # print the report
```
## General Criteria
1) Make sure all categories are at least predicted (watch for the zero-division warning)
2) Look for the higher accuracies
3) Then look at each individual F1 score and pick the model that does best on the categories you care about --> dependent on goals

Write a paragraph summarizing the results from your comparisons. What models are best? Are there any general differences/similarities in prediction you see? How well is each category label classified? What might you do to make the model better?
ANSWER THE QUESTION ## Examine the results - Take the best model - Examine "really right" and "really wrong" ``` # bow logreg = LogisticRegression() # blank model logreg.fit(bow_train_un, Y_train) # fit to the data (X, Y) y_log = logreg.predict(bow_test_un) # predict test data (X) print(classification_report(y_true=Y_test, y_pred=y_log)) # print the report ``` ``` # get literal probabilities probs = pd.DataFrame(logreg.predict_proba(bow_test_un), columns = ["p1", "p2", "p3", "p4", "p5"]) # add original answers probs['answer'] = Y_test.reset_index(drop = True) # add the original text probs['text'] = X_test.reset_index(drop=True) # add predicted answer probs['predicted_cat'] = y_log probs right_answers = probs[probs['answer'] == probs['predicted_cat']] wrong_answers = probs[probs['answer'] != probs['predicted_cat']] wrong_answers.reset_index() ``` ## Lime ``` # build a blank explainer model explainer = LimeTextExplainer(class_names = Y_test.sort_values().unique()) # build a pipeline for predicting instances pipeline = make_pipeline(bow_un, logreg) exp = explainer.explain_instance(wrong_answers.iloc[90]["text"], pipeline.predict_proba, num_features=10) exp.as_pyplot_figure() plt.show() exp.save_to_file('example.html') ``` ``` eli5.show_weights(estimator = logreg, top = 20, feature_names = bow_un.get_feature_names_out()) ``` Interpret the results by writing a paragraph explaining the output from this package. ANSWER THIS QUESTION # Chatbots - Actions.py: can we set up python to run what we want and then integrate into actions.py ``` import pandas as pd DF = pd.read_csv("recommend_db.csv") user_input = "drama" user_input = user_input.lower() DF['recommend'][DF['category'] == user_input].to_list()[0] ``` NLU file ``` version: "3.1" nlu: - intent: greet examples: | - hey - hello - hi - hello there - good morning - good evening - moin - hey there - let's go - hey dude - goodmorning - goodevening - good afternoon - intent: goodbye examples: | - cu - good by - cee you later - good night - bye - goodbye - have a nice day - see you around - bye bye - see you later - intent: affirm examples: | - yes - y - indeed - of course - that sounds good - correct - intent: deny examples: | - no - n - never - I don't think so - don't like that - no way - not really - intent: mood_great examples: | - perfect - great - amazing - feeling like a king - wonderful - I am feeling very good - I am great - I am amazing - I am going to save the world - super stoked - extremely good - so so perfect - so good - so perfect - intent: mood_unhappy examples: | - my day was horrible - I am sad - I don't feel very well - I am disappointed - super sad - I'm so sad - sad - very sad - unhappy - not good - not very good - extremly sad - so saad - so sad - intent: bot_challenge examples: | - are you a bot? - are you a human? - am I talking to a bot? - am I talking to a human? - intent: get_recommend examples: | - recommend a movie - I need a recommendation - What should I watch? - Please recommend something to watch - What should I watch? 
      - I am bored
  - intent: recommend_type
    examples: |
      - [drama](user_input)
      - [reality](user_input)
      - [sci-fi](user_input)
      - [romcom](user_input)
      - [anime](user_input)
      - [crime](user_input)
  - intent: thanks
    examples: |
      - thank you
      - thank you very much
      - thanks
      - great, thanks
```
DOMAIN
```
version: "3.1"

intents:
  - greet
  - goodbye
  - affirm
  - deny
  - mood_great
  - mood_unhappy
  - bot_challenge
  - get_recommend
  - recommend_type
  - thanks

entities:
  - user_input

slots:
  user_input:
    type: text
    mappings:
      - type: from_entity
        entity: user_input

actions:
  - action_get_recommendation

responses:
  utter_greet:
    - text: "Hey! How are you?"
  utter_cheer_up:
    - text: "Here is something to cheer you up:"
      image: "https://i.imgur.com/nGF1K8f.jpg"
  utter_did_that_help:
    - text: "Did that help you?"
  utter_happy:
    - text: "Great, carry on!"
  utter_goodbye:
    - text: "Bye"
  utter_iamabot:
    - text: "I am a bot, powered by Rasa."
  utter_ask_type:
    - text: "What type of recommendation would you like? Please pick from drama, reality, sci-fi, romcom, anime, or crime."
  utter_look_up:
    - text: "Please wait while I look up a recommendation for {user_input} for you!"
  utter_you_are_welcome:
    - text: "You are welcome ^^"

session_config:
  session_expiration_time: 60
  carry_over_slots_to_new_session: true
```
STORY
```
version: "3.1"

stories:
- story: happy path
  steps:
  - intent: greet
  - action: utter_greet
  - intent: mood_great
  - action: utter_happy

- story: sad path 1
  steps:
  - intent: greet
  - action: utter_greet
  - intent: mood_unhappy
  - action: utter_cheer_up
  - action: utter_did_that_help
  - intent: affirm
  - action: utter_happy

- story: sad path 2
  steps:
  - intent: greet
  - action: utter_greet
  - intent: mood_unhappy
  - action: utter_cheer_up
  - action: utter_did_that_help
  - intent: deny
  - action: utter_goodbye

- story: ask for a recommendation
  steps:
  - intent: greet
  - action: utter_greet
  - intent: get_recommend
  - action: utter_ask_type
  - intent: recommend_type
  - action: utter_look_up
  - action: action_get_recommendation
  - intent: thanks
  - action: utter_you_are_welcome
```
ACTIONS
```
# This file contains your custom actions which can be used to run
# custom Python code.
#
# See this guide on how to implement these actions:
# https://rasa.com/docs/rasa/custom-actions

# This is an example of a custom action that returns a recommendation

from typing import Any, Text, Dict, List

from rasa_sdk import Action, Tracker
from rasa_sdk.executor import CollectingDispatcher

import pandas as pd

DF = pd.read_csv("recommend_db.csv")


class ActionRecommendation(Action):

    def name(self) -> Text:
        return "action_get_recommendation"

    def run(self, dispatcher: CollectingDispatcher,
            tracker: Tracker,
            domain: Dict[Text, Any]) -> List[Dict[Text, Any]]:

        user_input = tracker.get_slot('user_input')

        try:
            user_input = user_input.lower()
            # first recommendation that matches the requested category
            filter_output = DF['recommend'][DF['category'] == user_input].to_list()[0]
            if len(filter_output) > 0:
                output = f"I would recommend {filter_output} as a good {user_input} option."
            else:
                output = "Please only use the categories listed."
        except:
            output = 'An unexpected error has occurred.'

        dispatcher.utter_message(text=output)

        return []
```
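The custom action assumes `recommend_db.csv` contains at least a `category` column and a `recommend` column. A minimal stand-in for testing the lookup logic outside of Rasa (the rows below are made-up placeholders, not the actual class data):
```
import pandas as pd

# made-up stand-in with the two columns the custom action relies on
example_db = pd.DataFrame({
    "category": ["drama", "reality", "sci-fi", "romcom", "anime", "crime"],
    "recommend": ["placeholder drama title", "placeholder reality title",
                  "placeholder sci-fi title", "placeholder romcom title",
                  "placeholder anime title", "placeholder crime title"],
})

# the same lookup ActionRecommendation performs for a given slot value
user_input = "anime"
print(example_db['recommend'][example_db['category'] == user_input].to_list()[0])
```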