# ANLY 520 Late Spring

# Preprocessing Text

## Libraries

```
import urllib.request
import pandas as pd
from bs4 import BeautifulSoup

import nltk
nltk.download('punkt')

import re
RE_SUSPICIOUS = re.compile(r'[&#<>{}\[\]\\]')

def impurity(text, min_len=10):
    """returns the share of suspicious characters in a text"""
    if text is None or len(text) < min_len:
        return 0
    else:
        return len(RE_SUSPICIOUS.findall(text)) / len(text)

import textacy.preprocessing as tprep
import spacy

def normalize(text):
    text = tprep.normalize.hyphenated_words(text)
    text = tprep.normalize.quotation_marks(text)
    text = tprep.normalize.unicode(text)
    text = tprep.remove.accents(text)
    text = tprep.replace.phone_numbers(text)
    text = tprep.replace.urls(text)
    text = tprep.replace.emails(text)
    text = tprep.replace.user_handles(text)
    text = tprep.replace.emojis(text)
    text = text.lower()
    return text

from spellchecker import SpellChecker
spell = SpellChecker()

import textacy
import spacy

# only cloud stuff
#%%
import subprocess
#%%
print(subprocess.getoutput("python -m spacy download en_core_web_sm"))

# back to everyone
nlp = spacy.load("en_core_web_sm")

from itertools import chain
from collections import Counter
```

- Regex notes:
  - Anything inside `[]` can be included.
  - `[a-z]` would match any lowercase alpha character.
  - `[&#<>{}\[\]\\]` matches any of the following characters:
    - & # < > { }
    - [ ] and \ but these are special characters, so you have to escape them with `\`.

## Find Text

- As a class, we will find a text source to analyze. This text source usually will consist of a webpage or other dataset to examine and clean.
- Import the text into your report.

```
url = "https://www.npr.org/2024/03/17/1239056873/devil-comet-solar-eclipse-12p-pons-brooks?utm_source=pocket-newtab-en-us"
html = urllib.request.urlopen(url).read()
type(html)
html[0:100]
```

```
# clean out the html, get only the text
soupified = BeautifulSoup(html, "html.parser") # start simple
type(soupified)

data = soupified.get_text().strip()
type(data)
print(data[0:100])
```

- If the text is one big long string, first break it into sentence segments and store them in a pandas DataFrame.

```
# make this a data frame
DF = pd.DataFrame(
    nltk.sent_tokenize(data),
    columns = ["text"]
)
DF.head()
```

## Fix Errors

- Examine the text for errors or problems by looking at it. Not a whole lot wrong here.
- Use the "impurity" function from class to examine the text for potential issues.

```
DF['impurity'] = DF['text'].apply(impurity)
DF
```

- Remove the noise with the regex function. Not necessary here.
- Re-examine the impurity to determine if the data has been mostly cleaned. Not necessary here.
- Normalize the rest of the text by using textacy.

```
DF['normalize'] = DF['text'].apply(normalize)
```

- Examine spelling errors in at least one row of the dataset.

```
# find those words that may be misspelled
misspelled = spell.unknown(nltk.word_tokenize(DF['normalize'].iloc[1])) # or DF['normalize'][0]

for word in misspelled:
    # what is the word
    print(word)
    # get the one `most likely` answer
    print(spell.correction(word))
    # get a list of `likely` options
    print(spell.candidates(word))
    print("----\n")
```

- How would I really use this? (a small sketch follows this list)
  - Find all the tokens in the text with `nltk.word_tokenize` on the whole text.
  - Create a dictionary of commonly misspelled words and their fixes.
  - Use the `re` package to find and replace those common words and apply their fixes.
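A minimal sketch of that idea, reusing `re` and the `data` string from above; the fix dictionary here is made up for illustration and would come from the spell checker output in practice.

```
# hypothetical fix dictionary -- build yours from the misspelled words found above
common_fixes = {"teh": "the", "recieve": "receive"}

fixed_text = data
for wrong, right in common_fixes.items():
    # \b word boundaries keep us from replacing inside longer words
    fixed_text = re.sub(r"\b" + wrong + r"\b", right, fixed_text)
```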
## Pre-Processing

- Using spacy and textacy, pre-process the text to end up with a list of tokenized lists.
- A list of tokenized lists is more useful than a pandas DataFrame here because we don't know how long each sentence will be.
  - Pandas requires things to be rectangular, so that won't work.

```
# need a spot to store stuff
output = []

# only the tagger and lemmatizer
for doc in nlp.pipe(DF['normalize'].to_list(), disable=["tok2vec", "ner", "parser"]):
    tokens = textacy.extract.words(doc,
                                   filter_stops = True,  # default True, no stopwords
                                   filter_punct = True,  # default True, no punctuation
                                   filter_nums = True,   # default False, no numbers
                                   include_pos = None,   # default None = include all
                                   exclude_pos = None,   # default None = exclude none
                                   min_freq = 1)         # minimum frequency of words
    output.append([str(pizza) for pizza in tokens]) # close output append
```

```
print(type(output))       # whole thing is a list
print(type(output[0]))    # each item is a list
print(type(output[0][1])) # each item within the list is a str
# spacy tokens are ANNOYING
```

- Create a frequency table of each of the tokens returned in this output. Below is some example code to get us started.

```
count_dictionary = Counter(chain.from_iterable(output))
# example of a dictionary of {'key': value} pairs
count_dictionary.most_common(10)
# by using most_common, we get back a list [] of tuples (),
# so count_dictionary.most_common(10)[0][1] is the count of the top token;
# something like list[0](1) is not possible
```

## Summary (on your own)

Write a paragraph explaining the process of cleaning data for an NLP pipeline. You should explain the errors you found in the dataset and how you fixed them. Explain the information that is gathered by using spacy and textacy and the final output. What did you learn from your frequency table? What is the text document about?

# Proposal of Text

```
# show how many tokens something is
import nltk

# take the big long string of the document
# before it's converted into pandas
type(data)
len(nltk.word_tokenize(data))
```

# Information Extraction

## Libraries

```
import pysrt
import pandas as pd
import spacy
import textacy

# only for datalore
import subprocess
print(subprocess.getoutput("python -m spacy download en_core_web_sm"))

# everyone
nlp = spacy.load("en_core_web_sm")

import re
def clean_up(text):
    text = re.sub('</?[a-z]>', ' ', text)
    text = re.sub('♪', ' ', text)
    return text

from snorkel.preprocess import preprocessor
from snorkel.types import DataPoint
from snorkel.labeling import labeling_function
from snorkel.labeling import PandasLFApplier

from itertools import combinations
import math
from tqdm import tqdm

import networkx as nx
from matplotlib import pyplot as plt
```

## Import and Clean

```
# import and clean
subs = pysrt.open("barbie.srt")

DF = pd.DataFrame([
    {
        "Text": sub.text,
        "Start": sub.start.seconds,
        "End": sub.end.seconds
    } for sub in subs])

DF.head()
```

```
# cleaning
DF['clean'] = DF['Text'].apply(clean_up)
DF.head()
```

## Part of Speech Tagging

- Tag your data with spacy's part of speech tagger.
- Convert this data into a pandas DataFrame.

```
# part of speech tagging
whole_text = " ".join(DF['clean'].to_list())

spacy_pos_tagged = [(str(word), word.tag_, word.pos_) for word in nlp(whole_text)]

DF_tags = pd.DataFrame(spacy_pos_tagged,
                       columns = ['token', 'specific_pos', 'universal_pos'])
DF_tags
```

- Use the dataframe to calculate the most common parts of speech.

```
DF_tags['universal_pos'].value_counts()
```

- Use the dataframe to calculate if words are considered more than one part of speech (crosstabs or groupby).
```
# print(type(DF_tags['token'].iloc[4])) # way to check types

DF_cross_tabs = pd.crosstab(DF_tags['token'], DF_tags['universal_pos'])
DF_cross_tabs['total_tags'] = DF_cross_tabs.astype(bool).sum(axis=1)
DF_cross_tabs.sort_values('total_tags', ascending=False).head(10)
```

- What is the most common part of speech?
- Do you see words that are multiple parts of speech?

BE SURE TO ANSWER THE QUESTIONS

## KPE

- Use textacy to find the key phrases in your text.
- Using textacy utilities, combine like key phrases.

```
# py_install("networkx < 3.0", pip = T) # use with textacy version 0.12.0 and earlier
# or pip install "networkx < 3.0" for other setups
# textacy 0.13.0 requires python 3.9

# KPE
# build an english language pipe for textacy
en = textacy.load_spacy_lang("en_core_web_sm", disable=("parser",))

# build a processor for textacy using spacy and process the text
doc = textacy.make_spacy_doc(whole_text, lang = en)

# text rank algorithm
[kps for kps, weights in textacy.extract.keyterms.textrank(doc, normalize = "lemma", topn = 5)]
```

```
terms = set([term for term, weight in textacy.extract.keyterms.textrank(doc)])
print(textacy.extract.utils.aggregate_term_variants(terms))
```

- Do the outputs make sense given your text? ANSWER THIS QUESTION

## NER

- Use spacy to extract named entities.
- Create a summary of your named entities.

```
# ner using spacy
spacy_ner_tagged = [(str(ent), ent.label_) for ent in nlp(whole_text).ents]

DF_ner = pd.DataFrame(spacy_ner_tagged, columns = ['token', 'entity'])
DF_ner.head()

DF_ner['entity'].value_counts()
```

This is where you would improve spacy's model using the example code from the lecture ("Process with spacy NER" is where this starts in the notes). You would add the new entities, merging, co-reference resolution, etc. here before using it in the snorkel pipeline. This part would improve the extraction of named entities, do some named entity disambiguation, and create the best co-occurrence pairs for use in snorkel.

- Apply Snorkel to your data to show any relationship between names.
- What might you do to improve the default NER extraction?
1) Figure out if the text pieces even have pairs of entities to label together:
   - getting pairs of entities
   - getting the original text, the tokenized text, the entity names, and the start/stop token for each entity

```
# snorkel
# create an empty spot to save the data
stored_entities = []

# first get the entities, must be two for relationship matches
# create a function to grab the entities so we can use apply
def get_entities(x):
    """ Grabs the names using spacy's entity labeler """
    # get all the entities in this row
    processed = nlp(x)
    # get the tokens for each sentence
    tokens = [word.text for word in processed]
    # get all the entities
    temp = [(str(ent), ent.label_) for ent in processed.ents if ent.label_ != ""]

    # only move on if this row has at least two
    if len(temp) > 1:
        # finds all the combinations of pairs
        temp2 = list(combinations(temp, 2))

        # for each pair combination
        for (entity1, entity2) in temp2:
            # find the words for entity 1
            entity1_words = [word.text for word in nlp(entity1[0])]
            # find the token numbers for entity 1
            entity1_ids = [i for i, val in enumerate(tokens) if val in entity1_words]
            if len(entity1_words) > 1:
                entity1_ids2 = tuple(idx for idx in entity1_ids[0:2])
            else:
                id_1 = [idx for idx in entity1_ids]
                entity1_ids2 = (id_1[0], id_1[0])

            # do the same thing with entity 2
            entity2_words = [word.text for word in nlp(entity2[0])]
            entity2_ids = [i for i, val in enumerate(tokens) if val in entity2_words[0:2]]
            if len(entity2_words) > 1:
                entity2_ids2 = tuple(idx for idx in entity2_ids[0:2])
            else:
                id_2 = [idx for idx in entity2_ids]
                entity2_ids2 = (id_2[0], id_2[0])

            # store all this in a list
            stored_entities.append(
                [x,             # original text
                 tokens,        # tokens
                 entity1[0],    # entity 1 name
                 entity2[0],    # entity 2 name
                 entity1_ids2,  # entity 1 id token tuple
                 entity2_ids2   # entity 2 id token tuple
                 ])

# get_entities("Barbie and Ken went to the store, and Malibu bought a car.")
# list [] of tuples () with two items 0, 1
# each of those tuples is a tuple of two items (), 0, 1
# where 0 is the word/token, 1 is the entity
```

2) Pick a level you want to do this at - you need either sentences or paragraphs.

3) Apply the function to get the pairwise combinations of entities.

```
# create an empty spot to save the data
stored_entities = []
DF['clean'].apply(get_entities)

# create dataframe in snorkel structure
DF_dev = pd.DataFrame(stored_entities,
                      columns = ["sentence", "tokens", "entity1", "entity2",
                                 "entity1_word_idx", "entity2_word_idx"])
```

```
DF_dev
```

4) What do we want to use to label relationships?

```
# get words between the data points
@preprocessor()
def get_text_between(cand: DataPoint) -> DataPoint:
    """
    Returns the text between the two person mentions in the sentence
    """
    start = cand.entity1_word_idx[1] + 1
    end = cand.entity2_word_idx[0]
    cand.between_tokens = cand.tokens[start:end]
    return cand

# get words next to the data points
@preprocessor()
def get_left_tokens(cand: DataPoint) -> DataPoint:
    """
    Returns tokens in the length 5 window to the left of the person mentions
    """
    # TODO: need to pass window as input params
    window = 5

    end = cand.entity1_word_idx[0]
    cand.entity1_left_tokens = cand.tokens[0:end][-1 - window : -1]

    end = cand.entity2_word_idx[0]
    cand.entity2_left_tokens = cand.tokens[0:end][-1 - window : -1]
    return cand
```

5) What are the clues in those markers that I can use to label my relations?
```
# make this part up with words that are clues
friend = {"Hi", "!", "friend", "thanks", "Thanks", "hi", "and"}

# this part is important
is_friend = 1
not_friend = 0

@labeling_function(resources=dict(friend=friend), pre=[get_text_between])
def between_friend(x, friend):
    return is_friend if len(friend.intersection(set(x.between_tokens))) > 0 else not_friend

@labeling_function(resources=dict(friend=friend), pre=[get_left_tokens])
def left_friend(x, friend):
    if len(friend.intersection(set(x.entity1_left_tokens))) > 0:
        return is_friend
    elif len(friend.intersection(set(x.entity2_left_tokens))) > 0:
        return is_friend
    else:
        return not_friend
```

6) What do we get back? And how can we improve?

```
# create a list of labeling functions (lfs) to run
lfs = [between_friend, left_friend]

# build the applier function
applier = PandasLFApplier(lfs)

# run it on the dataset
L_dev = applier.apply(DF_dev)
L_dev
```

Combine with the original data to see how we might improve our results.

```
L_dev = pd.DataFrame(L_dev, columns = ["between_friend", "left_friend"])
L_dev

DF_combo = pd.concat([DF_dev, L_dev], axis = 1) # axis = 1 means by column
DF_combo
```

Combine them together to have a friend (1 or 2) and not friend (0) label.

```
DF_combo['friend'] = DF_combo['left_friend'] + DF_combo['between_friend']
DF_combo
```

### Knowledge Graph

Create a co-occurrence dataframe - basically sum up the number of times our pairs of combinations occurred. We also want to exclude rows where entity1 == entity2.

```
# entities cannot be equal
DF_combo = DF_combo[DF_combo['entity1'] != DF_combo['entity2']]

# only our friend options
DF_friend = DF_combo[DF_combo['friend'] > 0]

# only our not friend options
DF_not = DF_combo[DF_combo['friend'] == 0]

print(DF_friend.shape)
print(DF_not.shape)
```

```
# create the frequency
cooc_df_not = DF_not[['entity1', 'entity2']]\
    .groupby(['entity1', 'entity2'])\
    .size()\
    .reset_index(name='freq')
cooc_df_not

cooc_df_friend = DF_friend[['entity1', 'entity2']]\
    .groupby(['entity1', 'entity2'])\
    .size()\
    .reset_index(name='freq')
cooc_df_friend
```

```
graph = nx.from_pandas_edgelist(
    cooc_df_friend[['entity1', 'entity2', 'freq']] \
        .rename(columns={'freq': 'weight'}),
    source='entity1', target='entity2', edge_attr=True)

pos = nx.kamada_kawai_layout(graph, weight='weight')

_ = plt.figure(figsize=(20, 20))
nx.draw(graph, pos,
        node_size=1000, node_color='skyblue', alpha=0.8,
        with_labels = True, font_size = 10)
plt.title('Graph Visualization', size=15)

for (node1, node2, data) in graph.edges(data=True):
    width = data['weight']
    _ = nx.draw_networkx_edges(graph, pos,
                               edgelist=[(node1, node2)],
                               width=width,
                               edge_color='#505050', alpha=0.5)
plt.show()
```

```
graph = nx.from_pandas_edgelist(
    cooc_df_not[['entity1', 'entity2', 'freq']] \
        .rename(columns={'freq': 'weight'}),
    source='entity1', target='entity2', edge_attr=True)

pos = nx.kamada_kawai_layout(graph, weight='weight')

_ = plt.figure(figsize=(20, 20))
nx.draw(graph, pos,
        node_size=1000, node_color='skyblue', alpha=0.8,
        with_labels = True, font_size = 10)
plt.title('Graph Visualization', size=15)

for (node1, node2, data) in graph.edges(data=True):
    width = data['weight']
    _ = nx.draw_networkx_edges(graph, pos,
                               edgelist=[(node1, node2)],
                               width=width,
                               edge_color='#505050', alpha=0.5)
plt.show()
```

## How to Do Spacy + Snorkel together

Example spacy pipeline addition:

- Based on the chosen text, add entities to a default spacy model (entity_ruler).
- Add norm_entity, merge_entity, and init_coref pipeline components.
- Update and add the alias lookup if necessary for the data.
- Add the name resolver pipeline.
- Create a co-occurrence graph of the entities linked together in your text.

### Entity ruler - adding entities

```
# we want to update the spacy pipeline before we use it in processed = nlp(x) in this function
from spacy.pipeline import EntityRuler

incorrect_tags = ["One", "Trinity", "Morpheus"]

# a dictionary with label and pattern makes up the rules
patterns = [{"label": "PERSON", # whatever spacy calls it
             # put in the regex
             "pattern": [{"TEXT": "The", "OP": "?"}, # The One, but the "The" is optional
                         {"TEXT": {"IN": incorrect_tags}}
                         ]
             }
            ]

print(patterns)

ruler = nlp.add_pipe('entity_ruler', before='ner')
ruler.add_patterns(patterns)
```

### Norm entities - not much to do here

```
from spacy.tokens import Span
from spacy import Language

@Language.component("norm_entities")
def norm_entities(doc):
    ents = []
    for ent in doc.ents:
        if ent[0].pos_ == "DET": # leading article
            ent = Span(doc, ent.start+1, ent.end, label=ent.label)
        if len(ent) > 0:
            if ent[-1].pos_ == "PART": # trailing particle like 's
                ent = Span(doc, ent.start, ent.end-1, label=ent.label)
            ents.append(ent)
    doc.ents = tuple(ents)
    return doc

nlp.add_pipe('norm_entities')
```

### Merge entities - do not need to do much here

```
from spacy.pipeline import merge_entities

if nlp.has_pipe('merge_entities'):
    _ = nlp.remove_pipe('merge_entities')

nlp.add_pipe('merge_entities')
```

### Initial Coreference

Here we need to change the entity types we are interested in.

```
# not in book, but useful if you modify the extension
from spacy.tokens import Token

if Token.has_extension('ref_n'):
    _ = Token.remove_extension('ref_n')
if Token.has_extension('ref_t'):
    _ = Token.remove_extension('ref_t')
if Token.has_extension('ref_t_'):
    _ = Token.remove_extension('ref_t_')

Token.set_extension('ref_n', default='') # ref name
Token.set_extension('ref_t', default='') # ref type

@Language.component("init_coref")
def init_coref(doc):
    for e in doc.ents:
        if e.label_ in ['PERSON']:
            e[0]._.ref_n, e[0]._.ref_t = e.text, e.label_
    return doc

nlp.add_pipe("init_coref")
```

### Name Resolver

Here we deal with upper case versus lower case, and then things in parentheses if necessary.

```
def name_match(m1, m2):
    m2 = re.sub(r'[()\.]', '', m2).lower() # ignore parentheses and dots, and lower case
    # so if m1 and m2 are both the same word after lower casing, return true
    m1 = m1.lower()
    m2 = r'\b' + m2 + r'\b' # \b marks a word boundary
    m2 = re.sub(r'\s+', r'\\b.*\\b', m2)
    return re.search(m2, m1, flags=re.I) is not None

@Language.component("propagate_ent_type")
def propagate_ent_type(doc):
    """propagate entity type stored in ref_t"""
    ents = []
    for e in doc.ents:
        if e[0]._.ref_n != '': # if e is a coreference
            e = Span(doc, e.start, e.end, label=e[0]._.ref_t)
        ents.append(e)
    doc.ents = tuple(ents)
    return doc

@Language.component("name_resolver")
def name_resolver(doc):
    """create name-based reference to e1 as primary mention of e2"""
    ents = [e for e in doc.ents if e.label_ in ['PERSON']]
    for i, e1 in enumerate(ents):
        for e2 in ents[i+1:]:
            if name_match(e1[0]._.ref_n, e2[0].text):
                e2[0]._.ref_n = e1[0]._.ref_n
                e2[0]._.ref_t = e1[0]._.ref_t
    return propagate_ent_type(doc)

nlp.add_pipe('name_resolver')
```

```
# first get the entities, must be two for relationship matches
# (same function as step 3 earlier) create a function to grab the entities so we can use apply later
def get_entities(x):
    """ Grabs the names using spacy's entity labeler """
    # get all the entities in this row
    processed = nlp(x)
    # get the tokens for each sentence
    tokens = [word.text for word in processed]
    # get all the entities
    temp = [(str(ent), ent.label_) for ent in processed.ents if ent.label_ != ""]

    # only move on if this row has at least two
    if len(temp) > 1:
        # finds all the combinations of pairs
        temp2 = list(combinations(temp, 2))

        # for each pair combination
        for (person1, person2) in temp2:
            # find the words in person 1
            person1_words = [word.text for word in nlp(person1[0])]
            # find the token numbers for person 1
            person1_ids = [i for i, val in enumerate(tokens) if val in person1_words]
            # output in (start, stop) token tuple format
            if len(person1_words) > 1:
                person1_ids2 = tuple(idx for idx in person1_ids[0:2])
            else:
                id_1 = [idx for idx in person1_ids]
                person1_ids2 = (id_1[0], id_1[0])

            # do the same thing with person 2
            person2_words = [word.text for word in nlp(person2[0])]
            person2_ids = [i for i, val in enumerate(tokens) if val in person2_words[0:2]]
            if len(person2_words) > 1:
                person2_ids2 = tuple(idx for idx in person2_ids[0:2])
            else:
                id_2 = [idx for idx in person2_ids]
                person2_ids2 = (id_2[0], id_2[0])

            # store all this in a list
            stored_entities.append(
                [x,             # original text
                 tokens,        # tokens
                 person1[0],    # person 1 name
                 person2[0],    # person 2 name
                 person1_ids2,  # person 1 id token tuple
                 person2_ids2   # person 2 id token tuple
                 ])

DF.columns
```

```
# create an empty spot to save the data
stored_entities = []
DF['clean'].apply(get_entities)

# create dataframe in snorkel structure
DF_dev = pd.DataFrame(stored_entities,
                      columns = ["sentence", "tokens", "entity1", "entity2",
                                 "entity1_word_idx", "entity2_word_idx"])
DF_dev
```

# Text Summarization

## Libraries

```{python}
import pandas as pd
import PyPDF2
import re
import nltk

from sentence_transformers import SentenceTransformer
import faiss
import time

# define a search that uses the faiss `index` built below
def search(query, model, text_list, k=5):
    t = time.time()
    query_vector = model.encode([query])
    top_k = index.search(query_vector, k)
    print('totaltime: {}'.format(time.time() - t))
    return [text_list[_id] for _id in top_k[1].tolist()[0]]

from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
from sumy.summarizers.lsa import LsaSummarizer

LANGUAGE = "english"
stemmer = Stemmer(LANGUAGE)

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim.models import LdaModel
from gensim.corpora import Dictionary
import pyLDAvis
import pyLDAvis.gensim_models # don't skip this
import matplotlib.pyplot as plt

def preprocess(textstring):
    stops = set(stopwords.words('english'))
    tokens = word_tokenize(textstring)
    return [token.lower() for token in tokens if token.isalpha() and token not in stops]

from rouge_score import rouge_scorer

def print_rouge_score(rouge_score):
    for k, v in rouge_score.items():
        print(k,
              'Precision:', "{:.2f}".format(v.precision),
              'Recall:', "{:.2f}".format(v.recall),
              'fmeasure:', "{:.2f}".format(v.fmeasure))
```

## Find Text

- Wireland Ranch episode 1 podcast
- Import the text
- Break into sentences
- Put into pandas
- See if we need to clean

```
# creating a pdf file object
pdfFileObj = open('wireland_ranch.pdf', 'rb')

# creating a pdf reader object
pdfReader = PyPDF2.PdfReader(pdfFileObj)

# printing number of pages in pdf file
len(pdfReader.pages)

# creating a page object
pageObj = pdfReader.pages

# extracting text from each page
# loop here to get it all
text = []
for page in pageObj:
    page = re.sub("\\n", " ", page.extract_text())
    text.append(page)

print(text[0])
```

```
text = ' '.join(text)
sentences = nltk.sent_tokenize(text)
len(sentences)

DF = pd.DataFrame(sentences, columns = ['sentence'])
DF.head()
```

## Create A Search Engine

- Using each sentence as your "documents", create a search engine to find specific pieces of text.

```
# only need to run this thing once, and once it is
# saved, you can "turn off" the chunk using eval = F in
# Rstudio, or change the code type to markdown to save
# the code for yourself in datalore but not run it

# Load a pre-trained model
model = SentenceTransformer('msmarco-MiniLM-L-12-v3')

wireland_embed = model.encode(DF['sentence'].to_list()) # same as sentences, but helps to have a DF in case you needed to do other cleaning

# Create an index using FAISS
index = faiss.IndexFlatL2(wireland_embed.shape[1])
index.add(wireland_embed)
faiss.write_index(index, 'index_wireland_reviews')
```

Search for several items.

```
# read in the index later when you need to use this again
index = faiss.read_index('index_wireland_reviews')

# you do have to have the model open too
model = SentenceTransformer('msmarco-MiniLM-L-12-v3')

search("overseer", model, DF['sentence'].to_list(), 5)
search("crime", model, DF['sentence'].to_list(), 5)
search("delivery", model, DF['sentence'].to_list(), 5)
```

Examine the results and comment on how well you think the search engine worked. ANSWER THIS QUESTION

## Create Text Summaries

Create a human summary of the text.

```
human_summary = "A crumbling shack in the Mojave desert houses the heartbeat at the center of the universe. Long ago, the heart was diminished to a parasite when new spoiled gods built of the more distasteful human energies usurped the throne and began their own type of reign. They battled and bickered and those arguments translated to our world in the form of tragedies, mass rituals, and monied black magic. Now, it seems that history is coming to a head and our spoiled gods are fighting harder than they have ever fought before. Dead in the center of all of this are two humans. One an unwitting delivery driver turned host for the parasitic heart. The other a disgraced drug addicted cop who went searching for the driver on behalf of his family. Everything else, we will discover together."

chatgpt_summary = "Episode 1: The Return of the Overseer follows the story of a delivery driver who receives an unusual order that takes him to Reynold’s Limited Curiosities, a mysterious shop. The driver encounters a strange, sentient desk in the shop, which is inhabited by a sphinx-like creature. Despite warnings from an enigmatic voice, the driver approaches the desk and is attacked by the creature, only to be saved by multicolored worms that emerge from the light fixtures. The creature is destroyed, and a glowing figure appears before the driver, guiding him to a room filled with jars containing strange contents. The figure disappears, leaving the driver bewildered. He quickly grabs his delivery and flees the shop, experiencing strange phenomena and feeling disconnected from reality."
```

- Create text summaries using LSA, TextRank, and Topic Modeling.
### Text Rank

```
num_summary_sentences = 5

# be sure to put in one big long string
# this will parse things into sentences for summarization
parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))

# builds a summarizer with a stemmer (which grabs english from above)
summarizer = TextRankSummarizer(stemmer)

# add the stops for the language we set (english)
summarizer.stop_words = get_stop_words(LANGUAGE)

tr_sum = []
for sentence in summarizer(parser.document, num_summary_sentences):
    tr_sum.append(str(sentence))

tr_sum = " ".join(tr_sum)
tr_sum
```

### LSA

```
summarizer = LsaSummarizer(stemmer)
summarizer.stop_words = get_stop_words(LANGUAGE)

lsa_sum = []
for sentence in summarizer(parser.document, num_summary_sentences):
    lsa_sum.append(str(sentence))

lsa_sum = " ".join(lsa_sum)
lsa_sum
```

### Topic Modeling

```
# remember all the stuff from earlier that was loaded
# Create a dictionary representation of the documents.
# use our list of sentences from earlier
processed_sentences = [preprocess(sent) for sent in sentences]

# create the vocabulary list
dictionary = Dictionary(processed_sentences)

# convert to a term by document matrix
corpus = [dictionary.doc2bow(sent) for sent in processed_sentences]

# Train the topic model
LDAmodel = LdaModel(corpus = corpus,
                    id2word = dictionary,
                    iterations = 400,
                    num_topics = 10,
                    random_state = 100,
                    update_every = 1,
                    chunksize = 100,
                    passes = 10,
                    alpha = 'auto',
                    per_word_topics = True)

probs = [LDAmodel.get_document_topics(sentence) for sentence in corpus]

save_probs = []
i = 0 # looping variable
for document in probs:
    for (topic, prob) in document:
        if topic == 0: # this is topic zero, but you can pick another one
            save_probs.append((sentences[i], prob))
    i = i + 1

DF = pd.DataFrame(save_probs, columns = ["sentence", "prob"])

topic_sum = " ".join(DF.sort_values(by = ["prob"], ascending = False)[0:num_summary_sentences].sentence)
topic_sum
```

- Assess those summaries using the Rouge-N analyzer.

```
# build a blank scorer
scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)

# add the gold standard and summary you want to compare
# scores = scorer.score(gold_standard, summary)
# print the scores
# print_rouge_score(scores)

# compare to the overall podcast paragraph
print_rouge_score(scorer.score(human_summary, tr_sum))
print_rouge_score(scorer.score(human_summary, lsa_sum))
print_rouge_score(scorer.score(human_summary, topic_sum))

# compare to chat gpt
print_rouge_score(scorer.score(chatgpt_summary, tr_sum))
print_rouge_score(scorer.score(chatgpt_summary, lsa_sum))
print_rouge_score(scorer.score(chatgpt_summary, topic_sum))
```

- Which summary was the best when compared to the human summary? ANSWER THIS QUESTION
- Visualization of topic models.

```
vis = pyLDAvis.gensim_models.prepare(LDAmodel, corpus, dictionary, n_jobs = 1)
pyLDAvis.save_html(vis, 'LDA_Visualization.html') ## saves the file
```

# Classification

## Libraries

```
import pysrt
import pandas as pd

import nltk
# nltk.download("stopwords") # only the first time
# nltk.download("punkt")
from nltk.corpus import stopwords
mystopwords = set(stopwords.words("english"))

import spacy
import subprocess
print(subprocess.getoutput("python -m spacy download en_core_web_sm"))
nlp = spacy.load("en_core_web_sm")

import contractions

# cleaning function
def data_clean(text):
    text = text.lower() # lower case
    # punctuation ?
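    # one way to answer the "punctuation ?" question above (an assumption, not from the original notes):
    # remove punctuation with a regex before tokenizing, for example
    # text = re.sub(r"[^\w\s']", " ", text)   # would also need `import re` added with the libraries above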
    # contractions
    text = contractions.fix(text)
    # stop words
    text = " ".join([word for word in nltk.word_tokenize(text) if word not in mystopwords])
    # lemmatization
    temp = nlp(text)
    text = " ".join([word.lemma_ for word in temp])
    return(text)

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from gensim.models import Word2Vec

# our flattening function from last time
import numpy as np

def document_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index_to_key)

    def average_word_vectors(words, model, vocabulary, num_features):
        feature_vector = np.zeros((num_features,), dtype="float64")
        nwords = 0.
        for word in words:
            if word in vocabulary:
                nwords = nwords + 1.
                feature_vector = np.add(feature_vector, model.wv[word])
        if nwords:
            feature_vector = np.divide(feature_vector, nwords)
        return feature_vector

    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                for tokenized_sentence in corpus]
    return np.array(features)

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

import eli5
from lime.lime_text import LimeTextExplainer
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
```

## Data

```
subs = pysrt.open("airbud.srt")

DF = pd.DataFrame([
    {
        "Text": sub.text,
        "Start": sub.start.seconds,
        "End": sub.end.seconds
    } for sub in subs])

DF.head()

feel_DF = pd.read_csv("Emotion_classify_Data.csv")
```

## Set up Text

- Do preprocessing on your text to prepare it for the final machine learning model.

```
DF['clean'] = DF['Text'].apply(data_clean)
feel_DF['clean'] = feel_DF['review'].apply(data_clean)
```

- What items do you think will be important in your preprocessing to clean? ANSWER THIS QUESTION

```
DF.to_csv("airbud_process.csv")
feel_DF.to_csv("imdb_process.csv")
```

```
DF = pd.read_csv("airbud_process.csv")
feel_DF = pd.read_csv("imdb_process.csv")
```

## Split the Modeling Data

```
# you can have multiple arguments on the left side of a =
X_train, X_test, Y_train, Y_test = train_test_split(feel_DF['clean'],     # X values
                                                    feel_DF['sentiment'], # Y values
                                                    test_size = 0.2,      # test size
                                                    random_state = 89543, # random shuffle
                                                    stratify = feel_DF['sentiment'])

print(X_train.head())
print(Y_train.head())
```

## Create Feature Extractions

- Create a "one-hot" encoding using the count vectorizer and binary options.

```
# count vectorizer - one hot
# create a blank extractor
one_hot = CountVectorizer(binary=True)

# fit the data to it
oh_train = one_hot.fit_transform(X_train)

# transform the second dataset so it matches the fit_transform vocab
oh_test = one_hot.transform(X_test)

print(oh_train.shape)
print(oh_test.shape)
```

- Create the bag of words encoding using the count vectorizer.

```
# create a blank extractor
bow = CountVectorizer()

# fit the data to it
bow_train = bow.fit_transform(X_train)

# transform the second dataset so it matches the fit_transform vocab
bow_test = bow.transform(X_test)

print(bow_train.shape)
print(bow_test.shape)
```

- Create the TF-IDF normalization using the tfidf vectorizer.
```
# create a blank extractor
tfidf = TfidfVectorizer()

# fit the data to it
tf_train = tfidf.fit_transform(X_train)

# transform the second dataset so it matches the fit_transform vocab
tf_test = tfidf.transform(X_test)

print(tf_train.shape)
print(tf_test.shape)
```

- Create two word2vec models:
  - Use a large number of dimensions that matches your tfidf.
  - Use cbow and skipgram embeddings.
  - Use a window size of 5.

```
# Word2Vec expects tokenized sentences (a list of token lists), not raw strings
X_train_tokens = [nltk.word_tokenize(doc) for doc in X_train]
X_test_tokens = [nltk.word_tokenize(doc) for doc in X_test]

wv_c = Word2Vec(X_train_tokens,
                vector_size = 500, # dimensions
                window = 5,        # window size
                sg = 0,            # cbow
                min_count = 1,
                workers = 4)

# generate averaged word vector features from the word2vec model
wv_c_train = document_vectorizer(corpus = X_train_tokens, model = wv_c, num_features = 500)

# generate averaged word vector features from the word2vec model
wv_c_test = document_vectorizer(corpus = X_test_tokens, model = wv_c, num_features = 500)

print(wv_c_train.shape)
print(wv_c_test.shape)
```

```
wv_s = Word2Vec(X_train_tokens,
                vector_size = 500, # dimensions
                window = 5,        # window size
                sg = 1,            # skip-gram
                min_count = 1,
                workers = 4)

# generate averaged word vector features from the word2vec model
wv_s_train = document_vectorizer(corpus = X_train_tokens, model = wv_s, num_features = 500)

# generate averaged word vector features from the word2vec model
wv_s_test = document_vectorizer(corpus = X_test_tokens, model = wv_s, num_features = 500)

print(wv_s_train.shape)
print(wv_s_test.shape)
```

## What is the OOV?

```
# original vocabulary
train_vocab = set(one_hot.get_feature_names_out())

# build a separate vectorizer so we don't overwrite oh_test from above
one_hot2 = CountVectorizer(binary=True)
oh_test2 = one_hot2.fit_transform(X_test)
test_vocab = set(one_hot2.get_feature_names_out())

# words in the test vocabulary that the training vocabulary doesn't have
len(test_vocab.difference(train_vocab))
# test_vocab.difference(train_vocab)
```

## Logistic Regression

```
# log regression
# build a blank model
logreg = LogisticRegression(max_iter = 10000)

# fit the training data to the model
logreg.fit(oh_train, Y_train)

# predict test cases
y_pred = logreg.predict(oh_test)

# compare predictions to the actuals
print(classification_report(y_true = Y_test, y_pred = y_pred))
```

```{python}
# log regression
# build a blank model
logreg = LogisticRegression(max_iter = 10000)

# fit the training data to the model
logreg.fit(bow_train, Y_train)

# predict test cases
y_pred = logreg.predict(bow_test)

# compare predictions to the actuals
print(classification_report(y_true = Y_test, y_pred = y_pred))
```

```{python}
# log regression
# build a blank model
logreg = LogisticRegression(max_iter = 10000)

# fit the training data to the model
logreg.fit(tf_train, Y_train)

# predict test cases
y_pred = logreg.predict(tf_test)

# compare predictions to the actuals
print(classification_report(y_true = Y_test, y_pred = y_pred))
```

```
# log regression
# build a blank model
logreg = LogisticRegression(max_iter = 10000)

# fit the training data to the model
logreg.fit(wv_c_train, Y_train)

# predict test cases
y_pred = logreg.predict(wv_c_test)

# compare predictions to the actuals
print(classification_report(y_true = Y_test, y_pred = y_pred))
```

```
# log regression
# build a blank model
logreg = LogisticRegression(max_iter = 10000)

# fit the training data to the model
logreg.fit(wv_s_train, Y_train)

# predict test cases
y_pred = logreg.predict(wv_s_test)

# compare predictions to the actuals
print(classification_report(y_true = Y_test, y_pred = y_pred))
```

## Naive Bayes

```{python}
# bayes
# build a blank model
nb = MultinomialNB()

# fit the training data to the model
nb.fit(oh_train, Y_train)

# predict test cases
y_pred = nb.predict(oh_test)

# compare predictions to the actuals
print(classification_report(y_true = Y_test, y_pred = y_pred))
```

```{python}
# bayes
# build a blank model
nb = MultinomialNB()

# fit the training data to the model
nb.fit(bow_train, Y_train)

# predict test cases
y_pred = nb.predict(bow_test)

# compare predictions to the actuals
print(classification_report(y_true = Y_test, y_pred = y_pred))
```

```{python}
# bayes
# build a blank model
nb = MultinomialNB()
nb.fit(tf_train, Y_train)

# predict test cases
y_pred = nb.predict(tf_test)

# compare predictions to the actuals
print(classification_report(y_true = Y_test, y_pred = y_pred))
```

```{python}
# no negative predictors allowed in naive Bayes
wv_c_train.min()
wv_c_test.min()
wv_s_train.min()
wv_s_test.min()

wv_c_train = wv_c_train + 1
wv_c_test = wv_c_test + 1
wv_s_train = wv_s_train + 1
wv_s_test = wv_s_test + 1
```

```{python}
# bayes
# build a blank model
nb = MultinomialNB()

# fit the training data to the model
nb.fit(wv_c_train, Y_train)

# predict test cases
y_pred = nb.predict(wv_c_test)

# compare predictions to the actuals
print(classification_report(y_true = Y_test, y_pred = y_pred))
```

```{python}
# bayes
# build a blank model
nb = MultinomialNB()

# fit the training data to the model
nb.fit(wv_s_train, Y_train)

# predict test cases
y_pred = nb.predict(wv_s_test)

# compare predictions to the actuals
print(classification_report(y_true = Y_test, y_pred = y_pred))
```

## Lime

### The final model

```{python}
# build a blank model
logreg = LogisticRegression(max_iter = 10000)

# fit the training data to the model
logreg.fit(tf_train, Y_train)

# predict test cases
y_pred = logreg.predict(tf_test)

# compare predictions to the actuals
print(classification_report(y_true = Y_test, y_pred = y_pred))
```

### Apply to new instances

```{python}
# PUT IN THE PIPELINE: feature extractor, final algorithm model
pipeline = make_pipeline(tfidf, logreg)

# build a blank explainer
explainer = LimeTextExplainer(class_names = Y_train.sort_values().unique())

id_value = 3

# first argument is the text
exp = explainer.explain_instance(feel_DF.iloc[id_value]['clean'],
                                 # predict_proba only works on certain models
                                 pipeline.predict_proba,
                                 num_features=10)

exp.as_pyplot_figure()
plt.show()

exp.save_to_file('example.html')
```

## eli5

```{python}
eli5.show_weights(estimator = logreg,
                  top = 10,
                  feature_names = tfidf.get_feature_names_out())
```

## Apply to airbud

```{python}
# use the pipeline to predict your new data (that's been cleaned)
DF['answer'] = pipeline.predict(DF['clean'].to_list())

# review the output
DF['answer'].value_counts()
```

## Notes

- We often have data we want to predict but no labels.
- So we found a dataset with labels to build our model.
- Then we apply the verified/tested model to the data that we actually want to predict.

## Pipeline

1) Text cleaning - you should have a function that does the cleaning, so it's always the same for each dataset (both the training/modeling data and the to-be-predicted data).
2) Apply that text cleaning to both the modeling data (imdb) and the to-be-predicted data (airbud) separately.
   a) If you have a big dataset, consider processing and saving the processed output, so you save time.
   b) Then you would only reload the processed data, not having to do steps 1 and 2 again.
3) Split the modeling data into testing and training --> build a model and then test it on new data to make sure it generalizes.
   a) Most of the data goes into training - you want to get all the vocab words into the model (if a word is only in the testing set, it is not included in the model).
4) Build our feature extractions.
   a) We can't use our predictor variable at the moment --> it's text ... it needs to be numbers, so let's convert it to numbers.
   b) Build a blank extractor, fit_transform to build the vocabulary and the train data, and then transform to convert the test data to the same shape (vocabulary size).
5) Build the model that combines the algorithm with the feature extraction.
   a) Build our blank model.
   b) Fit the training data to the model.
   c) Apply that model to the testing data.
   d) Examine the results (classification report).
6) Pick the model that works best.
   a) Look at accuracy overall.
   b) Look at recall and precision - or the F1 score for each group.
   c) Models that do NOT predict a specific category (i.e., you get the zero division error or f1 = 0) are not good, don't use them.
7) Apply that model to our new data (see the sketch after this list).
   a) Use the pipeline feature to put together the extraction and algorithm.
   b) Look at the final value counts for your prediction and interpret.
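A minimal end-to-end sketch of those steps, reusing the `data_clean` function from the Classification libraries; the file names and the `text`/`label` column names are placeholders, not the class datasets.

```
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline

# 1-2) clean both datasets with the same function (placeholder file/column names)
labeled = pd.read_csv("labeled_data.csv")      # data with labels, used to build the model
unlabeled = pd.read_csv("unlabeled_data.csv")  # data we actually want to predict
labeled['clean'] = labeled['text'].apply(data_clean)
unlabeled['clean'] = unlabeled['text'].apply(data_clean)

# 3) split the modeling data
X_train, X_test, Y_train, Y_test = train_test_split(
    labeled['clean'], labeled['label'], test_size = 0.2, stratify = labeled['label'])

# 4-5) feature extraction + algorithm combined in one pipeline
pipe = make_pipeline(TfidfVectorizer(), LogisticRegression(max_iter = 10000))
pipe.fit(X_train, Y_train)

# 6) check that it generalizes before picking it
print(classification_report(Y_test, pipe.predict(X_test)))

# 7) apply the chosen model to the new data and interpret the counts
unlabeled['answer'] = pipe.predict(unlabeled['clean'])
print(unlabeled['answer'].value_counts())
```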
# Chatbots

## Find Text

- Find a list of movies and themes to pick from.

```
import pandas as pd
# openpyxl - you may need to install this package

df = pd.read_excel("movie.xlsx")

user_input = "Scifi"
df[df['type'] == user_input]['recommend'].iloc[0]
```

## Train the Chatbot

- As a class, let's train a chatbot to recommend a movie based on a theme we pick.
- In the terminal:

```
rasa init
```

### nlu

```
version: "3.1"

nlu:
- intent: greet
  examples: |
    - hey
    - hello
    - hi
    - hello there
    - good morning
    - good evening
    - moin
    - hey there
    - let's go
    - hey dude
    - goodmorning
    - goodevening
    - good afternoon

- intent: goodbye
  examples: |
    - cu
    - good by
    - cee you later
    - good night
    - bye
    - goodbye
    - have a nice day
    - see you around
    - bye bye
    - see you later

- intent: affirm
  examples: |
    - yes
    - y
    - indeed
    - of course
    - that sounds good
    - correct

- intent: deny
  examples: |
    - no
    - n
    - never
    - I don't think so
    - don't like that
    - no way
    - not really

- intent: mood_great
  examples: |
    - perfect
    - great
    - amazing
    - feeling like a king
    - wonderful
    - I am feeling very good
    - I am great
    - I am amazing
    - I am going to save the world
    - super stoked
    - extremely good
    - so so perfect
    - so good
    - so perfect

- intent: mood_unhappy
  examples: |
    - my day was horrible
    - I am sad
    - I don't feel very well
    - I am disappointed
    - super sad
    - I'm so sad
    - sad
    - very sad
    - unhappy
    - not good
    - not very good
    - extremly sad
    - so saad
    - so sad

- intent: bot_challenge
  examples: |
    - are you a bot?
    - are you a human?
    - am I talking to a bot?
    - am I talking to a human?

- intent: movie
  examples: |
    - what should I watch?
    - what is a good movie?
    - I am bored.
    - What to watch tonight?
    - Want to watch a movie?

- intent: get_recommend
  examples: |
    - [Comedy](what_type)
    - [Western](what_type)
    - [Scifi](what_type)
    - [Romance](what_type)

- intent: thanks
  examples: |
    - thank you
    - thank you very much
    - thanks
    - great, thanks
```

### domain

```
version: "3.1"

intents:
  - greet
  - goodbye
  - affirm
  - deny
  - mood_great
  - mood_unhappy
  - bot_challenge
  - movie
  - get_recommend
  - thanks

responses:
  utter_greet:
  - text: "Hey! How are you?"

  utter_cheer_up:
  - text: "Here is something to cheer you up:"
    image: "https://i.imgur.com/nGF1K8f.jpg"

  utter_did_that_help:
  - text: "Did that help you?"

  utter_happy:
  - text: "Great, carry on!"

  utter_goodbye:
  - text: "Bye"

  utter_iamabot:
  - text: "I am a bot, powered by Rasa."

  utter_question_movie:
  - text: "What type of movie do you want to watch?"

  utter_options:
  - text: "You can pick Comedy, Romance, Scifi, or Western."

  utter_holdon:
  - text: "Please wait while I look up a recommendation."

  utter_recommend:
  - text: "Here's your movie:"

  utter_welcome:
  - text: "You're welcome!"

entities:
  - what_type

slots:
  what_type:
    type: text
    mappings:
    - type: from_entity
      entity: what_type

actions:
  - action_get_movie

session_config:
  session_expiration_time: 60
  carry_over_slots_to_new_session: true
```

### stories

```
version: "3.1"

stories:

- story: happy path
  steps:
  - intent: greet
  - action: utter_greet
  - intent: mood_great
  - action: utter_happy

- story: sad path 1
  steps:
  - intent: greet
  - action: utter_greet
  - intent: mood_unhappy
  - action: utter_cheer_up
  - action: utter_did_that_help
  - intent: affirm
  - action: utter_happy

- story: sad path 2
  steps:
  - intent: greet
  - action: utter_greet
  - intent: mood_unhappy
  - action: utter_cheer_up
  - action: utter_did_that_help
  - intent: deny
  - action: utter_goodbye

- story: movie recommend
  steps:
  - intent: movie
  - action: utter_question_movie
  - action: utter_options
  - intent: get_recommend
  - action: utter_holdon
  - action: action_get_movie
  - action: utter_recommend
  - intent: thanks
  - action: utter_welcome
```

### actions.py

```
# This file contains your custom actions which can be used to run
# custom Python code.
#
# See this guide on how to implement these actions:
# https://rasa.com/docs/rasa/custom-actions

# This is a simple example for a custom action which utters "Hello World!"

from typing import Any, Text, Dict, List

from rasa_sdk import Action, Tracker
from rasa_sdk.executor import CollectingDispatcher


class ActionGetMovie(Action):

    def name(self) -> Text:
        return "action_get_movie"

    def run(self, dispatcher: CollectingDispatcher,
            tracker: Tracker,
            domain: Dict[Text, Any]) -> List[Dict[Text, Any]]:

        # see if they entered something correct
        what_type = tracker.get_slot('what_type') # what they wrote
        types = ["Comedy", "Scifi", "Romance", "Western"] # options

        import pandas as pd # our background stuff
        df = pd.read_excel("movie.xlsx")

        # see if it lines up
        if (what_type in types):
            output = df[df['type'] == what_type]['recommend'].iloc[0]
        else:
            output = "You did not enter a movie type I know about."

        dispatcher.utter_message(text=output)

        return []
```

### endpoints

```
# This file contains the different endpoints your bot can use.

# Server where the models are pulled from.
# https://rasa.com/docs/rasa/model-storage#fetching-models-from-a-server

#models:
#  url: http://my-server.com/models/default_core@latest
#  wait_time_between_pulls: 10 # [optional](default: 100)

# Server which runs your custom actions.
# https://rasa.com/docs/rasa/custom-actions

action_endpoint:
  url: "http://localhost:5055/webhook"

# Tracker store which is used to store the conversations.
# By default the conversations are stored in memory.
# https://rasa.com/docs/rasa/tracker-stores

#tracker_store:
#    type: redis
#    url: <host of the redis instance, e.g. localhost>
#    port: <port of your redis instance, usually 6379>
#    db: <number of your database within redis, e.g. 0>
#    password: <password used for authentication>
#    use_ssl: <whether or not the communication is encrypted, default false>

#tracker_store:
#    type: mongod
#    url: <url to your mongo instance, e.g. mongodb://localhost:27017>
#    db: <name of the db within your mongo instance, e.g. rasa>
#    username: <username used for authentication>
#    password: <password used for authentication>

# Event broker which all conversation events should be streamed to.
# https://rasa.com/docs/rasa/event-brokers

#event_broker:
#  url: localhost
#  username: username
#  password: password
#  queue: queue
```

## Test the Chatbot

- Test the chatbot responses.

```
rasa train && rasa shell
```

```
rasa run actions
```

Turn in: take a screenshot of your terminal talking to the bot, asking for a movie recommendation.