# ANLY 520 Late Spring
# Preprocessing Text
## Libraries
```
import urllib.request
import pandas as pd
from bs4 import BeautifulSoup
import nltk
nltk.download('punkt')
import re
RE_SUSPICIOUS = re.compile(r'[&#<>{}\[\]\\]')
def impurity(text, min_len=10):
    """returns the share of suspicious characters in a text"""
    if text is None or len(text) < min_len:
        return 0
    else:
        return len(RE_SUSPICIOUS.findall(text))/len(text)
import textacy.preprocessing as tprep
import spacy
def normalize(text):
text = tprep.normalize.hyphenated_words(text)
text = tprep.normalize.quotation_marks(text)
text = tprep.normalize.unicode(text)
text = tprep.remove.accents(text)
text = tprep.replace.phone_numbers(text)
text = tprep.replace.urls(text)
text = tprep.replace.emails(text)
text = tprep.replace.user_handles(text)
text = tprep.replace.emojis(text)
text = text.lower()
return text
from spellchecker import SpellChecker
spell = SpellChecker()
import textacy
import spacy
# only cloud stuff
#%%
import subprocess
#%%
print(subprocess.getoutput("python -m spacy download en_core_web_sm"))
# back to everyone
nlp = spacy.load("en_core_web_sm")
from itertools import chain
from collections import Counter
```
- Regex notes (a quick example follows below):
    - Anything inside `[]` is a character class: the regex matches any one of the characters listed there
    - `[a-z]` would match any lowercase alphabetic character
    - `[&#<>{}\[\]\\]` matches any one of the following characters:
        - `&` `#` `<` `>` `{` `}`
        - `[`, `]`, and `\`, but these are special characters inside a character class, so you have to escape them with `\`
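For example, a quick check of what this pattern flags; the sample string here is made up:
```
# demo of the suspicious-character pattern on a made-up string
sample = "Comets &amp; eclipses <b>are</b> {cool}"
print(RE_SUSPICIOUS.findall(sample))  # ['&', '<', '>', '<', '>', '{', '}']
print(impurity(sample))               # 7 suspicious characters out of 39, about 0.18
```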
## Find Text
- As a class, we will find a text source to analyze. This text source usually will consist of a webpage or other dataset to examine and clean.
- Import the text into your report.
```
url = "https://www.npr.org/2024/03/17/1239056873/devil-comet-solar-eclipse-12p-pons-brooks?utm_source=pocket-newtab-en-us"
html = urllib.request.urlopen(url).read() # read the raw bytes so they can be sliced and parsed
type(html)
html[0:100]
```
```
# clean out the html get only the text
soupified = BeautifulSoup(html, "html.parser") # start simple
type(soupified)
data = soupified.get_text().strip()
type(data)
print(data[0:100])
```
- If the text is one big long string, first break into sentence segments and store it in a Pandas DataFrame.
```
# make this a data frame
DF = pd.DataFrame(
nltk.sent_tokenize(data),
columns = ["text"]
)
DF.head()
```
## Fix Errors
- Examine the text for errors or problems by looking at the text.
Not a whole lot wrong here.
- Use the “impurity” function from class to examine the text for potential issues.
```
DF['impurity'] = DF['text'].apply(impurity)
DF
```
- Remove the noise with the regex function.
Not necessary for this text; a sketch of what it could look like is below.
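Had the impurity scores been high, a minimal sketch of a regex cleaner (reusing `RE_SUSPICIOUS` from the libraries block; the `clean` function name and the commented-out column assignment are hypothetical) might look like:
```
# sketch: drop the suspicious characters, then collapse the extra whitespace
def clean(text):
    text = RE_SUSPICIOUS.sub(" ", text)
    return re.sub(r"\s+", " ", text).strip()

# DF['text'] = DF['text'].apply(clean)   # hypothetical: overwrite or add a new column
```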
- Re-examine the impurity to determine if the data has been mostly cleaned.
Not necessary.
- Normalize the rest of the text by using textacy.
```
DF['normalize'] = DF['text'].apply(normalize)
```
- Examine spelling errors in at least one row of the dataset.
```
# find those words that may be misspelled
misspelled = spell.unknown(nltk.word_tokenize(DF['normalize'].iloc[1])) # or DF['normalize'][0]
for word in misspelled:
# what is the word
print(word)
# Get the one `most likely` answer
print(spell.correction(word))
# Get a list of `likely` options
print(spell.candidates(word))
print("----\n")
```
- How would I really use this?
- Find all the tokens in the text by running `nltk.word_tokenize` on the whole text
- Create a dictionary of commonly misspelled words and their fixes
- Use the `re` package to find those common misspellings and replace them with their fixes (see the sketch below)
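A minimal sketch of that idea; the `fixes` dictionary below is made up and would come from inspecting the `spell.unknown()` / `spell.correction()` output on your own text:
```
# hypothetical dictionary of common misspellings found in the text
fixes = {"teh": "the", "recieve": "receive"}

def fix_spelling(text):
    # replace each known misspelling as a whole word, ignoring case
    for wrong, right in fixes.items():
        text = re.sub(r"\b" + re.escape(wrong) + r"\b", right, text, flags=re.IGNORECASE)
    return text

# DF['normalize'] = DF['normalize'].apply(fix_spelling)
```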
## Pre-Processing
- Using spacy and textacy, pre-process the text to end up with a list of tokenized lists.
- List of tokenized lists is useful over a pandas dataframe because we don't know how long each sentence will be
- Pandas requires things to be rectangular so that won't work
```
# need a spot to store stuff
output = []
# only the tagger and lemmatizer
for doc in nlp.pipe(DF['normalize'].to_list(), disable=["tok2vec", "ner", "parser"]):
tokens = textacy.extract.words(doc,
filter_stops = True, # default True, no stopwords
filter_punct = True, # default True, no punctuation
filter_nums = True, # default False, no numbers
include_pos = None, # default None = include all
exclude_pos = None, # default None = exclude none
min_freq = 1) # minimum frequency of words
output.append([str(pizza) for pizza in tokens]) # close output append
```
```
print(type(output)) # whole thing is a list
print(type(output[0])) # each item is a list
print(type(output[0][1])) # each item within the list is a str
# spacy tokens are ANNOYING
```
- Create a frequency table of each of the tokens returned in this output. Below is some example code to get us started.
```
count_dictionary = Counter(chain.from_iterable(output))
# a Counter is a dictionary of {'key': count} pairs
count_dictionary.most_common(10)
# most_common returns a list [] of (token, count) tuples ()
# index it with brackets: most_common(10)[0][1] is the top count; [0](1) is not possible
```
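If a table view is easier to read than the Counter itself, one option (the `freq_DF` name is new) is to convert the counts into a DataFrame:
```
# same counts as a sorted two-column frequency table
freq_DF = pd.DataFrame(count_dictionary.most_common(), columns=["token", "freq"])
freq_DF.head(10)
```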
## Summary (on your own)
Write a paragraph explaining the process of cleaning data for an NLP pipeline. You should explain the errors you found in the dataset and how you fixed them. Explain the information that is gathered by using spacy and textacy and the final output. What did you learn from your frequency table? What is the text document about?
# Proposal of Text
```
# show how many tokens something is?
import nltk
# take the big long string of the document
# before it's converted into pandas
type(data)
len(nltk.word_tokenize(data))
```
# Information Extraction
## Libraries
```
import pysrt
import pandas as pd
import spacy
import textacy
# only for datalore
import subprocess
print(subprocess.getoutput("python -m spacy download en_core_web_sm"))
# everyone
nlp = spacy.load("en_core_web_sm")
import re
def clean_up(text):
text = re.sub('</?[a-z]>', ' ', text)
text = re.sub('♪', ' ', text)
return text
from snorkel.preprocess import preprocessor
from snorkel.types import DataPoint
from itertools import combinations
from snorkel.labeling import labeling_function
from snorkel.labeling import PandasLFApplier
from itertools import combinations
import math
from tqdm import tqdm
import networkx as nx
from matplotlib import pyplot as plt
```
## Import and Clean
```
# import and clean
subs = pysrt.open("barbie.srt")
DF = pd.DataFrame([
{
"Text": sub.text,
"Start": sub.start.seconds,
"End": sub.end.seconds
} for sub in subs])
DF.head()
```
```
# cleaning
DF['clean'] = DF['Text'].apply(clean_up)
DF.head()
```
## Part of Speech Tagging
- Tag your data with spacy’s part of speech tagger.
- Convert this data into a Pandas DataFrame.
```
# part of speech tagging
whole_text = " ".join(DF['clean'].to_list())
spacy_pos_tagged = [(str(word), word.tag_, word.pos_) for word in nlp(whole_text)]
DF_tags = pd.DataFrame(spacy_pos_tagged, columns = ['token', 'specific_pos', 'universal_pos'])
DF_tags
```
- Use the dataframe to calculate the most common parts of speech.
```
DF_tags['universal_pos'].value_counts()
```
- Use the dataframe to calculate if words are considered more than one part of speech (crosstabs or groupby).
```
# print(type(DF_tags['token'].iloc[4]))
# way to check types
DF_cross_tabs = pd.crosstab(DF_tags['token'], DF_tags['universal_pos'])
DF_cross_tabs['total_tags'] = DF_cross_tabs.astype(bool).sum(axis=1)
DF_cross_tabs.sort_values('total_tags', ascending=False).head(10)
```
- What is the most common part of speech?
- Do you see words that are multiple parts of speech?
BE SURE TO ANSWER THE QUESTIONS
## KPE
- Use textacy to find the key phrases in your text.
- Using textacy utilities, combine like key phrases.
```
# py_install("networkx < 3.0", pip = T)
# use with textacy version 0.12.0 and earlier
# or pip install networkx < 3.0 for other solutions
# textacy 0.13.0 requires python 3.9
# KPE
# build an english language for textacy pipe
en = textacy.load_spacy_lang("en_core_web_sm", disable=("parser",))
# build a processor for textacy using spacy and process text
doc = textacy.make_spacy_doc(whole_text, lang = en)
# text rank algorithm
[kps for kps, weights in textacy.extract.keyterms.textrank(doc, normalize = "lemma", topn = 5)]
```
```
terms = set([term for term, weight in textacy.extract.keyterms.textrank(doc)])
print(textacy.extract.utils.aggregate_term_variants(terms))
```
- Do the outputs make sense given your text?
ANSWER THIS QUESTION
## NER
- Use spacy to extract named entities.
- Create a summary of your named entities.
```
# ner using spacy
spacy_ner_tagged = [(str(ent), ent.label_) for ent in nlp(whole_text).ents]
DF_ner = pd.DataFrame(spacy_ner_tagged, columns = ['token', 'entity'])
DF_ner.head()
DF_ner['entity'].value_counts()
```
This is where you would improve spacy's model using the example code from the lecture ("Process with spacy NER" is where this starts in the notes; see also the spaCy + Snorkel example later in this section).
You would add the new entities, merging, co-reference resolution, etc. here before using the model in the snorkel pipeline.
This step improves the extraction of named entities, does some named entity disambiguation, and creates better co-occurrence pairs for snorkel to use.
- Apply Snorkel to your data to show any relationship between names.
- What might you do to improve the default NER extraction?
1) figure out if the text pieces even have pairs of entities to label together
- getting pairs of entities
- getting the original text, the tokenized text, the entity names, the start/stop token for each entity
```
# snorkel
# create an empty spot to save the data
stored_entities = []
# first get the entities, must be two for relationship matches
# create a function to grab the entities so we can use apply
def get_entities(x):
"""
Grabs the names using spacy's entity labeler
"""
# get all the entities in this row
processed = nlp(x)
# get the tokens for each sentence
tokens = [word.text for word in processed]
# get all the entities
temp = [(str(ent), ent.label_) for ent in processed.ents if ent.label_ != ""]
# only move on if this row has at least two
if len(temp) > 1:
# finds all the combinations of pairs
temp2 = list(combinations(temp, 2))
# for each pair combination
for (entity1, entity2) in temp2:
# find the words for entity 1
entity1_words = [word.text for word in nlp(entity1[0])]
# find the token numbers for entity 1
entity1_ids = [i for i, val in enumerate(tokens) if val in entity1_words]
if len(entity1_words) > 1:
entity1_ids2 = tuple(idx for idx in entity1_ids[0:2])
else:
id_1 = [idx for idx in entity1_ids]
entity1_ids2 = (id_1[0], id_1[0])
# do the same thing with person 2
entity2_words = [word.text for word in nlp(entity2[0])]
entity2_ids = [i for i, val in enumerate(tokens) if val in entity2_words[0:2]]
if len(entity2_words) > 1:
entity2_ids2 = tuple(idx for idx in entity2_ids[0:2])
else:
id_2 = [idx for idx in entity2_ids]
entity2_ids2 = (id_2[0], id_2[0])
# store all this in a list
stored_entities.append(
[x, # original text
tokens, # tokens
entity1[0], # entity 1 name
entity2[0], # entity 2 name
entity1_ids2, # entity 1 id token tuple
entity2_ids2 # entity 2 id token tuple
])
# get_entities("Barbie and Ken went to the store, and Malibu bought a car.")
# list []
# of tuples () with two items 0, 1
# each of those tuples is a tuple of two items (), 0, 1
# where 0 is the word/token, 1 is the entity
```
2) pick a level you want to do this at - you need either sentences or paragraphs
3) apply the function to get the pairwise combinations of entities
```
# create an empty spot to save the data
stored_entities = []
DF['clean'].apply(get_entities)
# create dataframe in snorkel structure
DF_dev = pd.DataFrame(stored_entities,
columns = ["sentence", "tokens", "entity1",
"entity2", "entity1_word_idx", "entity2_word_idx"])
```
```
DF_dev
```
4) what do we want to use to label relationships?
```
# get words between the data points
@preprocessor()
def get_text_between(cand: DataPoint) -> DataPoint:
"""
Returns the text between the two person mentions in the sentence
"""
start = cand.entity1_word_idx[1] + 1
end = cand.entity2_word_idx[0]
cand.between_tokens = cand.tokens[start:end]
return cand
# get words next to the data points
@preprocessor()
def get_left_tokens(cand: DataPoint) -> DataPoint:
"""
Returns tokens in the length 5 window to the left of the person mentions
"""
# TODO: need to pass window as input params
window = 5
end = cand.entity1_word_idx[0]
cand.entity1_left_tokens = cand.tokens[0:end][-1 - window : -1]
end = cand.entity2_word_idx[0]
cand.entity2_left_tokens = cand.tokens[0:end][-1 - window : -1]
return cand
```
5) what are the clues in those markers that I can use to label my relations?
```
# make this part up with words that are clues
friend = {"Hi", "!", "friend", "thanks", "Thanks", "hi",
"and"}
# this part important
is_friend = 1
not_friend = 0
@labeling_function(resources=dict(friend=friend), pre=[get_text_between])
def between_friend(x,friend):
return is_friend if len(friend.intersection(set(x.between_tokens))) > 0 else not_friend
@labeling_function(resources=dict(friend=friend), pre=[get_left_tokens])
def left_friend(x,friend):
if len(friend.intersection(set(x.entity1_left_tokens))) > 0:
return is_friend
elif len(friend.intersection(set(x.entity2_left_tokens))) > 0:
return is_friend
else:
return not_friend
```
6) what do we get back? and how can we improve?
```
# create a list of functions to run
# lfs labeling functions
lfs = [
between_friend,
left_friend
]
# build the applier function
applier = PandasLFApplier(lfs)
# run it on the dataset
L_dev = applier.apply(DF_dev)
L_dev
```
Combine with the original data to see how we might improve our results
```
L_dev = pd.DataFrame(L_dev, columns = ["between_friend", "left_friend"])
L_dev
DF_combo = pd.concat([DF_dev, L_dev], axis = 1) #axis = 1 by column
DF_combo
```
Sum the labeling columns so that pairs flagged by either rule score as friend (1 or 2) and pairs flagged by neither stay not friend (0)
```
DF_combo['friend'] = DF_combo['left_friend'] + DF_combo['between_friend']
DF_combo
```
### Knowledge Graph
Create a co-occurrence dataframe - basically sum up the number of times each pair of entities occurred together.
Also want to exclude where entity1 == entity2.
```
# entities cannot be equal
DF_combo = DF_combo[DF_combo['entity1'] != DF_combo['entity2']]
# only our friend options
DF_friend = DF_combo[DF_combo['friend'] > 0]
# only our not friend options
DF_not = DF_combo[DF_combo['friend'] == 0]
print(DF_friend.shape)
print(DF_not.shape)
```
```
# create the frequency
cooc_df_not = DF_not[['entity1', 'entity2']]\
.groupby(['entity1', 'entity2'])\
.size()\
.reset_index(name='freq')
cooc_df_not
cooc_df_friend = DF_friend[['entity1', 'entity2']]\
.groupby(['entity1', 'entity2'])\
.size()\
.reset_index(name='freq')
cooc_df_friend
```
```
graph = nx.from_pandas_edgelist(
cooc_df_friend[['entity1', 'entity2', 'freq']] \
.rename(columns={'freq': 'weight'}),
source='entity1', target='entity2', edge_attr=True)
pos = nx.kamada_kawai_layout(graph, weight='weight')
_ = plt.figure(figsize=(20, 20))
nx.draw(graph, pos,
node_size=1000,
node_color='skyblue',
alpha=0.8,
with_labels = True,
font_size = 10)
plt.title('Graph Visualization', size=15)
for (node1,node2,data) in graph.edges(data=True):
width = data['weight']
_ = nx.draw_networkx_edges(graph,pos,
edgelist=[(node1, node2)],
width=width,
edge_color='#505050',
alpha=0.5)
plt.show()
```
```
graph = nx.from_pandas_edgelist(
cooc_df_not[['entity1', 'entity2', 'freq']] \
.rename(columns={'freq': 'weight'}),
source='entity1', target='entity2', edge_attr=True)
pos = nx.kamada_kawai_layout(graph, weight='weight')
_ = plt.figure(figsize=(20, 20))
nx.draw(graph, pos,
node_size=1000,
node_color='skyblue',
alpha=0.8,
with_labels = True,
font_size = 10)
plt.title('Graph Visualization', size=15)
for (node1,node2,data) in graph.edges(data=True):
width = data['weight']
_ = nx.draw_networkx_edges(graph,pos,
edgelist=[(node1, node2)],
width=width,
edge_color='#505050',
alpha=0.5)
plt.show()
```
## How to Do spaCy + Snorkel Together: Example
spaCy pipeline additions:
- Based on the chosen text, add entities to a default spacy model. (entity_ruler)
- Add a norm_entity, merge_entity, and init_coref pipelines.
- Update and add the alias lookup if necessary for the data (a sketch appears after the Initial Coreference section).
- Add the name resolver pipeline.
- Create a co-occurrence graph of the entities linked together in your text.
### Entity ruler - adding entities
```
# we want to update the spacy training before we use it in processed = nlp(x) in this function
from spacy.pipeline import EntityRuler
incorrect_tags = ["One", "Trinity", "Morpheus"]
# dictionary with label and pattern is the rules
patterns = [{"label": "PERSON", # whatever spacy calls it
# put in the regex
"pattern": [{"TEXT": "The", "OP": "?"}, # The One but the THE is optional
{"TEXT": {"IN": incorrect_tags}}
]
}
]
print(patterns)
ruler = nlp.add_pipe('entity_ruler', before='ner')
ruler.add_patterns(patterns)
```
### Norm entities - not much to do here
```
from spacy.tokens import Span
from spacy import Language
@Language.component("norm_entities")
def norm_entities(doc):
ents = []
for ent in doc.ents:
if ent[0].pos_ == "DET": # leading article
ent = Span(doc, ent.start+1, ent.end, label=ent.label)
if len(ent) > 0:
if ent[-1].pos_ == "PART": # trailing particle like 's
ent = Span(doc, ent.start, ent.end-1, label=ent.label)
ents.append(ent)
doc.ents = tuple(ents)
return doc
nlp.add_pipe('norm_entities')
```
### Merge entities - do not need to do much here
```
from spacy.pipeline import merge_entities
if nlp.has_pipe('merge_entities'): ###
_ = nlp.remove_pipe('merge_entities') ###
nlp.add_pipe('merge_entities')
```
### Initial Coreference
Here we need to change the entity types we are interested in
```
# not in book, but useful if you modify the extension
from spacy.tokens import Token
if Token.has_extension('ref_n'):
_ = Token.remove_extension('ref_n')
if Token.has_extension('ref_t'):
_ = Token.remove_extension('ref_t')
if Token.has_extension('ref_t_'):
_ = Token.remove_extension('ref_t_')
from spacy.tokens import Token
Token.set_extension('ref_n', default='') #ref name
Token.set_extension('ref_t', default='') #ref type
@Language.component("init_coref")
def init_coref(doc):
for e in doc.ents:
if e.label_ in ['PERSON']:
e[0]._.ref_n, e[0]._.ref_t = e.text, e.label_
return doc
nlp.add_pipe("init_coref")
```
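The alias lookup from the checklist above is not shown in these notes. A minimal sketch, assuming a hand-built `alias_lookup` dictionary that maps alternate surface forms to one canonical name (the entries and the `alias_resolver` component name below are invented placeholders):
```
# hypothetical alias table: surface form -> canonical name
alias_lookup = {"The One": "Neo", "Mr. Anderson": "Neo"}

@Language.component("alias_resolver")
def alias_resolver(doc):
    """point aliased entity mentions at their canonical reference name"""
    for ent in doc.ents:
        if ent.text in alias_lookup:
            ent[0]._.ref_n = alias_lookup[ent.text]
            ent[0]._.ref_t = "PERSON"
    return doc

nlp.add_pipe("alias_resolver", after="init_coref")
```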
### Name Resolver
Here we deal with upper case versus lower case and then things in parentheses if necessary
```
def name_match(m1, m2):
m2 = re.sub(r'[()\.]', '', m2).lower()
# ignore parentheses and dots and lower case
# so if m1 and m2 are both the same word after lower casing, return true
m1 = m1.lower()
m2 = r'\b' + m2 + r'\b' # \b marks word boundary
m2 = re.sub(r'\s+', r'\\b.*\\b', m2)
return re.search(m2, m1, flags=re.I) is not None
@Language.component("propagate_ent_type")
def propagate_ent_type(doc):
"""propagate entity type stored in ref_t"""
ents = []
for e in doc.ents:
if e[0]._.ref_n != '': # if e is a coreference
e = Span(doc, e.start, e.end, label=e[0]._.ref_t)
ents.append(e)
doc.ents = tuple(ents)
return doc
@Language.component("name_resolver")
def name_resolver(doc):
"""create name-based reference to e1 as primary mention of e2"""
ents = [e for e in doc.ents if e.label_ in ['PERSON']]
for i, e1 in enumerate(ents):
for e2 in ents[i+1:]:
if name_match(e1[0]._.ref_n, e2[0].text):
e2[0]._.ref_n = e1[0]._.ref_n
e2[0]._.ref_t = e1[0]._.ref_t
return propagate_ent_type(doc)
nlp.add_pipe('name_resolver')
```
```
# first get the entities, must be two for relationship matches
# line 3, create a function to grab the entities so we can use apply later
def get_entities(x):
"""
Grabs the names using spacy's entity labeler
"""
# get all the entities in this row
processed = nlp(x)
# get the tokens for each sentence
tokens = [word.text for word in processed]
# get all the entities
temp = [(str(ent), ent.label_) for ent in processed.ents if ent.label_ != ""]
# only move on if this row has at least two
if len(temp) > 1:
# finds all the combinations of pairs
temp2 = list(combinations(temp, 2))
# for each pair combination
for (person1, person2) in temp2:
# find the names in the person 1
person1_words = [word.text for word in nlp(person1[0])]
# find the token numbers for person 1
person1_ids = [i for i, val in enumerate(tokens) if val in person1_words]
# output in (start, stop) token tuple format
if len(person1_words) > 1:
person1_ids2 = tuple(idx for idx in person1_ids[0:2])
else:
id_1 = [idx for idx in person1_ids]
person1_ids2 = (id_1[0], id_1[0])
# do the same thing with person 2
person2_words = [word.text for word in nlp(person2[0])]
person2_ids = [i for i, val in enumerate(tokens) if val in person2_words[0:2]]
if len(person2_words) > 1:
person2_ids2 = tuple(idx for idx in person2_ids[0:2])
else:
id_2 = [idx for idx in person2_ids]
person2_ids2 = (id_2[0], id_2[0])
# store all this in a list
stored_entities.append(
[x, # original text
tokens, # tokens
person1[0], # person 1 name
person2[0], # person 2 name
person1_ids2, # person 1 id token tuple
person2_ids2 # person 2 id token tuple
])
DF.columns
```
```
# create an empty spot to save the data
stored_entities = []
DF['clean'].apply(get_entities)
# create dataframe in snorkel structure
DF_dev = pd.DataFrame(stored_entities,
columns = ["sentence", "tokens", "entity1",
"entity2", "entity1_word_idx", "entity2_word_idx"])
DF_dev
```
# Text Summarization
## Libraries
```{python}
import pandas as pd
import PyPDF2
import re
import nltk
from sentence_transformers import SentenceTransformer
import faiss
import time
# define a search: return the k sentences closest to the query (uses the global faiss index built later)
def search(query, model, text_list, k=5):
    t = time.time()
    query_vector = model.encode([query])
    top_k = index.search(query_vector, k)
    print('total time: {}'.format(time.time() - t))
    return [text_list[_id] for _id in top_k[1].tolist()[0]]
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
from sumy.summarizers.lsa import LsaSummarizer
LANGUAGE = "english"
stemmer = Stemmer(LANGUAGE)
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim.models import LdaModel
from gensim.corpora import Dictionary
import pyLDAvis
import pyLDAvis.gensim_models #don't skip this
import matplotlib.pyplot as plt
def preprocess(textstring):
stops = set(stopwords.words('english'))
tokens = word_tokenize(textstring)
return [token.lower() for token in tokens if token.isalpha()
and token not in stops]
from rouge_score import rouge_scorer
def print_rouge_score(rouge_score):
for k,v in rouge_score.items():
print (k, 'Precision:', "{:.2f}".format(v.precision), 'Recall:', "{:.2f}".format(v.recall), 'fmeasure:', "{:.2f}".format(v.fmeasure))
```
## Find Text
- Wireland ranch episode 1 podcast
- import the text
- break into sentences
- put into pandas
- see if we need to clean
```
# creating a pdf file object
pdfFileObj = open('wireland_ranch.pdf', 'rb')
# creating a pdf reader object
pdfReader = PyPDF2.PdfReader(pdfFileObj)
# printing number of pages in pdf file
len(pdfReader.pages)
# creating a page object
pageObj = pdfReader.pages
# extracting text from page
# loop here to get it all
text = []
for page in pageObj:
page = re.sub("\\n", " ", page.extract_text())
text.append(page)
print(text[0])
```
```
text = ' '.join(text)
sentences = nltk.sent_tokenize(text)
len(sentences)
DF = pd.DataFrame(sentences, columns = ['sentence'])
DF.head()
```
## Create a Search Engine
- Using each sentence as your “documents”, create a search engine to find specific pieces of text.
```
# only need to run this thing once and once it is
# saved, you can "turn off" the chunk using eval = F in
# Rstudio, or change the code type to markdown to save
# the code for yourself in datalore but not run it
# Load a pre-trained model
model = SentenceTransformer('msmarco-MiniLM-L-12-v3')
wireland_embed = model.encode(DF['sentence'].to_list()) #same as sentences, but helps to have a DF in case you needed to do other cleaning
# Create an index using FAISS
index = faiss.IndexFlatL2(wireland_embed.shape[1])
index.add(wireland_embed)
faiss.write_index(index, 'index_wireland_reviews')
```
Search for several items.
```
# read in the index later when you need to use this again
index = faiss.read_index('index_wireland_reviews')
# you do have to have the model open too
model = SentenceTransformer('msmarco-MiniLM-L-12-v3')
search("overseer", model, DF['sentence'].to_list(), 5)
search("crime", model, DF['sentence'].to_list(), 5)
search("delivery", model, DF['sentence'].to_list(), 5)
```
Examine the results and comment on how well you think the search engine worked.
ANSWER THIS QUESTION
## Create Text Summaries
Create a human summary of the text.
```
human_summary = "A crumbling shack in the Mojave desert houses the heartbeat at the center of the universe. Long ago, the heart was diminished to a parasite when new spoiled gods built of the more distasteful human energies usurped the throne and began their own type of reign. They battled and bickered and those arguments translated to our world in the form of tragedies, mass rituals, and monied black magic. Now, it seems that history is coming to a head and our spoiled gods are fighting harder than they have ever fought before. Dead in the center of all of this are two humans. One an unwitting delivery driver turned host for the parasitic heart. The other a disgraced drug addicted cop who went searching for the driver on behalf of his family. Everything else, we will discover together."
chatgpt_summary = "Episode 1: The Return of the Overseer follows the story of a delivery driver who receives an unusual order that takes him to Reynold’s Limited Curiosities, a mysterious shop. The driver encounters a strange, sentient desk in the shop, which is inhabited by a sphinx-like creature. Despite warnings from an enigmatic voice, the driver approaches the desk and is attacked by the creature, only to be saved by multicolored worms that emerge from the light fixtures. The creature is destroyed, and a glowing figure appears before the driver, guiding him to a room filled with jars containing strange contents. The figure disappears, leaving the driver bewildered. He quickly grabs his delivery and flees the shop, experiencing strange phenomena and feeling disconnected from reality."
```
- Create text summaries using LSA, TextRank, and Topic Modeling.
### Text Rank
```
num_summary_sentences = 5
# be sure to put in one big long string
# this will parse things into sentences for summarization
parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
# builds a summarizer with a stemmer (which grabs english from above)
summarizer = TextRankSummarizer(stemmer)
# add the stops for the language we set (english)
summarizer.stop_words = get_stop_words(LANGUAGE)
tr_sum = []
for sentence in summarizer(parser.document, num_summary_sentences):
tr_sum.append(str(sentence))
tr_sum = " ".join(tr_sum)
tr_sum
```
### LSA
```
summarizer = LsaSummarizer(stemmer)
summarizer.stop_words = get_stop_words(LANGUAGE)
lsa_sum = []
for sentence in summarizer(parser.document, num_summary_sentences):
lsa_sum.append(str(sentence))
lsa_sum = " ".join(lsa_sum)
lsa_sum
```
### Topic Modeling
```
# remember all the stuff from earlier that was loaded
# Create a dictionary representation of the documents.
# use our list of sentences from earlier
processed_sentences = [preprocess(sent) for sent in sentences]
# create the vocabulary list
dictionary = Dictionary(processed_sentences)
# convert to a term by document matrix
corpus = [dictionary.doc2bow(sent) for sent in processed_sentences]
# Train the topic model
LDAmodel = LdaModel(corpus = corpus,
id2word = dictionary,
iterations = 400,
num_topics = 10,
random_state = 100,
update_every = 1,
chunksize = 100,
passes = 10,
alpha = 'auto',
per_word_topics = True)
probs = [LDAmodel.get_document_topics(sentence) for sentence in corpus]
save_probs = []
i = 0 # looping variable
for document in probs:
for (topic, prob) in document:
if topic == 0: # this is the topic zero but you can pick another one
save_probs.append((sentences[i], prob))
i = i + 1
DF = pd.DataFrame(save_probs, columns = ["sentence", "prob"])
topic_sum = " ".join(DF.sort_values(by = ["prob"], ascending = False)[0:num_summary_sentences].sentence)
topic_sum
```
- Assess those summaries using the Rouge-N analyzer.
```
# build a blank model
scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
# add the gold standard and summary you want to compare
# scores = scorer.score(gold_standard, summary)
# print the scores
# print_rouge_score(scores)
# compare to overall podcast paragraph
print_rouge_score(scorer.score(human_summary, tr_sum))
print_rouge_score(scorer.score(human_summary, lsa_sum))
print_rouge_score(scorer.score(human_summary, topic_sum))
# compare to chat gpt
print_rouge_score(scorer.score(chatgpt_summary, tr_sum))
print_rouge_score(scorer.score(chatgpt_summary, lsa_sum))
print_rouge_score(scorer.score(chatgpt_summary, topic_sum))
```
- Which summary was the best when compared to the human summary?
ANSWER THIS QUESTION
- Visualization of topic models.
```
vis = pyLDAvis.gensim_models.prepare(LDAmodel, corpus, dictionary, n_jobs = 1)
pyLDAvis.save_html(vis, 'LDA_Visualization.html') ##saves the file
```
# Classification
## Libraries
```
import pysrt
import pandas as pd
import nltk
#nltk.download("stopwords") # only the first time
#nltk.download("punkt")
from nltk.corpus import stopwords
mystopwords = set(stopwords.words("english"))
import spacy
import subprocess
print(subprocess.getoutput("python -m spacy download en_core_web_sm"))
nlp = spacy.load("en_core_web_sm")
import contractions
# function
def data_clean(text):
text = text.lower() # lower case
# punctuation ?
# contractions
text = contractions.fix(text)
# stop words
text = " ".join([word for word in nltk.word_tokenize(text) if word not in mystopwords])
# lemmatization
temp = nlp(text)
text = " ".join([word.lemma_ for word in temp])
return(text)
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
# our flattening function from last time
import numpy as np
def document_vectorizer(corpus, model, num_features):
vocabulary = set(model.wv.index_to_key)
def average_word_vectors(words, model, vocabulary, num_features):
feature_vector = np.zeros((num_features,), dtype="float64")
nwords = 0.
for word in words:
if word in vocabulary:
nwords = nwords + 1.
feature_vector = np.add(feature_vector, model.wv[word])
if nwords:
feature_vector = np.divide(feature_vector, nwords)
return feature_vector
features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
for tokenized_sentence in corpus]
return np.array(features)
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import eli5
from lime.lime_text import LimeTextExplainer
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
```
## Data
```
subs = pysrt.open("airbud.srt")
DF = pd.DataFrame([
{
"Text": sub.text,
"Start": sub.start.seconds,
"End": sub.end.seconds
} for sub in subs])
DF.head()
feel_DF = pd.read_csv("Emotion_classify_Data.csv")
```
## Set up Text
- Do preprocessing on your text to prepare it for the final machine learning model.
```
DF['clean'] = DF['Text'].apply(data_clean)
feel_DF['clean'] = feel_DF['review'].apply(data_clean)
```
- What items do you think will be important in your preprocessing to clean?
ANSWER THIS QUESTION
```
DF.to_csv("airbud_process.csv")
feel_DF.to_csv("imdb_process.csv")
```
```
DF = pd.read_csv("airbud_process.csv")
feel_DF = pd.read_csv("imdb_process.csv")
```
## Split the Modeling Data
```
# you can have multiple arguments on the left side of a =
X_train, X_test, Y_train, Y_test = train_test_split(feel_DF['clean'], # X values
feel_DF['sentiment'], # Y values
test_size = 0.2, # test size
random_state = 89543, # random shuffle
stratify = feel_DF['sentiment'])
print(X_train.head())
print(Y_train.head())
```
## Create Feature Extractions
- Create a “one-hot” encoding using the count vectorizer and binary options.
```
# count vectorizer - one hot
# create a blank extractor
one_hot = CountVectorizer(binary=True)
# fit the data to it
oh_train = one_hot.fit_transform(X_train)
# transform the test data so it matches the fit_transform vocabulary
oh_test = one_hot.transform(X_test)
print(oh_train.shape)
print(oh_test.shape)
```
- Create the bag of words encoding using the count vectorizer.
```
# create a blank extractor
bow = CountVectorizer()
# fit the data to it
bow_train = bow.fit_transform(X_train)
# transform the test data so it matches the fit_transform vocabulary
bow_test = bow.transform(X_test)
print(bow_train.shape)
print(bow_test.shape)
```
- Create the TF-IDF normalization using the tfidf vectorizer.
```
# create a blank extractor
tfidf = TfidfVectorizer()
# fit the data to it
tf_train = tfidf.fit_transform(X_train)
# transform the test data so it matches the fit_transform vocabulary
tf_test = tfidf.transform(X_test)
print(tf_train.shape)
print(tf_test.shape)
```
- Create two word2vec models:
    - Use a large number of dimensions that matches your tfidf.
    - Use both CBOW and skip-gram embeddings.
    - Use a window size of 5.
```
# word2vec expects tokenized sentences (lists of tokens), not raw strings
X_train_tokens = [nltk.word_tokenize(text) for text in X_train]
X_test_tokens = [nltk.word_tokenize(text) for text in X_test]
wv_c = Word2Vec(X_train_tokens,
                vector_size = 500, #dimensions
                window = 5, #window size
                sg = 0, #cbow
                min_count = 1,
                workers = 4)
# generate averaged word vector features from the word2vec model
wv_c_train = document_vectorizer(corpus = X_train_tokens,
                                 model = wv_c,
                                 num_features = 500)
wv_c_test = document_vectorizer(corpus = X_test_tokens,
                                model = wv_c,
                                num_features = 500)
print(wv_c_train.shape)
print(wv_c_test.shape)
```
```
wv_s = Word2Vec(X_train_tokens,
                vector_size = 500, #dimensions
                window = 5, #window size
                sg = 1, #skip-gram
                min_count = 1,
                workers = 4)
# generate averaged word vector features from the word2vec model
wv_s_train = document_vectorizer(corpus = X_train_tokens,
                                 model = wv_s,
                                 num_features = 500)
wv_s_test = document_vectorizer(corpus = X_test_tokens,
                                model = wv_s,
                                num_features = 500)
print(wv_s_train.shape)
print(wv_s_test.shape)
```
## What is the OOV?
```
# original (training) vocabulary
train_vocab = set(one_hot.get_feature_names_out())
# fit a separate vectorizer on the test split so the earlier oh_test is not overwritten
one_hot2 = CountVectorizer(binary=True)
one_hot2.fit(X_test)
test_vocab = set(one_hot2.get_feature_names_out())
# words in the test vocabulary that the training vocabulary doesn't have
len(test_vocab.difference(train_vocab))
#test_vocab.difference(train_vocab)
```
## Logistic Regression
```
# log regression
# build a blank model
logreg = LogisticRegression(max_iter = 10000)
# fit the training data to the model
logreg.fit(oh_train, Y_train)
# predict test cases
y_pred = logreg.predict(oh_test)
# compare predicts to the actuals
print(classification_report(y_true = Y_test, y_pred = y_pred))
```
```{python}
# log regression
# build a blank model
logreg = LogisticRegression(max_iter = 10000)
# fit the training data to the model
logreg.fit(bow_train, Y_train)
# predict test cases
y_pred = logreg.predict(bow_test)
# compare predicts to the actuals
print(classification_report(y_true = Y_test, y_pred = y_pred))
```
```{python}
# log regression
# build a blank model
logreg = LogisticRegression(max_iter = 10000)
# fit the training data to the model
logreg.fit(tf_train, Y_train)
# predict test cases
y_pred = logreg.predict(tf_test)
# compare predicts to the actuals
print(classification_report(y_true = Y_test, y_pred = y_pred))
```
```
# log regression
# build a blank model
logreg = LogisticRegression(max_iter = 10000)
# fit the training data to the model
logreg.fit(wv_c_train, Y_train)
# predict test cases
y_pred = logreg.predict(wv_c_test)
# compare predicts to the actuals
print(classification_report(y_true = Y_test, y_pred = y_pred))
```
```
# log regression
# build a blank model
logreg = LogisticRegression(max_iter = 10000)
# fit the training data to the model
logreg.fit(wv_s_train, Y_train)
# predict test cases
y_pred = logreg.predict(wv_s_test)
# compare predicts to the actuals
print(classification_report(y_true = Y_test, y_pred = y_pred))
```
## Naive Bayes
```{python}
# bayes
# build a blank model
nb = MultinomialNB()
# fit the training data to the model
nb.fit(oh_train, Y_train)
# predict test cases
y_pred = nb.predict(oh_test)
# compare predicts to the actuals
print(classification_report(y_true = Y_test, y_pred = y_pred))
```
```{python}
# bayes
# build a blank model
nb = MultinomialNB()
# fit the training data to the model
nb.fit(bow_train, Y_train)
# predict test cases
y_pred = nb.predict(bow_test)
# compare predicts to the actuals
print(classification_report(y_true = Y_test, y_pred = y_pred))
```
```{python}
# bayes
# build a blank model
nb = MultinomialNB()
nb.fit(tf_train, Y_train)
# predict test cases
y_pred = nb.predict(tf_test)
# compare predicts to the actuals
print(classification_report(y_true = Y_test, y_pred = y_pred))
```
```{python}
# MultinomialNB does not allow negative feature values; check the minimums, then shift everything up by 1
wv_c_train.min()
wv_c_test.min()
wv_s_train.min()
wv_s_test.min()
wv_c_train = wv_c_train + 1
wv_c_test = wv_c_test + 1
wv_s_train = wv_s_train + 1
wv_s_test = wv_s_test + 1
```
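Adding 1 only works if no feature value is below -1 (which is what the min() checks above are meant to confirm). A more general sketch rescales the word2vec features into a non-negative range instead; the `scaler` and `*_scaled` names here are new:
```{python}
# alternative sketch: rescale the averaged word2vec features into [0, 1] for MultinomialNB
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
wv_c_train_scaled = scaler.fit_transform(wv_c_train)
# reuse the training minimums/maximums; clip in case test values fall outside the training range
wv_c_test_scaled = scaler.transform(wv_c_test).clip(min=0)
```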
```{python}
# bayes
# build a blank model
nb = MultinomialNB()
# fit the training data to the model
nb.fit(wv_c_train, Y_train)
# predict test cases
y_pred = nb.predict(wv_c_test)
# compare predicts to the actuals
print(classification_report(y_true = Y_test, y_pred = y_pred))
```
```{python}
# bayes
# build a blank model
nb = MultinomialNB()
# fit the training data to the model
nb.fit(wv_s_train, Y_train)
# predict test cases
y_pred = nb.predict(wv_s_test)
# compare predicts to the actuals
print(classification_report(y_true = Y_test, y_pred = y_pred))
```
## Lime
### The final model
```{python}
# build a blank model
logreg = LogisticRegression(max_iter = 10000)
# fit the training data to the model
logreg.fit(tf_train, Y_train)
# predict test cases
y_pred = logreg.predict(tf_test)
# compare predicts to the actuals
print(classification_report(y_true = Y_test, y_pred = y_pred))
```
### Apply to new instances
```{python}
# PUT IN THE PIPELINE feature extractor, final algorithm model
pipeline = make_pipeline(tfidf, logreg)
# build a blank model
explainer = LimeTextExplainer(class_names = Y_train.sort_values().unique())
id_value = 3
# first argument is text
exp = explainer.explain_instance(feel_DF.iloc[id_value]['clean'],
# predict_proba only works on certain models
pipeline.predict_proba, num_features=10)
exp.as_pyplot_figure()
plt.show()
exp.save_to_file('example.html')
```
## eli5
```{python}
eli5.show_weights(estimator = logreg, top = 10, feature_names = tfidf.get_feature_names_out())
```
## Apply to airbud
```{python}
# use the pipeline to predict your new data (that's been cleaned)
DF['answer'] = pipeline.predict(DF['clean'].to_list())
# review the output
DF['answer'].value_counts()
```
## Notes
- Often have data we want to predict but no labels
- So we found a dataset with labels to build our model
- and then we will apply the verified/tested model to the data that we actually want to predict
## Pipeline
1) Text Cleaning - you should have a function that does the cleaning, so it's always the same for each dataset (both the training/modeling data and the to-be-predicted data)
2) Apply that text cleaning to both the modeling data (imdb) and the to be predicted data (airbud) separately
a) If you have a big dataset, consider processing and saving the processed output, so you save time
b) Then you would only reload the processed data, not having to do step 1 and 2 again.
3) Split the modeling data into testing and training --> build a model and then testing it on new data to make sure it generalizes
a) Most of the data goes into training - you want to get all the vocab words into the model (if a vocabulary word appears only in the testing set, it is not included in the model)
4) Build our Feature Extractions
a) We can't use our predictor variable at the moment --> it's text ... it needs to be numbers, so let's convert it to numbers
b) build a blank extractor, fit_transform to build the vocabulary and the train data, and then transform to convert the test data to the same shape (vocabulary size)
5) Build the model that combines algorithm with feature extraction
a) Build our blank model
b) Fit the training data to the model
c) Apply that model to the testing data
d) Examine the results (classification report)
6) Pick a model that works best
a) want to look at accuracy overall
b) look at recall and precision - or the F1 score for each group
c) models that do NOT predict a specific category (i.e., you get the zero division error or f1 = 0) are not good, don't use them
7) apply that model to our new data
a) Use the pipeline feature to put together the extraction and algorithm
b) look at the final value counts for your prediction and interpret (a compact sketch of steps 4-7 follows below)
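As a compact sketch of steps 4-7 using the objects defined earlier in this section (the `model` and `airbud_pred` names are new; the tfidf + logistic regression combination is the one chosen above):
```{python}
# steps 4-5: feature extraction + algorithm wrapped in one pipeline, fit on the training data
model = make_pipeline(TfidfVectorizer(), LogisticRegression(max_iter = 10000))
model.fit(X_train, Y_train)
# steps 5d-6: evaluate on the held-out test split
print(classification_report(y_true = Y_test, y_pred = model.predict(X_test)))
# step 7: apply the chosen model to the cleaned, unlabeled data and inspect the predictions
airbud_pred = model.predict(DF['clean'].to_list())
print(pd.Series(airbud_pred).value_counts())
```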
# Chatbots
## Find Text
- Find a list of movies and themes to pick from.
```
import pandas as pd
# openpyxl - you may need to install this package
df = pd.read_excel("movie.xlsx")
user_input = "Scifi"
df[df['type'] == user_input]['recommend'].iloc[0]
```
## Train the Chatbot
- As a class, let’s train a chatbot to recommend a movie based on the theme we pick.
- in the terminal
```
rasa init
```
### nlu
```
version: "3.1"
nlu:
- intent: greet
examples: |
- hey
- hello
- hi
- hello there
- good morning
- good evening
- moin
- hey there
- let's go
- hey dude
- goodmorning
- goodevening
- good afternoon
- intent: goodbye
examples: |
- cu
- good by
- cee you later
- good night
- bye
- goodbye
- have a nice day
- see you around
- bye bye
- see you later
- intent: affirm
examples: |
- yes
- y
- indeed
- of course
- that sounds good
- correct
- intent: deny
examples: |
- no
- n
- never
- I don't think so
- don't like that
- no way
- not really
- intent: mood_great
examples: |
- perfect
- great
- amazing
- feeling like a king
- wonderful
- I am feeling very good
- I am great
- I am amazing
- I am going to save the world
- super stoked
- extremely good
- so so perfect
- so good
- so perfect
- intent: mood_unhappy
examples: |
- my day was horrible
- I am sad
- I don't feel very well
- I am disappointed
- super sad
- I'm so sad
- sad
- very sad
- unhappy
- not good
- not very good
- extremly sad
- so saad
- so sad
- intent: bot_challenge
examples: |
- are you a bot?
- are you a human?
- am I talking to a bot?
- am I talking to a human?
- intent: movie
examples: |
- what should I watch?
- what is a good movie?
- I am bored.
- What to watch tonight?
- Want to watch a movie?
- intent: get_recommend
examples: |
- [Comedy](what_type)
- [Western](what_type)
- [Scifi](what_type)
- [Romance](what_type)
- intent: thanks
examples: |
- thank you
- thank you very much
- thanks
- great, thanks
```
### domain
```
version: "3.1"
intents:
- greet
- goodbye
- affirm
- deny
- mood_great
- mood_unhappy
- bot_challenge
- movie
- get_recommend
- thanks
responses:
utter_greet:
- text: "Hey! How are you?"
utter_cheer_up:
- text: "Here is something to cheer you up:"
image: "https://i.imgur.com/nGF1K8f.jpg"
utter_did_that_help:
- text: "Did that help you?"
utter_happy:
- text: "Great, carry on!"
utter_goodbye:
- text: "Bye"
utter_iamabot:
- text: "I am a bot, powered by Rasa."
utter_question_movie:
- text: "What type of movie do you want to watch?"
utter_options:
- text: "You can pick Comedy, Romance, Scifi, or Western."
utter_holdon:
- text: "Please wait while I look up a recommendation."
utter_recommend:
- text: "Here's your movie:"
utter_welcome:
- text: "You're welcome!"
entities:
- what_type
slots:
what_type:
type: text
mappings:
- type: from_entity
entity: what_type
actions:
- action_get_movie
session_config:
session_expiration_time: 60
carry_over_slots_to_new_session: true
```
### stories
```
version: "3.1"
stories:
- story: happy path
steps:
- intent: greet
- action: utter_greet
- intent: mood_great
- action: utter_happy
- story: sad path 1
steps:
- intent: greet
- action: utter_greet
- intent: mood_unhappy
- action: utter_cheer_up
- action: utter_did_that_help
- intent: affirm
- action: utter_happy
- story: sad path 2
steps:
- intent: greet
- action: utter_greet
- intent: mood_unhappy
- action: utter_cheer_up
- action: utter_did_that_help
- intent: deny
- action: utter_goodbye
- story: movie recommend
steps:
- intent: movie
- action: utter_question_movie
- action: utter_options
- intent: get_recommend
- action: utter_holdon
- action: action_get_movie
- action: utter_recommend
- intent: thanks
- action: utter_welcome
```
### actions.py
```
# This file contains your custom actions which can be used to run
# custom Python code.
#
# See this guide on how to implement these actions:
# https://rasa.com/docs/rasa/custom-actions
# Below is a custom action that looks up a movie recommendation.
from typing import Any, Text, Dict, List
from rasa_sdk import Action, Tracker
from rasa_sdk.executor import CollectingDispatcher
class ActionGetMovie(Action):
def name(self) -> Text:
return "action_get_movie"
def run(self, dispatcher: CollectingDispatcher,
tracker: Tracker,
domain: Dict[Text, Any]) -> List[Dict[Text, Any]]:
# see if they entered something correct
what_type = tracker.get_slot('what_type') # what they wrote
types = ["Comedy", "Scifi", "Romance", "Western"] #options
import pandas as pd #our background stuff
df = pd.read_excel("movie.xlsx")
# see if it lines up
if (what_type in types):
output = df[df['type'] == what_type]['recommend'].iloc[0]
else:
output = "You did not enter a movie type I know about."
dispatcher.utter_message(text=output)
return []
```
### endpoints
```
# This file contains the different endpoints your bot can use.
# Server where the models are pulled from.
# https://rasa.com/docs/rasa/model-storage#fetching-models-from-a-server
#models:
# url: http://my-server.com/models/default_core@latest
# wait_time_between_pulls: 10 # [optional](default: 100)
# Server which runs your custom actions.
# https://rasa.com/docs/rasa/custom-actions
action_endpoint:
url: "http://localhost:5055/webhook"
# Tracker store which is used to store the conversations.
# By default the conversations are stored in memory.
# https://rasa.com/docs/rasa/tracker-stores
#tracker_store:
# type: redis
# url: <host of the redis instance, e.g. localhost>
# port: <port of your redis instance, usually 6379>
# db: <number of your database within redis, e.g. 0>
# password: <password used for authentication>
# use_ssl: <whether or not the communication is encrypted, default false>
#tracker_store:
# type: mongod
# url: <url to your mongo instance, e.g. mongodb://localhost:27017>
# db: <name of the db within your mongo instance, e.g. rasa>
# username: <username used for authentication>
# password: <password used for authentication>
# Event broker which all conversation events should be streamed to.
# https://rasa.com/docs/rasa/event-brokers
#event_broker:
# url: localhost
# username: username
# password: password
# queue: queue
```
## Test the Chatbot
- Test the chatbot responses.
```
rasa train && rasa shell
```
```
rasa run actions
```
Turn in: take a screenshot of your terminal talking to the bot asking for a movie recommendation.