"""Normalize news articles and export one as a plain-text file (CTH notebook).

Pasted together from notebook cells: loads the GVA 2020 dataset, defines a
text-normalization helper, applies it to a news DataFrame, and writes one
normalized article to disk.
"""

import string
from collections import Counter  # kept from the notebook; presumably used in later cells

import matplotlib.pyplot as plt  # kept from the notebook; plotting happens elsewhere
import nltk
import pandas as pd
from nltk.corpus import stopwords


def normalize(text, function_words=None):
    """Lower-case *text*, strip punctuation, and optionally drop function words.

    This merges the two duplicate ``normalize`` definitions from the notebook:
    calling ``normalize(text)`` behaves exactly like the simpler second version,
    while passing ``function_words`` gives the behavior of the first.

    Parameters
    ----------
    text : str
        Raw article text.
    function_words : iterable of str, optional
        Words (e.g. NLTK stopwords) to remove after punctuation stripping.
        ``None`` (the default) removes nothing. Using ``None`` instead of a
        mutable ``[]`` default avoids the shared-default-argument pitfall.

    Returns
    -------
    str
        The normalized text.
    """
    # str.translate deletes every punctuation character in a single pass —
    # equivalent to chaining .replace(char, "") over string.punctuation.
    text = text.lower().translate(str.maketrans("", "", string.punctuation))
    if function_words:
        text = remove_function_words(text, function_words)
    return text


def remove_function_words(text, function_words):
    """Return *text* with every whitespace-separated word in *function_words* removed."""
    kept = [word for word in text.split() if word not in function_words]
    return " ".join(kept)


def main():
    """Run the notebook pipeline: load data, normalize articles, export one."""
    # Downloading the stopword corpus is a network side effect — keep it out
    # of import time so this module can be imported without hitting the net.
    nltk.download("stopwords")

    # Load the GVA dataset, rename the headers, and keep only the columns we analyse.
    file_path = "dataset/GVA2020.csv"
    df_gva = pd.read_csv(file_path, encoding="utf-8")
    df_gva.columns = [
        "id", "date", "state", "place", "address", "killed", "injured", "operations",
    ]
    df_gva = df_gva[["date", "state", "place", "address", "killed", "injured"]]
    df_gva["date"] = pd.to_datetime(df_gva["date"])

    # NOTE(review): df_news is never loaded in this file — it must come from
    # another notebook cell. Load the news dataset here before running this
    # script standalone, otherwise the next line raises NameError.
    df_news["normalized_article"] = df_news["article"].apply(normalize)

    # Write only the second normalized article ([1:2]) to a .txt file,
    # matching the original notebook cell.
    with open("dataset/article1.txt", "w", encoding="utf-8") as outfile:
        for article in df_news["normalized_article"][1:2]:
            outfile.write(article)


if __name__ == "__main__":
    main()