## these are all the modules we imported in CTH
import pandas as pd
import matplotlib.pyplot as plt
import string
import nltk
import nltk.corpus
from nltk.corpus import stopwords
nltk.download("stopwords")
from collections import Counter
## Load the dataset, name the columns, keep only the ones we need,
## and parse the date column into real datetimes.
file_path = "dataset/GVA2020.csv"
df_gva = pd.read_csv(file_path, encoding="utf-8")
df_gva = df_gva.set_axis(
    ["id", "date", "state", "place", "address", "killed", "injured", "operations"],
    axis=1,
)[["date", "state", "place", "address", "killed", "injured"]]
df_gva["date"] = pd.to_datetime(df_gva["date"])
# # Normalization (earlier draft kept for reference; HTML comment markers
# # are not valid Python, so this is commented out properly here.
# # NB: it uses `re`, which is not imported in this file.)
# for i in splitted_text:
#     i = i.lower()
#     i = i.translate(str.maketrans('', '', string.punctuation))
#     i = re.sub(r'\d+', '', i)
#     i = re.sub(' +', ' ', i)
#     i = i.replace("\n", " ")
#     i = i.split()
# Leon released a "model assignment" as an example where he shared a normalization function, we could also use that?
def normalize(text, function_words=()):
    """Lowercase *text*, strip punctuation, and optionally drop function words.

    Parameters
    ----------
    text : str
        Raw text to normalize.
    function_words : iterable of str, optional
        Words to remove after lowercasing and punctuation stripping.
        Empty by default (no word filtering). The default is an immutable
        tuple rather than a mutable ``[]`` to avoid the shared-default pitfall.

    Returns
    -------
    str
        The normalized text.
    """
    # One C-level translate pass removes all punctuation — equivalent to,
    # but faster than, chaining .replace() for every punctuation character.
    text = text.lower().translate(str.maketrans("", "", string.punctuation))
    if function_words:
        text = remove_function_words(text, function_words)
    return text
def remove_function_words(text, function_words):
    """Remove every occurrence of the given function words from *text*.

    Parameters
    ----------
    text : str
        Whitespace-separated text (normally already normalized).
    function_words : iterable of str
        Words to drop from the text.

    Returns
    -------
    str
        *text* with all function words removed; remaining tokens are
        re-joined with single spaces (so runs of whitespace collapse,
        exactly as the split/join round-trip implies).
    """
    # Build the lookup set once so each membership test is O(1),
    # instead of scanning a list per token.
    stoplist = set(function_words)
    return " ".join(word for word in text.split() if word not in stoplist)
# This is the normalization function I used:
# NOTE(review): this REDEFINES the `normalize` declared earlier in the file,
# so the function_words-aware version is shadowed from this point on —
# consider keeping only one of the two definitions.
def normalize(text):
    """Lowercase *text* and strip all ASCII punctuation.

    Parameters
    ----------
    text : str
        Raw article text.

    Returns
    -------
    str
        Lowercased text with every ``string.punctuation`` character removed.
    """
    # Single translate pass — same result as looping .replace() over each
    # punctuation character, in one C-level operation.
    return text.lower().translate(str.maketrans("", "", string.punctuation))
## Apply the normalizer to every article in the news dataframe:
df_news["normalized_article"] = df_news["article"].apply(normalize)
## Write the selected normalized article (row slice 1:2) out as a .txt file:
with open("dataset/article1.txt", "w", encoding="utf-8") as outfile:
    outfile.writelines(df_news["normalized_article"][1:2])