# SDG-ACE Hack
## Team 2
---
## Imports
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import nltk as nltk
nltk.download('stopwords')
import numpy as np
import random
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
from nltk.corpus import stopwords
nltk.download('wordnet')
nltk.download('punkt')
keynames = pd.read_csv("trigramsUNforum18.csv", sep=";", header=None)
## load csv and drop columns
# Read the Web of Science export; header=None gives integer column labels.
df = pd.read_csv("2018_WoS.csv", sep=";", header=None)
# Join every column after the first into one comma-separated text column.
df['ColumnA'] = df[df.columns[1:]].apply(
    lambda x: ','.join(x.dropna().astype(str)),
    axis=1
)
# BUG FIX: drop(..., inplace=True) mutates df and returns None, so the old
# df1/df2/df3 assignments were all None. Keep the same mutation (drop the
# first three columns of df, one at a time) without the misleading bindings.
for _ in range(3):
    df.drop(df.columns[0], axis=1, inplace=True)
# Preserved for backward compatibility — their previous values were None.
df1 = df2 = df3 = None
# Keep only the first row for the downstream text pipeline.
df4 = df[:1]
## tokenize & Lemmatize
warnings.filterwarnings('ignore')
# BUG FIX: 'rawfile.txt' was opened but never read and never closed, leaking
# the file handle. Nothing in this script uses `f`, so close it immediately;
# the name is kept for backward compatibility.
f = open('rawfile.txt', 'r', encoding='utf8', errors='ignore')
f.close()
# Serialise the single-row frame to text and lowercase it for tokenizing.
raw = df4.to_string()
raw = raw.lower()
# Sentence-level tokens used by the TF-IDF retrieval later in the script.
sent_tokens = nltk.sent_tokenize(raw)
lemmatizeSTR = nltk.stem.WordNetLemmatizer()
def LemTokens(tokens):
    """Lemmatize every token using the module-level WordNet lemmatizer."""
    return list(map(lemmatizeSTR.lemmatize, tokens))
# Translation table mapping each punctuation code point to None (deleted).
remove_punct_dict = dict.fromkeys(ord(ch) for ch in string.punctuation)
def LemNormalize(text):
    """Lowercase *text*, strip punctuation, word-tokenize, and lemmatize."""
    lowered = text.lower()
    cleaned = lowered.translate(remove_punct_dict)
    return LemTokens(nltk.word_tokenize(cleaned))
word_tokens = nltk.word_tokenize(raw)
# PERF FIX: stopwords.words() was re-evaluated for every single token and
# then linearly scanned (accidental O(n*m)). Build the stopword set once
# for O(1) membership tests.
# NOTE(review): words() with no language argument returns stopwords for
# every language NLTK ships; if only English was intended, pass 'english'
# (that would change which tokens are filtered) — TODO confirm.
_stop_words = set(stopwords.words())
withoutSW = [word for word in word_tokens if word not in _stop_words]
filtered_sentence = " ".join(withoutSW)
normal = LemNormalize(filtered_sentence)
# NOTE(review): `normal` is already lemmatized by LemNormalize, so this
# second lemmatization pass is usually a no-op; kept for compatibility.
tokey = LemTokens(normal)
## compute tf-idf/ cosine similarity
def response(user_response):
robo_response = ''
sent_tokens.append(user_response)
TfidfVec = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')
tfidf = TfidfVec.fit_transform(normal)
vals = cosine_similarity(tfidf[-1], tfidf)
idx = vals.argsort()[0][-2]
flat = vals.flatten()
flat.sort()
req_tfidf = flat[-2]
if req_tfidf == 0:
robo_response = robo_response + "not found in document"
print(robo_response)
else:
robo_response = robo_response + sent_tokens[idx]
print(robo_response)
# BUG FIX: iterating a DataFrame directly yields its column labels (ints
# here, because the CSV was read with header=None), so `for num in row`
# iterated over an int and raised TypeError. Iterate the actual rows.
# NOTE(review): this assumes each cell parses as a number; the trigram CSV
# may hold strings — TODO confirm against trigramsUNforum18.csv.
for index, (_, row) in enumerate(keynames.iterrows()):
    print(response('Row %d: %f' % (index + 1, sum(float(num) for num in row.dropna()))))
## count most common words
# BUG FIX: the original rebound the name `Counter` to an instance
# (`Counter = Counter(tokey)`), shadowing the imported class for the rest
# of the module. Bind the tally to a distinct name instead.
word_counts = Counter(tokey)
most_com = word_counts.most_common(10)
print(most_com)
### Top tokens — "elsevier" (247 occurrences), "acid" (196 occurrences)
## Group members
## Tasks
---
## Some example HackMD formatting...
*Equations can be written as LaTeX...*
$$\frac{\partial {\bf u}}{\partial t} + \left({\bf u}\cdot\nabla\right){\bf u} - \nu \nabla^2 {\bf u} = -\nabla w + {\bf g}$$
or simply $x^2 + y^2 = z^2$
*Tables are simpler forms:*
| Column 1 | Column 2 | Column 3 |
| -------- | -------- | -------- |
| Text | Text | Text |
You can use <font color=blue> colour </font> if you want <font color=red> emphasis </font>
and
## Add titles
and
page breaks
------
*Drag and drop images:*

* Bullet 1
* Bullet 2
---
---
What are the most important
{"metaMigratedAt":"2023-06-15T10:58:49.756Z","metaMigratedFrom":"Content","title":"SDG-ACE Hack","breaks":true,"contributors":"[{\"id\":null,\"add\":3049,\"del\":547},{\"id\":\"923b462b-e0b9-4d93-b30a-de2525f4e5eb\",\"add\":65,\"del\":26},{\"id\":\"6587d962-f75f-40c6-9b93-bb09287c706a\",\"add\":671,\"del\":4}]"}