# SDG-ACE Hack
## Team 2
---
## Imports
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import nltk as nltk
nltk.download('stopwords')
import numpy as np
import random
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
from nltk.corpus import stopwords
nltk.download('wordnet')
nltk.download('punkt')
keynames = pd.read_csv("trigramsUNforum18.csv", sep=";", header=None)
## load csv and drop columns
# Read the Web of Science export; header=None gives integer column labels.
df = pd.read_csv("2018_WoS.csv", sep=";", header=None)
# Join every column after the first into one comma-separated text column.
df['ColumnA'] = df[df.columns[1:]].apply(
    lambda x: ','.join(x.dropna().astype(str)),
    axis=1
)
# BUG FIX: drop(..., inplace=True) mutates df and returns None, so the old
# df1/df2/df3 assignments were all None. Keep the same mutation (drop the
# first three columns of df, one at a time) without the misleading bindings.
for _ in range(3):
    df.drop(df.columns[0], axis=1, inplace=True)
# Preserved for backward compatibility — their previous values were None.
df1 = df2 = df3 = None
# Keep only the first row for the downstream text pipeline.
df4 = df[:1]
## tokenize & Lemmatize
warnings.filterwarnings('ignore')
# BUG FIX: 'rawfile.txt' was opened but never read and never closed, leaking
# the file handle. Nothing in this script uses `f`, so close it immediately;
# the name is kept for backward compatibility.
f = open('rawfile.txt', 'r', encoding='utf8', errors='ignore')
f.close()
# Serialise the single-row frame to text and lowercase it for tokenizing.
raw = df4.to_string()
raw = raw.lower()
# Sentence-level tokens used by the TF-IDF retrieval later in the script.
sent_tokens = nltk.sent_tokenize(raw)
lemmatizeSTR = nltk.stem.WordNetLemmatizer()
def LemTokens(tokens):
    """Lemmatize every token using the module-level WordNet lemmatizer."""
    return list(map(lemmatizeSTR.lemmatize, tokens))
# Translation table mapping each punctuation code point to None (deleted).
remove_punct_dict = dict.fromkeys(ord(ch) for ch in string.punctuation)
def LemNormalize(text):
    """Lowercase *text*, strip punctuation, word-tokenize, and lemmatize."""
    lowered = text.lower()
    cleaned = lowered.translate(remove_punct_dict)
    return LemTokens(nltk.word_tokenize(cleaned))
word_tokens = nltk.word_tokenize(raw)
# PERF FIX: stopwords.words() was re-evaluated for every single token and
# then linearly scanned (accidental O(n*m)). Build the stopword set once
# for O(1) membership tests.
# NOTE(review): words() with no language argument returns stopwords for
# every language NLTK ships; if only English was intended, pass 'english'
# (that would change which tokens are filtered) — TODO confirm.
_stop_words = set(stopwords.words())
withoutSW = [word for word in word_tokens if word not in _stop_words]
filtered_sentence = " ".join(withoutSW)
normal = LemNormalize(filtered_sentence)
# NOTE(review): `normal` is already lemmatized by LemNormalize, so this
# second lemmatization pass is usually a no-op; kept for compatibility.
tokey = LemTokens(normal)
## compute tf-idf/ cosine similarity
def response(user_response):
robo_response = ''
sent_tokens.append(user_response)
TfidfVec = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')
tfidf = TfidfVec.fit_transform(normal)
vals = cosine_similarity(tfidf[-1], tfidf)
idx = vals.argsort()[0][-2]
flat = vals.flatten()
flat.sort()
req_tfidf = flat[-2]
if req_tfidf == 0:
robo_response = robo_response + "not found in document"
print(robo_response)
else:
robo_response = robo_response + sent_tokens[idx]
print(robo_response)
# BUG FIX: iterating a DataFrame directly yields its column labels (ints
# here, because the CSV was read with header=None), so `for num in row`
# iterated over an int and raised TypeError. Iterate the actual rows.
# NOTE(review): this assumes each cell parses as a number; the trigram CSV
# may hold strings — TODO confirm against trigramsUNforum18.csv.
for index, (_, row) in enumerate(keynames.iterrows()):
    print(response('Row %d: %f' % (index + 1, sum(float(num) for num in row.dropna()))))
## count most common words
# BUG FIX: the original rebound the name `Counter` to an instance
# (`Counter = Counter(tokey)`), shadowing the imported class for the rest
# of the module. Bind the tally to a distinct name instead.
word_counts = Counter(tokey)
most_com = word_counts.most_common(10)
print(most_com)
### Top tokens — "elsevier" (247 occurrences), "acid" (196 occurrences)
## Group members
## Tasks
---
## Some example HackMD formatting...
*Equations can be written as LaTeX...*
$$\frac{\partial {\bf u}}{\partial t} + \left({\bf u}\cdot\nabla\right){\bf u} - \nu \nabla^2 {\bf u} = -\nabla w + {\bf g}$$
or simply $x^2 + y^2 = z^2$
*Tables are simpler forms:*
| Column 1 | Column 2 | Column 3 |
| -------- | -------- | -------- |
| Text | Text | Text |
You can use <font color=blue> colour </font> if you want <font color=red> emphasis </font>
and
## Add titles
and
page breaks
------
*Drag and drop images:*

* Bullet 1
* Bullet 2
---
---
What are the most important
{"metaMigratedAt":"2023-06-15T10:58:49.756Z","metaMigratedFrom":"Content","title":"SDG-ACE Hack","breaks":true,"contributors":"[{\"id\":null,\"add\":3049,\"del\":547},{\"id\":\"923b462b-e0b9-4d93-b30a-de2525f4e5eb\",\"add\":65,\"del\":26},{\"id\":\"6587d962-f75f-40c6-9b93-bb09287c706a\",\"add\":671,\"del\":4}]"}