It is common to write scripts that parse webpages for the text and information they contain. These scripts are called web scrapers. This is a great way to create your own corpora for NLP tasks!
Here is a simple web scraper that recursively collects content from wiki articles (this has been tested with Wikipedia and wikis hosted on Fandom.com, but others may work as well). To run, you must first install a few pip packages and run a few commands:
Windows:
    > pip install nltk
    > pip install beautifulsoup4
    > python
    >>> import nltk
    >>> nltk.download('punkt')
Mac:
    > pip3 install nltk
    > pip3 install beautifulsoup4
    > python3
    >>> import nltk
    >>> nltk.download('punkt')
Here is the scraper (copy and paste into a file called wiki_scraper.py):
ββββfrom nltk.tokenize import word_tokenize
ββββfrom urllib.parse import urlparse
ββββfrom bs4 import BeautifulSoup
ββββimport requests
ββββimport codecs
ββββimport random
ββββimport re
ββββimport sys
def parse_wikipedia_rec(url: str, depth: int, rand_links: int) -> list:
    """Recursively scrape a wiki article's paragraphs into a token corpus.

    Fetches `url`, tokenizes each paragraph into a list of tokens bracketed
    by '<START>' and '<END>', then follows up to `rand_links` random internal
    '/wiki/' links, recursing until `depth` is exhausted.

    Args:
        url: Full URL of the wiki article to scrape.
        depth: Remaining recursion depth (1 scrapes only this page).
        rand_links: Maximum number of random links to follow at each level.

    Returns:
        A flat list of paragraphs, each a list of tokens starting with
        '<START>' and ending with '<END>'.
    """
    print(url)
    u = urlparse(url)
    depth -= 1
    links = set()
    corpus = []
    # Timeout prevents the crawl from hanging forever on a dead server.
    result = requests.get(url, timeout=30).text
    doc = BeautifulSoup(result, "html.parser")
    output = doc.find(class_="mw-parser-output")
    if output is None:
        # Not a standard wiki article page (e.g. a redirect or special page).
        return corpus
    # Drop placeholder paragraphs that contain no prose.
    for empty in output.find_all("p", class_='mw-empty-elt'):
        empty.extract()
    for paragraph in output.find_all("p"):
        # Strip citation superscripts like "[1]" before tokenizing.
        for sup in paragraph.find_all('sup'):
            sup.extract()
        # Collect internal article links for the recursive crawl.
        for href in paragraph.find_all("a", href=re.compile("/wiki/")):
            links.add(href['href'])
        tokens = word_tokenize(paragraph.get_text())
        tokens.insert(0, '<START>')
        tokens.append('<END>')
        corpus.append(tokens)
    if depth > 0:
        links = list(links)
        # A page may have fewer links than requested; sample what exists
        # (random.sample raises ValueError if asked for more than available).
        for link in random.sample(links, min(rand_links, len(links))):
            # extend (not append) keeps the corpus a flat list of paragraphs,
            # matching the documented final format.
            corpus.extend(parse_wikipedia_rec(
                u.scheme + "://" + u.netloc + link,
                depth, rand_links))
    return corpus
# Script entry: wiki_scraper.py [output file] [wiki url] [max depth] [links at each depth]
if len(sys.argv) != 5:
    print("Usage: wiki_scraper.py [output file] [wiki url] [max depth] [links at each depth]")
    # sys.exit is always available; the bare exit() builtin is a site-module
    # convenience that may be absent (python -S, frozen scripts).
    sys.exit(1)
corpus = parse_wikipedia_rec(sys.argv[2], int(sys.argv[3]), int(sys.argv[4]))
# Builtin open handles UTF-8 natively; codecs.open is a legacy API.
with open(sys.argv[1], 'w', encoding='utf-8') as f:
    f.write(str(corpus))
If you would like to scrape your own web pages, I would recommend following the BeautifulSoup tutorial series by Tech With Tim. He goes into detail on how to look through the HTML source of websites and programmatically parse them for whatever content you need.
For our purposes, the final format should be a list of paragraphs where each paragraph is a list of words and punctuation beginning with '<START>'
and ending with '<END>'
. For example:
ββββ[['<START>', 'japan', 'to', 'revise', 'long', '-', 'term', 'energy', 'demand', 'downwards', 'the',
ββββ'ministry', 'of', 'international', 'trade', 'and', 'industry', '(', 'miti', ')', 'will', 'revise',
ββββ'its', 'long', '-', 'term', 'energy', 'supply', '/', 'demand', 'outlook', 'by', 'august', 'to',
ββββ'meet', 'a', 'forecast', 'downtrend', 'in', 'japanese', 'energy', 'demand', ',', 'ministry',
ββββ'officials', 'said', '.', 'miti', 'is', 'expected', 'to', 'lower', 'the', 'projection', 'for',
ββββ'primary', 'energy', 'supplies', 'in', 'the', 'year', '2000', 'to', '550', 'mln', 'kilolitres',
ββββ'(', 'kl', ')', 'from', '600', 'mln', ',', 'they', 'said', '.', 'the', 'decision', 'follows',
ββββ'the', 'emergence', 'of', 'structural', 'changes', 'in', 'japanese', 'industry', 'following',
ββββ'the', 'rise', 'in', 'the', 'value', 'of', 'the', 'yen', 'and', 'a', 'decline', 'in', 'domestic',
ββββ'electric', 'power', 'demand', '.', 'miti', 'is', 'planning', 'to', 'work', 'out', 'a', 'revised',
ββββ'energy', 'supply', '/', 'demand', 'outlook', 'through', 'deliberations', 'of', 'committee',
ββββ'meetings', 'of', 'the', 'agency', 'of', 'natural', 'resources', 'and', 'energy', ',', 'the',
ββββ'officials', 'said', '.', 'they', 'said', 'miti', 'will', 'also', 'review', 'the', 'breakdown',
ββββ'of', 'energy', 'supply', 'sources', ',', 'including', 'oil', ',', 'nuclear', ',', 'coal', 'and',
ββββ'natural', 'gas', '.', 'nuclear', 'energy', 'provided', 'the', 'bulk', 'of', 'japan', "'", 's',
ββββ'electric', 'power', 'in', 'the', 'fiscal', 'year', 'ended', 'march', '31', ',', 'supplying',
ββββ'an', 'estimated', '27', 'pct', 'on', 'a', 'kilowatt', '/', 'hour', 'basis', ',', 'followed',
ββββ'by', 'oil', '(', '23', 'pct', ')', 'and', 'liquefied', 'natural', 'gas', '(', '21', 'pct', '),',
ββββ'they', 'noted', '.', '<END>'], ... ]