---
tags: Maker Scraper
---
import datetime
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
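# NOTE: webdriver.Firefox needs a local Firefox install plus a matching geckodriver
# (recent Selenium releases can resolve the driver automatically via Selenium Manager).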
# Scrape ALL posts
def scrolling(driver):
    # Get the initial scroll height.
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        # Scroll down to the bottom.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Wait for the page to load (seconds).
        time.sleep(10)
        # Calculate the new scroll height and compare it with the last one.
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
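# The fixed 10-second sleep above is a conservative design choice: if a page takes
# longer than that to append new posts, the height check mistakes the stall for the
# end of the feed and stops scrolling early.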
def scrape_all_posts():
    # topic-list URL for each tag
    MAP = {
        'gov-001': 'https://forum.makerdao.com/tag/gov-001',
        'sf-001': 'https://forum.makerdao.com/tag/sf-001',
        'ces-001': 'https://forum.makerdao.com/tag/ces-001',
        'sas-001': 'https://forum.makerdao.com/tag/sas-001',
        'risk-001': 'https://forum.makerdao.com/tag/risk-001',
        'tech-001': 'https://forum.makerdao.com/tag/tech-001',
        # 'sh-001': 'https://forum.makerdao.com/tag/sh-001',
        'gro-001': 'https://forum.makerdao.com/tag/gro-001',
        # 'rwf-001': 'https://forum.makerdao.com/tag/rwf-001',
        'com-001': 'https://forum.makerdao.com/tag/com-001',
        'dux-001': 'https://forum.makerdao.com/tag/dux-001',
        'sne-001': 'https://forum.makerdao.com/tag/sne-001',
        'deco-001': 'https://forum.makerdao.com/tag/deco-001',
        # 'events-001': 'https://forum.makerdao.com/tag/events-001',
        'pe-001': 'https://forum.makerdao.com/tag/pe-001',
        'ses-001': 'https://forum.makerdao.com/tag/ses-001',
        'is-001': 'https://forum.makerdao.com/tag/is-001',
        'daif-001': 'https://forum.makerdao.com/tag/daif-001',
        'din-001': 'https://forum.makerdao.com/tag/din-001',
        'ora-001': 'https://forum.makerdao.com/tag/ora-001'
    }
    for doc in MAP:
        url = MAP[doc]
        # Driver options. Note: '--ignore-certificate-errors' and '--incognito'
        # are Chrome-style flags that Firefox ignores; only '--headless' takes effect.
        options = webdriver.FirefoxOptions()
        options.add_argument('--ignore-certificate-errors')
        options.add_argument('--incognito')
        options.add_argument('--headless')
        # driver instance
        driver = webdriver.Firefox(options=options)
        driver.get(url)
        # infinite scroll
        scrolling(driver)
        page_source = driver.page_source  # raw html
        page = page_source.encode('utf-8')  # encode for soup
        soup = BeautifulSoup(page, 'html.parser')
        links = []
        data = []
        # grab the post links from the topic list (anchors with class 'title')
        links.extend(a['href'] for a in soup.find_all('a', class_='title')
                     if a.get_text(strip=True))  # first-page post links
        # flatten the text of every table cell into comma-split rows
        for alldata in soup.find_all('td'):
            raw = alldata.text.split(',')
            cc = [part.strip() for part in raw]
            cl = ' '.join(cc).splitlines()
            new_list = [e for e in cl if e.strip()]
            for i in new_list:
                data.append(i.split(','))
        dates = []  # get the JS-rendered dates
        dts = driver.find_elements(By.XPATH, '//*/tr[*]/td[5]')
        for date in dts:
            # swap the newline for a separator char
            dates.append(date.get_attribute('title').replace('\n', ' | '))
        clean_dates = []  # strip unwanted text
        for date in dates:
            sep = '|'  # remove everything after the separators
            sep2 = ':'
            dt = date.split(sep, 1)[0]  # strip the last-post time
            dtstr = dt.replace('First post: ', '')
            dtt = dtstr.split(sep2, 1)[0]  # strip h:mm and unwanted end chars
            rinsed = dtt.rstrip(dtt[-1])  # strip the last char
            final_dts = rinsed.rstrip('1')  # strip leftovers
            final_dts = final_dts.rstrip(' ')
            clean_dates.append(final_dts.splitlines())
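        # Worked example of the loop above, assuming a Discourse title attribute
        # like 'First post: Mar 3, 2022 1:15 am\nPosted: Apr 1, 2022 2:30 pm':
        #   replace newline -> 'First post: Mar 3, 2022 1:15 am | Posted: ...'
        #   split on '|'    -> 'First post: Mar 3, 2022 1:15 am '
        #   drop the prefix -> 'Mar 3, 2022 1:15 am '
        #   split on ':'    -> 'Mar 3, 2022 1'
        #   rstrip the rest -> 'Mar 3, 2022'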
        # end the browser session
        driver.quit()
        print("Driver closed.")
        # each post contributes six consecutive rows; slice them apart (start::step)
        titles = data[0::6]
        mainTag = data[1::6]
        subTags = data[2::6]
        replies = data[3::6]
        views = data[4::6]
        dates = data[5::6]
        # cleaning columns
        sub_tags_final = []
        for item in subTags:
            joined = ' '.join(item)
            sub_tags_final.append(joined.split())
        df = pd.DataFrame({
            'titles': titles,
            'mainTag': mainTag,
            'subTags': sub_tags_final,
            'replies': replies,
            'views': views,
            'links': links
        })
        # cast to string and strip the list brackets left by the slicing
        for col in df.columns:
            df[col] = df[col].astype(str).str.strip('[]')
        # strip quotes
        df = df.replace("'", '', regex=True)
        # cast the counts to numeric dtypes ('1.2k'-style values carry a 'k' suffix)
        for col in ('replies', 'views'):
            multiplier = (df[col].str.extract(r'[\d\.]+([k]+)', expand=False)
                          .fillna(1).replace(['k'], [10**3]).astype(int))
            df[col] = df[col].replace(r'[k]+$', '', regex=True).astype(float) * multiplier
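        # e.g. replies of '1.2k': strip the suffix -> 1.2, extract 'k' -> 1000,
        # 1.2 * 1000 = 1200; a plain '42' has no suffix, so the multiplier is 1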
        # split subTags apart and clean the residue
        df['subTags'] = df['subTags'].astype(str).str.split(',')
        df['subTags'] = df['subTags'].astype(str).str.strip('[]')
        df['subTags'] = (df['subTags'].replace("'", '', regex=True)
                         .replace(r'\s+', ' ', regex=True))
        # date frame to append to the main frame
        df2 = pd.DataFrame({'dates': clean_dates})
        # cleaning dates for framing
        dates = df2['dates']
        dates = dates.astype(str).str.strip('[]')
        dates = (dates.replace("'", '', regex=True)
                 .replace(r'\s+', ' ', regex=True))
        # merge the frames; pandas aligns on index
        merged_df = df.assign(Dates=dates)
        # posts beyond the scraped dates end up as NaN after the index alignment;
        # mark those rows as 'Archived'
        merged_df['Dates'] = merged_df['Dates'].fillna('Archived')
        # print_full(merged_df)
        out_file = f'all_posts{datetime.datetime.now().strftime("%m.%d.%Y")}.csv'
        merged_df.to_csv(out_file, mode='a', encoding='utf-8')
        yield merged_df
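# Minimal usage sketch (an assumption, not part of the original script):
# scrape_all_posts() is a generator, so iterating it drives the per-tag
# scrape-and-export loop one tag at a time.
if __name__ == '__main__':
    for tag_df in scrape_all_posts():
        print(f'Scraped and saved {len(tag_df)} posts')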