---
tags: Maker Scraper
---
# Update DB
import requests
import numpy as np
import pandas as pd
import datetime
from bs4 import BeautifulSoup
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium import webdriver
import re
import time
# Forum tag pages to scrape, keyed by tag name. Every URL has the form
# https://forum.makerdao.com/tag/<tag>, so the mapping is generated from the
# list of tags. Entries that are commented out are left out of the mapping.
MAP = {
    tag: f'https://forum.makerdao.com/tag/{tag}'
    for tag in (
        'gov-001',
        'sf-001',
        'ces-001',
        'sas-001',
        'risk-001',
        'tech-001',
        # 'sh-001',
        'gro-001',
        # 'rwf-001',
        'com-001',
        'dux-001',
        'sne-001',
        'deco-001',
        # 'events-001',
        'pe-001',
        'ses-001',
        'is-001',
        'daif-001',
        'din-001',
        'ora-001',
    )
}
# Scrape every tag page listed in MAP.
#
# For each page the static HTML (requests + BeautifulSoup) supplies the post
# titles, tags, reply/view counts and links, and a headless Chrome session
# supplies the "title" attribute of the fifth cell of each table row, from
# which the first-post date is cut out. The two are combined into `merged_df`
# and its head is printed.
#
# A single headless browser is shared by all pages and is always shut down in
# the `finally` clause, so no Chrome processes are left running when the
# script ends or fails part-way through.
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)
try:
    for doc, url in MAP.items():
        # The timeout keeps an unresponsive server from hanging the whole run.
        res = requests.get(url, timeout=30).text
        soup = BeautifulSoup(res, 'html.parser')

        # Post links from the first page. A former `if a.stripped_strings`
        # filter was dropped: it tested a generator object, which is always
        # truthy, so it never excluded anything.
        links = [a['href'] for a in soup.find_all('a', class_='title')]

        # Flatten every table cell into its non-blank text lines. Commas are
        # removed (split on ',' and re-joined with spaces) and each line is
        # stored as a one-element list; the bracket/quote stripping further
        # down relies on that list wrapping.
        data = []
        for cell in soup.find_all(re.compile("td")):
            parts = [part.strip() for part in cell.text.split(',')]
            for line in ' '.join(parts).splitlines():
                if line.strip():
                    data.append([line])

        # The slicing assumes the flattened lines repeat in groups of six per
        # post. The sixth slot is not sliced out: it was never placed in the
        # frame, because the dates come from the browser session below.
        titles = data[0::6]
        mainTag = data[1::6]
        subTags = data[2::6]
        replies = data[3::6]
        views = data[4::6]

        # Break each sub-tag line into its individual words.
        sub_tags_final = [' '.join(item).split() for item in subTags]

        # Pad the links so the column is as long as the titles column.
        if len(links) < len(titles):
            links += (len(titles) - len(links)) * ['Archived']

        df = pd.DataFrame({
            'titles': titles,
            'mainTag': mainTag,
            'subTags': sub_tags_final,
            'replies': replies,
            'views': views,
            'links': links,
        })

        # Cast every column to string and strip the list brackets, then
        # remove the quote characters left over from the list repr.
        for column in df.columns:
            df[column] = df[column].astype(str).str.strip('[]')
        df = df.replace("'", '', regex=True)

        # Convert counts such as "1.2k" to numbers: the numeric part times
        # 1000 when a trailing "k" is present, times 1 otherwise.
        for column in ('replies', 'views'):
            magnitude = df[column].replace(
                r'[k]+$', '', regex=True).astype(float)
            multiplier = df[column].str.extract(
                r'[\d\.]+([k]+)', expand=False
            ).fillna(1).replace(['k'], [10**3]).astype(int)
            df[column] = magnitude * multiplier

        # subTags split and clean
        df['subTags'] = df['subTags'].astype(str).str.split(',')
        df['subTags'] = df['subTags'].astype(str).str.strip('[]')
        # NOTE(review): the second replace swaps a space for a space, i.e. it
        # changes nothing as written; it may have been meant to collapse runs
        # of spaces - confirm before altering.
        df['subTags'] = df['subTags'].replace(
            "'", '', regex=True).replace(' ', ' ', regex=True)

        # Scrape the JS hover-element dates from the rendered page.
        driver.get(url)
        clean_dates = []
        for cell in driver.find_elements(By.XPATH, '//*/tr[*]/td[5]'):
            # A missing title attribute is treated as an empty string.
            hover = (cell.get_attribute('title') or '').replace('\n', ' | ')
            # Keep only the text before the first separator and drop the
            # "First post: " label.
            first_post = hover.split('|', 1)[0].replace('First post: ', '')
            # Cut at the first ':' (start of the minutes), then peel the
            # remaining hour digits and trailing space off the end.
            stamp = first_post.split(':', 1)[0]
            if stamp:  # guard: indexing an empty string would raise
                stamp = stamp.rstrip(stamp[-1])
            stamp = stamp.rstrip('1').rstrip(' ')
            clean_dates.append(stamp.splitlines())

        # Clean the dates the same way as the other columns.
        df2 = pd.DataFrame({'dates': clean_dates})
        dates = df2['dates'].astype(str).str.strip('[]')
        # NOTE(review): same space-for-space replace as above - confirm intent.
        dates = dates.replace(
            "'", '', regex=True).replace(' ', ' ', regex=True)

        # Attach the dates as a new column (aligned on the row index).
        merged_df = df.assign(Dates=dates)
        print(merged_df.head())

        # File name carrying the current date, used by the (currently
        # disabled) CSV export below.
        current_date = f'cu_{datetime.datetime.now().strftime("%m.%d.%Y")}.csv'
        # merged_df.to_csv(current_date, mode='a', encoding='utf-8')
finally:
    driver.quit()