---
tags: Maker Scraper
---

```python
import datetime
import re
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By


# Scrape ALL posts
def scrolling(driver):
    # Get scroll height.
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        # Scroll down to the bottom.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Wait for the page to load (seconds).
        time.sleep(10)
        # Calculate new scroll height and compare with the last scroll height.
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height


def scrape_all_posts():
    MAP = {
        'gov-001': 'https://forum.makerdao.com/tag/gov-001',
        'sf-001': 'https://forum.makerdao.com/tag/sf-001',
        'ces-001': 'https://forum.makerdao.com/tag/ces-001',
        'sas-001': 'https://forum.makerdao.com/tag/sas-001',
        'risk-001': 'https://forum.makerdao.com/tag/risk-001',
        'tech-001': 'https://forum.makerdao.com/tag/tech-001',
        # 'sh-001': 'https://forum.makerdao.com/tag/sh-001',
        'gro-001': 'https://forum.makerdao.com/tag/gro-001',
        # 'rwf-001': 'https://forum.makerdao.com/tag/rwf-001',
        'com-001': 'https://forum.makerdao.com/tag/com-001',
        'dux-001': 'https://forum.makerdao.com/tag/dux-001',
        'sne-001': 'https://forum.makerdao.com/tag/sne-001',
        'deco-001': 'https://forum.makerdao.com/tag/deco-001',
        # 'events-001': 'https://forum.makerdao.com/tag/events-001',
        'pe-001': 'https://forum.makerdao.com/tag/pe-001',
        'ses-001': 'https://forum.makerdao.com/tag/ses-001',
        'is-001': 'https://forum.makerdao.com/tag/is-001',
        'daif-001': 'https://forum.makerdao.com/tag/daif-001',
        'din-001': 'https://forum.makerdao.com/tag/din-001',
        'ora-001': 'https://forum.makerdao.com/tag/ora-001'
    }
    for doc in MAP:
        url = MAP[doc]
        # set driver options
        options = webdriver.FirefoxOptions()
        options.add_argument('--ignore-certificate-errors')
        options.add_argument('--incognito')
        options.add_argument('--headless')
        # driver instance
        driver = webdriver.Firefox(options=options)
        driver.get(url)
        # infinite scroll
        scrolling(driver)
        page_source = driver.page_source    # raw html
        page = page_source.encode('utf-8')  # encoding for soup
        soup = BeautifulSoup(page, 'html.parser')
        links = []
        data = []
        # grab top- and bottom-line post/tag links
        top_line = [a['href'] for a in soup.find_all('a', class_='title')
                    if a.stripped_strings]
        # first-page post links
        for line in top_line:
            links.append(line)
        # flatten every table cell into comma-split row values
        for alldata in soup.find_all(re.compile("td")):
            raw = alldata.text.split(',')
            cc = list(map(lambda a: a.strip(), raw))
            cl = ' '.join(cc).splitlines()
            new_list = [e for e in cl if e.strip()]
            for i in new_list:
                data.append(i.split(','))
        dates = []
        # get the js-rendered dates (fifth cell of each row)
        dts = driver.find_elements(By.XPATH, '//*/tr[*]/td[5]')
        for date in dts:
            # replace newlines with a separator char
            dates.append(date.get_attribute('title').replace('\n', ' | '))
        clean_dates = []
        # strip unwanted text
        for date in dates:
            sep = '|'   # remove everything after the separators
            sep2 = ':'
            dt = date.split(sep, 1)[0]      # strip last-post time
            dtstr = dt.replace('First post: ', '')
            dtt = dtstr.split(sep2, 1)[0]   # strip h:mm, unwanted end chars
            rinsed = dtt.rstrip(dtt[-1])    # strip last char
            final_dts = rinsed.rstrip('1')  # strip leftovers
            final_dts = final_dts.rstrip(' ')
            clean_dates.append(final_dts.splitlines())
        # quit driver instance (also releases the geckodriver process)
        driver.quit()
        print("Driver closed.")
        # start:stop:step slicing to split the flat cell list into columns
        titles = data[0::6]
        mainTag = data[1::6]
        subTags = data[2::6]
        replies = data[3::6]
        views = data[4::6]
        dates = data[5::6]
        links = links[0:]
        # cleaning columns
        sub_tags_final = []
        for item in subTags:
            i = ' '.join(item)
            sub_tags_final.append(i.split())
        df = pd.DataFrame({
            'titles': titles,
            'mainTag': mainTag,
            'subTags': sub_tags_final,
            'replies': replies,
            'views': views,
            'links': links
        })
        # cast to string to strip unwanted bracket characters
        for col in df.columns:
            df[col] = df[col].astype(str).str.strip('[]')
        # strip quotes
        df = df.replace("'", '', regex=True)
        # cast counts to proper numbers, expanding the 'k' suffix ('1.2k' -> 1200)
        for col in ('replies', 'views'):
            df[col] = df[col].replace(r'[k]+$', '', regex=True).astype(float) * \
                df[col].str.extract(r'[\d\.]+([k]+)', expand=False) \
                       .fillna(1).replace(['k'], [10**3]).astype(int)
        # split and clean subTags
        df['subTags'] = df['subTags'].astype(str).str.split(',')
        df['subTags'] = df['subTags'].astype(str).str.strip('[]')
        df['subTags'] = df['subTags'].replace("'", '', regex=True) \
                                     .replace('  ', ' ', regex=True)
        # date frame to append to the main frame
        df2 = pd.DataFrame({'dates': clean_dates})
        # cleaning dates for framing
        dates = df2['dates']
        dates = dates.astype(str).str.strip('[]')
        dates = dates.replace("'", '', regex=True).replace('  ', ' ', regex=True)
        # match index lengths: pad missing dates with a placeholder, drop extras
        dates = dates.reindex(range(len(df)), fill_value='Archived')
        # merge frames
        merged_df = df.assign(dates=dates.values)
        # print_full(merged_df)
        current_date = f'all_posts{datetime.datetime.now().strftime("%m.%d.%Y")}.csv'
        merged_df.to_csv(current_date, mode='a', encoding='utf-8')
        yield merged_df
```
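
The replies/views cast above multiplies the numeric part of each count by 1,000 whenever the forum abbreviates it with a `k` suffix. A minimal standalone check of that pattern (the sample values are made up):

```python
import pandas as pd

s = pd.Series(['1.2k', '15', '3k'])
nums = s.replace(r'[k]+$', '', regex=True).astype(float)  # drop the suffix
mult = (s.str.extract(r'[\d\.]+([k]+)', expand=False)     # 'k' where present
         .fillna(1).replace(['k'], [10**3]).astype(int))  # -> 1000 or 1
print((nums * mult).tolist())  # [1200.0, 15.0, 3000.0]
```

`scrape_all_posts()` is a generator: it yields one DataFrame per tag and appends each batch to the day's CSV as a side effect. A minimal driver loop, assuming Firefox and geckodriver are installed and on `PATH`:

```python
if __name__ == '__main__':
    for frame in scrape_all_posts():
        print(frame.head())  # inspect each tag's posts as they arrive
```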