# Python Web Crawler

A random-walk crawler: starting from a seed page, it repeatedly picks a random external link and follows it.
```
# Get a random external URL
from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re, datetime, random

# Seed the RNG with a number; random.seed() no longer accepts datetime
# objects on Python 3.11+
random.seed(datetime.datetime.now().timestamp())
# Retrieves a list of all internal links found on a page
def getILs(bs, iUrl):
    iUrl = f'{urlparse(iUrl).scheme}://{urlparse(iUrl).netloc}'
    iLs = []
    # Finds all links that begin with a "/" or contain the site's own URL;
    # re.escape keeps the dots in the domain from acting as regex wildcards
    for link in bs.find_all('a', {'href': re.compile(f'^(/|.*{re.escape(iUrl)})')}):
        a = link.attrs['href']
        if a and a not in iLs:
            # Expand root-relative paths to absolute URLs
            if a.startswith('/'):
                iLs.append(iUrl + a)
            else:
                iLs.append(a)
    return iLs
# Retrieves a list of all external links found on a page
def gELs(bs, eUrl):
    eLs = []
    # Finds all links that start with "http" or "www" and do not
    # contain the current domain (eUrl is escaped so its dots are literal)
    for link in bs.find_all('a', {'href': re.compile(f'^(http|www)((?!{re.escape(eUrl)}).)*$')}):
        a = link.attrs['href']
        if a and a not in eLs:
            eLs.append(a)
    return eLs
# Returns a random external link from a page; if the page has none,
# hops to a random internal link and tries again
def getRL(sP):
    html = urlopen(sP)
    bs = BeautifulSoup(html, 'lxml')  # needs lxml installed; 'html.parser' also works
    eLs = gELs(bs, urlparse(sP).netloc)
    if not eLs:
        print('No external links, looking around the site for one')
        domain = f'{urlparse(sP).scheme}://{urlparse(sP).netloc}'
        iLs = getILs(bs, domain)
        return getRL(random.choice(iLs))
    return random.choice(eLs)
# Follows one random external link after another; this recurses until an
# unreachable page or Python's recursion limit stops it
def fEO(startingSite):
    eL = getRL(startingSite)
    print(f'Random external link is: {eL}')
    fEO(eL)

fEO('http://oreilly.com')
```
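
With the final `fEO(...)` call commented out, the two link extractors can be checked offline by handing them a hand-written page, so you can see what `getILs` and `gELs` return without touching the network. The HTML snippet and the example.com / other-site.org domains below are made up for illustration:

```
from bs4 import BeautifulSoup

# Assumes getILs and gELs from the script above are already defined.
# The page content and domains here are hypothetical.
sample_html = '''<html><body>
<a href="/about">About</a>
<a href="http://example.com/blog">Blog</a>
<a href="http://other-site.org/news">News</a>
</body></html>'''

bs = BeautifulSoup(sample_html, 'lxml')
print(getILs(bs, 'http://example.com'))
# ['http://example.com/about', 'http://example.com/blog']
print(gELs(bs, 'example.com'))
# ['http://other-site.org/news']
```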
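As written, `fEO` runs until the first unreachable page or the interpreter's recursion limit stops it. A minimal sketch of a bounded, error-tolerant walk, assuming the same `getRL` as above (the `follow_external_only` name and `max_hops` parameter are my own additions, not part of the original script):

```
# Hypothetical bounded variant of fEO: walks at most max_hops links
# and stops cleanly on a page that fails to load instead of crashing
def follow_external_only(starting_site, max_hops=10):
    site = starting_site
    for _ in range(max_hops):
        try:
            site = getRL(site)
        except Exception as e:  # e.g. urllib's HTTPError or URLError
            print(f'Could not crawl {site}: {e}')
            break
        print(f'Random external link is: {site}')

follow_external_only('http://oreilly.com')
```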