# Python Web Crawler

A random-walk crawler: starting from a seed page, it repeatedly picks a random external link and follows it.
```
# Get a random external URL
from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re, datetime, random

# Seed the RNG with a number; random.seed() no longer accepts datetime
# objects on Python 3.11+
random.seed(datetime.datetime.now().timestamp())
# Retrieves a list of all internal links found on a page
def getILs(bs, iUrl):
    iUrl = f'{urlparse(iUrl).scheme}://{urlparse(iUrl).netloc}'
    iLs = []
    # Finds all links that begin with a "/" or contain the site's own URL;
    # re.escape keeps the dots in the domain from acting as regex wildcards
    for link in bs.find_all('a', {'href': re.compile(f'^(/|.*{re.escape(iUrl)})')}):
        a = link.attrs['href']
        if a and a not in iLs:
            # Expand root-relative paths to absolute URLs
            if a.startswith('/'):
                iLs.append(iUrl + a)
            else:
                iLs.append(a)
    return iLs
# Retrieves a list of all external links found on a page
def gELs(bs, eUrl):
    eLs = []
    # Finds all links that start with "http" or "www" and do not
    # contain the current domain (eUrl is escaped so its dots are literal)
    for link in bs.find_all('a', {'href': re.compile(f'^(http|www)((?!{re.escape(eUrl)}).)*$')}):
        a = link.attrs['href']
        if a and a not in eLs:
            eLs.append(a)
    return eLs
# Returns a random external link from a page; if the page has none,
# hops to a random internal link and tries again
def getRL(sP):
    html = urlopen(sP)
    bs = BeautifulSoup(html, 'lxml')  # needs lxml installed; 'html.parser' also works
    eLs = gELs(bs, urlparse(sP).netloc)
    if not eLs:
        print('No external links, looking around the site for one')
        domain = f'{urlparse(sP).scheme}://{urlparse(sP).netloc}'
        iLs = getILs(bs, domain)
        return getRL(random.choice(iLs))
    return random.choice(eLs)
# Follows one random external link after another; this recurses until an
# unreachable page or Python's recursion limit stops it
def fEO(startingSite):
    eL = getRL(startingSite)
    print(f'Random external link is: {eL}')
    fEO(eL)

fEO('http://oreilly.com')
```
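
With the final `fEO(...)` call commented out, the two link extractors can be checked offline by handing them a hand-written page, so you can see what `getILs` and `gELs` return without touching the network. The HTML snippet and the example.com / other-site.org domains below are made up for illustration:

```
from bs4 import BeautifulSoup

# Assumes getILs and gELs from the script above are already defined.
# The page content and domains here are hypothetical.
sample_html = '''<html><body>
<a href="/about">About</a>
<a href="http://example.com/blog">Blog</a>
<a href="http://other-site.org/news">News</a>
</body></html>'''

bs = BeautifulSoup(sample_html, 'lxml')
print(getILs(bs, 'http://example.com'))
# ['http://example.com/about', 'http://example.com/blog']
print(gELs(bs, 'example.com'))
# ['http://other-site.org/news']
```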
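As written, `fEO` runs until the first unreachable page or the interpreter's recursion limit stops it. A minimal sketch of a bounded, error-tolerant walk, assuming the same `getRL` as above (the `follow_external_only` name and `max_hops` parameter are my own additions, not part of the original script):

```
# Hypothetical bounded variant of fEO: walks at most max_hops links
# and stops cleanly on a page that fails to load instead of crashing
def follow_external_only(starting_site, max_hops=10):
    site = starting_site
    for _ in range(max_hops):
        try:
            site = getRL(site)
        except Exception as e:  # e.g. urllib's HTTPError or URLError
            print(f'Could not crawl {site}: {e}')
            break
        print(f'Random external link is: {site}')

follow_external_only('http://oreilly.com')
```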