The code is a bit confusing to me but let's stick to errors for a start. The goal is to start from some page and then find external links following one by one:
but then get this error:
import requests from bs4 import BeautifulSoup import re import datetime import random pages = set() random.seed(datetime.datetime.now()) # Retrieves a list of all Internal links found on a page def getInternalLinks(bsObj, includeUrl): includeUrl = requests.utils.urlparse(includeUrl).scheme+"://"+urlparse(includeUrl).netloc internalLinks = [] # finds all links that begin with "/" for link in bsObj.find_all("a", href=re.compile("^(/|.*"+includeUrl+")")): if link.attrs['href'] is not None: if link.attrs['href'] not in internalLinks: if(link.attrs['href'].startswith("/")): internalLinks.append(includeUrl+link.attrs['href']) else: internalLinks.append(link.attrs['href']) return internalLinks # Retrieves a list of all external links found on a page def getExternalLinks(bsObj, excludeUrl): externalLinks = [] # finds all links that start with "http" that do not contain the current URL for link in bsObj.find_all("a", href=re.compile("^(http|www)((?!"+excludeUrl+").)*$")): if link.attrs['href'] not in externalLinks: externalLinks.append(link.attrs['href']) return externalLinks def getRandomExternalLink(startingPage): html = requests.get(startingPage) bsObj = BeautifulSoup(html.content, 'html.parser') externalLinks = getExternalLinks(bsObj, requests.utils.urlparse(startingPage).netloc) if len(externalLinks) == 0: print("No external links, looking around the site for one") domain = requests.utils.urlparse(startingPage).scheme+"://"+urlparse(startingPage).netloc internalLinks = getInternalLinks(bsObj, domain) return getRandomExternalLink(internalLinks[random.randint(0, len(internalLinks)-1)]) else: return externalLinks[random.randint(0, len(externalLinks)-1)] def followExternalOnly(startingSite): externalLink = getRandomExternalLink(startingSite) print("Random external link is: "+externalLink) followExternalOnly(externalLink) followExternalOnly("http://oreilly.com")error:
Error:Random external link is: https://www.safaribooksonline.com/public/free-trial/
Traceback (most recent call last):
File "C:\Python36\kodovi\crawler.py", line 52, in <module>
followExternalOnly("http://oreilly.com")
File "C:\Python36\kodovi\crawler.py", line 50, in followExternalOnly
followExternalOnly(externalLink)
File "C:\Python36\kodovi\crawler.py", line 48, in followExternalOnly
externalLink = getRandomExternalLink(startingSite)
File "C:\Python36\kodovi\crawler.py", line 39, in getRandomExternalLink
if len(externalLinks) == 0:
TypeError: object of type 'NoneType' has no len()
I tried to switch line 37 with if externalLinks is None
but then get this error:
Error:Random external link is: https://www.safaribooksonline.com/public/free-trial/
No external links, looking around the site for one
Traceback (most recent call last):
File "C:\Python36\kodovi\crawler.py", line 52, in <module>
followExternalOnly("http://oreilly.com")
File "C:\Python36\kodovi\crawler.py", line 50, in followExternalOnly
followExternalOnly(externalLink)
File "C:\Python36\kodovi\crawler.py", line 48, in followExternalOnly
externalLink = getRandomExternalLink(startingSite)
File "C:\Python36\kodovi\crawler.py", line 41, in getRandomExternalLink
domain = requests.utils.urlparse(startingPage).scheme+"://"+urlparse(startin
gPage).netloc
NameError: name 'urlparse' is not defined
Not sure how urlparse is not defined when it's part of requests modul. If you have any suggestion how to improve this code I'll be glad to hear it. Also, have an impression that it's too complicated but I'm only following a book.