Python Forum
Any ideas on making this python web crawler code better, simpler, or optimal?
#1
import requests
import argparse
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup

#initialize the sets of links
internal_links = set()
external_links = set()

total_links_visited = 0

def is_valid(url):
    #checks whether url has both a scheme and a network location
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)

def get_all_website_links(url):
    #returns all URLs found on url that belong to the same website
    #internal URLs found on this page
    urls = set()
    #domain name of the URL without the protocol
    domain_name = urlparse(url).netloc
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    for a_tag in soup.find_all("a"):
        href_tag = a_tag.attrs.get("href")
        if not href_tag:
            #no href attribute, or it is empty
            continue
        #join the URL if it's relative
        href_tag = urljoin(url, href_tag)
        parsed_href = urlparse(href_tag)
        #remove URL GET parameters, URL fragments, etc.
        href_tag = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
        if not is_valid(href_tag):
            #not a valid URL
            continue 
        if href_tag in internal_links:
            #already in the set
            continue
        if domain_name not in href_tag:
            #external link
            if href_tag not in external_links:
                print(f"External link: {href_tag}")
                external_links.add(href_tag)
            continue
        print(f"Internal link: {href_tag}")
        urls.add(href_tag)
        internal_links.add(href_tag)
    return urls

def crawl(url, max_urls=50):
    #crawls a web page and recursively follows the internal links it finds, up to max_urls
    global total_links_visited
    total_links_visited += 1
    links = get_all_website_links(url)
    for link in links:
        if total_links_visited > max_urls:
            break
        crawl(link, max_urls=max_urls)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Link Extractor Tool with Python")
    parser.add_argument("url", help="The URL to extract links from.")
    parser.add_argument("-m", "--max-urls", help="Number of max URLs to crawl, default is 30.", default=30, type=int)
    
    args = parser.parse_args()
    url = args.url
    max_urls = args.max_urls

    crawl(url, max_urls=max_urls)

    print("Total Internal Links:", len(internal_links))
    print("Total External Links:", len(external_links))
    print("Total URLs:", len(external_links) + len(internal_links))

    domain_name = urlparse(url).netloc

    # save the internal links to a file
    with open(f"{domain_name}_internal_links.txt", "w") as f:
        for internal_link in internal_links:
            print(internal_link.strip(), file=f)

    # save the external links to a file
    with open(f"{domain_name}_external_links.txt", "w") as f:
        for external_link in external_links:
            print(external_link.strip(), file=f)
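
For comparison, here is a minimal sketch of one way the crawler could be simplified: an iterative, breadth-first crawl that drops the recursion and the module-level globals, reuses a single requests.Session, and compares the parsed netloc instead of doing a substring check on the URL. The crawl_site name and the overall structure are only illustrative and untested, not a drop-in replacement for the code above:

import argparse
from collections import deque
from urllib.parse import urlparse, urljoin

import requests
from bs4 import BeautifulSoup

def crawl_site(start_url, max_urls=30):
    #iterative breadth-first crawl: no recursion, no globals
    domain = urlparse(start_url).netloc
    internal, external = set(), set()
    queue = deque([start_url])
    session = requests.Session()  #reuses the TCP connection across requests
    while queue and len(internal) < max_urls:
        url = queue.popleft()
        try:
            response = session.get(url, timeout=10)
        except requests.RequestException:
            #skip pages that fail to load instead of crashing the crawl
            continue
        soup = BeautifulSoup(response.content, "html.parser")
        for a_tag in soup.find_all("a", href=True):
            href = urljoin(url, a_tag["href"])
            parsed = urlparse(href)
            if parsed.scheme not in ("http", "https"):
                #skip mailto:, javascript:, etc.
                continue
            #rebuild the URL without query string or fragment
            href = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
            if parsed.netloc != domain:
                #comparing netloc avoids false matches from substring checks
                external.add(href)
            elif href not in internal:
                internal.add(href)
                queue.append(href)
    return internal, external

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Link Extractor Tool with Python")
    parser.add_argument("url", help="The URL to extract links from.")
    parser.add_argument("-m", "--max-urls", default=30, type=int)
    args = parser.parse_args()
    internal, external = crawl_site(args.url, max_urls=args.max_urls)
    print("Total Internal Links:", len(internal))
    print("Total External Links:", len(external))

Note that in this sketch max_urls caps the number of internal links collected rather than the number of pages visited, so the stopping behaviour is close to, but not identical to, the original; the deque makes the crawl breadth-first instead of depth-first.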