#!/usr/bin/env python3
"""archive-fandom-wiki

Usage:
    afw -f <fandom> [-w <workers>] [-b <breezewiki_url>]
    afw -h

Options:
    -f <fandom>          The fandom to archive. (Required)
    -w <workers>         The maximum number of workers to use for
                         concurrent threads. (Optional; Default is 4)
    -b <breezewiki_url>  The URL of the BreezeWiki instance to use.
                         (Optional; Default is
                         https://breezewiki.hyperreal.coffee)
    -h --help            Show this help message.

Examples:
    afw -f dishonored -w 16 -b https://breezewiki.hyperreal.coffee
    afw -f residentevil
"""

# This file is formatted with `black -l 79' to comply with PEP8 standards.
#
# urllib.urlopen is used instead of the requests library because I ran
# into URL quoting issues when using requests that are not a problem when
# using urllib.urlopen.

import concurrent.futures
import shutil
import sys
from datetime import datetime
from pathlib import Path
from urllib.error import HTTPError, URLError
from urllib.parse import urljoin
from urllib.request import urlopen, urlretrieve

from bs4 import BeautifulSoup
from docopt import docopt
from rich.console import Console

# Hide tracebacks so users only see the friendly error messages below.
sys.tracebacklimit = 0

console = Console()


class FandomWiki:
    """A fandom.com wiki to be mirrored locally via a BreezeWiki instance."""

    def __init__(self, name: str):
        self.name = name
        self.canonical_name = f"{name}.fandom.com"
        self.canonical_url = f"https://{self.canonical_name}"
        self.breezewiki_url = "https://breezewiki.hyperreal.coffee"
        self.archive_rootdir = Path.cwd()
        self.site_dir = self.archive_rootdir.joinpath(self.canonical_name)
        self.images_dir = self.site_dir.joinpath("images")

    def set_breezewiki_url(self, breezewiki_url: str):
        self.breezewiki_url = breezewiki_url

    def get_hop0_urls(self) -> list:
        """Follow the Local_Sitemap's "Next page" links and collect each
        sitemap page URL visited along the way."""
        starting_url = f"{self.canonical_url}/wiki/Local_Sitemap"
        hop0_urls = list()

        while True:
            with urlopen(starting_url) as response:
                response_body = response.read()
                decoded_body = response_body.decode("utf-8")
                soup = BeautifulSoup(decoded_body, "html.parser")
                mw_allpages_nav = soup.find_all(
                    "div", {"class": "mw-allpages-nav"}
                )[0]

                if (
                    len(mw_allpages_nav.find_all("a")) < 2
                    and "Next page"
                    not in mw_allpages_nav.find_all("a")[0].get_text()
                ):
                    break
                else:
                    if len(mw_allpages_nav.find_all("a")) < 2:
                        starting_url = (
                            f"{self.canonical_url}"
                            f"{mw_allpages_nav.find_all('a')[0].get('href')}"
                        )
                    else:
                        starting_url = (
                            f"{self.canonical_url}"
                            f"{mw_allpages_nav.find_all('a')[1].get('href')}"
                        )

                hop0_urls.append(starting_url)
                console.log(starting_url)

        return hop0_urls

    def get_hop1_urls(self, hop0_urls: list):
        """Extract the article links from each sitemap page and rewrite
        them to point at the BreezeWiki instance."""
        hop1_urls = [self.breezewiki_url]

        for url in hop0_urls:
            with urlopen(url) as response:
                response_body = response.read()
                decoded_body = response_body.decode("utf-8")
                soup = BeautifulSoup(decoded_body, "html.parser")

                for item in soup.find_all("a"):
                    if item.get("href") and item.get("href").startswith(
                        "/wiki"
                    ):
                        if "Local_Sitemap" not in item.get(
                            "href"
                        ) and "Special:" not in item.get("href"):
                            new_url = (
                                f"{self.breezewiki_url}/{self.name}"
                                f"{item.get('href')}"
                            )
                            hop1_urls.append(new_url)
                            console.log(new_url)

        return hop1_urls

    def save_css(self):
        """Download the BreezeWiki stylesheets into the site directory as
        proxy<N>.css files so archived pages can use them offline."""
        with urlopen(self.breezewiki_url) as response:
            response_body = response.read()
            decoded_body = response_body.decode("utf-8")
            soup = BeautifulSoup(decoded_body, "html.parser")
            css_pages = list()

            for css in soup.find_all("link"):
                if css.attrs.get("href") and ".css" in css.attrs.get("href"):
                    css_url = urljoin(
                        self.breezewiki_url, css.attrs.get("href")
                    )
                    css_pages.append(css_url)

            for index, page in enumerate(css_pages):
                with urlopen(page) as response:
                    response_body = response.read()
                    decoded_body = response_body.decode("utf-8")

                    # The ".css" suffix is required so the filename matches
                    # the proxy<N>.css hrefs written by save_page().
                    css_filename = self.site_dir.joinpath(
                        f"proxy{index}.css"
                    )
                    with open(css_filename, "w") as outfile:
                        outfile.write(decoded_body)
                    console.log(css_filename)

    def save_img(self, img_url: str):
        """Download a single image unless it is already on disk."""
        filename = self.images_dir.joinpath(
            Path(img_url.split("/revision")[0]).name
        )
        if not filename.exists():
            urlretrieve(img_url, filename)
            console.log(filename)

    def fetch_all_images(self, page_url: str):
        """Download every image referenced by a page, skipping BreezeWiki
        assets and wordmark images."""
        with urlopen(page_url) as response:
            response_body = response.read()
            decoded_body = response_body.decode("utf-8")
            soup = BeautifulSoup(decoded_body, "html.parser")
            img_tags = soup.find_all("img")
            img_urls = [img["src"] for img in img_tags if "src" in img.attrs]
            clean_img_urls = [
                x
                for x in img_urls
                if "breezewiki" not in x and "Wordmark" not in x
            ]

            for img_url in clean_img_urls:
                self.save_img(img_url)

    def save_page(self, url: str):
        """Save one article as HTML, pointing it at the local proxy<N>.css
        stylesheets and rewriting internal links to local files."""
        filename = self.site_dir.joinpath(f"{url.split('/')[-1]}.html")
        if not filename.exists():
            with urlopen(url) as response:
                response_body = response.read()
                decoded_body = response_body.decode("utf-8")
                soup = BeautifulSoup(decoded_body, "html.parser")

                # Replace the remote stylesheets with the locally saved
                # proxy<N>.css copies.
                stylesheet_count = 0
                for link in soup.find_all("link", {"rel": "stylesheet"}):
                    stylesheet_count += 1
                    link.decompose()

                for i in range(stylesheet_count):
                    if soup.head:
                        soup.head.append(
                            soup.new_tag(
                                "link",
                                rel="stylesheet",
                                type="text/css",
                                href=f"proxy{i}.css",
                            )
                        )

                self.fetch_all_images(url)

                # Remove the BreezeWiki top banner from the archived copy.
                soup.find("div", {"class": "bw-top-banner"}).extract()  # type: ignore

                # Point internal wiki links at the archived local pages.
                for link in soup.find_all("a"):
                    if link.get("href") and link.get("href").startswith(
                        f"/{self.name}/wiki"
                    ):
                        link_basename = link.get("href").partition(
                            "/wiki/"
                        )[2]
                        link["href"] = f"{self.site_dir}/{link_basename}.html"

                with open(filename, "w") as outfile:
                    outfile.write(soup.prettify())

                console.log(filename)

    def fetch_all_pages(self, hop1_urls: list):
        """Download all hop 1 pages concurrently."""
        # The worker count comes from the CLI's -w option when this module
        # is run as a script; fall back to 4 workers otherwise.
        try:
            max_workers = int(args["-w"]) if args["-w"] else 4
        except NameError:
            max_workers = 4

        with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers
        ) as executor:
            executor.map(self.save_page, hop1_urls)

    def archive_site(self):
        """Run the full pipeline: collect URLs, download CSS, pages, and
        images, then pack everything into .tar.gz and .zip archives."""
        try:
            with urlopen(self.canonical_url) as response:
                response.read()
        except HTTPError as http_err:
            console.print(
                "Oops. Something went wrong. Likely one of the following:\n"
            )
            console.print("(1) The wiki you requested does not exist.")
            console.print("(2) You mistyped the name of the wiki.")
            console.print(
                "(3) The server hosting that wiki is down for some reason.\n"
            )
            console.print(f"HTTP Error: {http_err}")
            # Nothing can be archived if the wiki itself is unreachable.
            sys.exit(1)
        except URLError as url_err:
            console.print(f"URL Error: {url_err}")
            sys.exit(1)

        with console.status("Fetching hop 0 URLs...", spinner="aesthetic"):
            hop0_urls = self.get_hop0_urls()

        with console.status("Fetching hop 1 URLs...", spinner="aesthetic"):
            hop1_urls = self.get_hop1_urls(hop0_urls)

        # Creates the parent dirs:
        # self.archive_rootdir > self.site_dir > self.images_dir
        self.images_dir.mkdir(parents=True)

        with console.status("Saving CSS files...", spinner="aesthetic"):
            self.save_css()

        with console.status(
            "Downloading web pages and/or images...", spinner="aesthetic"
        ):
            self.fetch_all_pages(hop1_urls)

        total_web_files = sum(
            1 for x in self.site_dir.iterdir() if x.is_file()
        )
        total_image_files = sum(
            1 for x in self.images_dir.iterdir() if x.is_file()
        )

        timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")

        with console.status(
            "tar.gzipping downloaded files...", spinner="aesthetic"
        ):
            shutil.make_archive(
                f"{self.name}-{timestamp}",
                "gztar",
                root_dir=self.archive_rootdir,
                base_dir=self.canonical_name,
            )

        with console.status(
            "zipping downloaded files...", spinner="aesthetic"
        ):
            shutil.make_archive(
                f"{self.name}-{timestamp}",
                "zip",
                root_dir=self.archive_rootdir,
                base_dir=self.canonical_name,
            )

        # The raw site directory is no longer needed once both archives
        # have been written.
        shutil.rmtree(self.site_dir)

        console.log(f"\nTotal web files archived: {total_web_files}")
        console.log(f"Total images archived: {total_image_files}")


if __name__ == "__main__":
    args = docopt(__doc__, options_first=True, help=True, version="1.0.1")  # type: ignore

    site = FandomWiki(args["-f"])

    if args["-b"]:
        site.set_breezewiki_url(args["-b"])

    site.archive_site()
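
# A minimal sketch (not part of the original CLI flow) of driving the
# FandomWiki class from another Python program. The module name
# "archive_fandom_wiki" is hypothetical: this file would need to be saved
# under an importable name for the import to resolve, and the BreezeWiki URL
# shown is simply the default instance used above.
#
#     from archive_fandom_wiki import FandomWiki
#
#     site = FandomWiki("dishonored")
#     site.set_breezewiki_url("https://breezewiki.hyperreal.coffee")
#     site.archive_site()
#
# archive_site() builds dishonored.fandom.com/ under the current working
# directory, then leaves only the dishonored-<timestamp>.tar.gz and .zip
# archives behind.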