#!/usr/bin/env python
# This program archives the content of fandom wikis. It is pretty much
# feature-complete.
#
# This program doesn't scrape the fandom.com wiki sites directly; rather, it
# fetches pages through my BreezeWiki instance to avoid downloading
# unnecessary ads, images, and other junk.
#
# Each resulting archive is self-contained, meaning one can extract the
# contents and browse the wiki snapshot locally (offline). The URLs for CSS,
# images, and links in each page are replaced by file:/// URLs pointing to
# their corresponding copies on the local filesystem.
#
# An example invocation is shown in the comment at the bottom of this file.
#
# This file is formatted with `black -l 79` to comply with PEP 8.

import concurrent.futures
import shutil
import sys
import tarfile
from datetime import datetime
from pathlib import Path
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from rich.console import Console
from rich.progress import Progress
from rich.tree import Tree

console = Console()


class FandomWiki:
    def __init__(self, name: str):
        self.name = name
        self.canonical_url = "https://{}.fandom.com".format(name)
        self.breezewiki_url = "https://wiki.hyperreal.coffee/{}".format(name)
        self.site_dir = Path.cwd().joinpath("{}.fandom.com".format(name))
        self.images_dir = self.site_dir.joinpath("images")

        if not self.site_dir.exists():
            self.site_dir.mkdir()

        if not self.images_dir.exists():
            self.images_dir.mkdir()

    def get_hop0_urls(self) -> list:
        # Hop 0 is the set of Local_Sitemap pages on the canonical wiki.
        # Start at the first sitemap page and follow the "Next page" link
        # until the last sitemap page is reached.
        starting_url = "{}/wiki/Local_Sitemap".format(self.canonical_url)
        hop0_urls = [starting_url]
        console.print("[[bold]HOP 0[/bold]] {}".format(starting_url))

        while True:
            response = requests.get(starting_url)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            mw_allpages_nav = soup.find_all(
                "div", {"class": "mw-allpages-nav"}
            )[0]
            nav_links = mw_allpages_nav.find_all("a")

            if (
                len(nav_links) < 2
                and "Next page" not in nav_links[0].get_text()
            ):
                # Only a "Previous page" link is left, so this is the last
                # sitemap page.
                break

            # The first sitemap page has a single "Next page" link; later
            # pages list "Previous page" first and "Next page" second.
            if len(nav_links) < 2:
                starting_url = "{}{}".format(
                    self.canonical_url, nav_links[0].get("href")
                )
            else:
                starting_url = "{}{}".format(
                    self.canonical_url, nav_links[1].get("href")
                )

            hop0_urls.append(starting_url)
            console.print("[[bold]HOP 0[/bold]] {}".format(starting_url))

        return hop0_urls

    def get_hop1_urls(self, hop0_urls: list) -> list:
        # Hop 1 is the set of article pages linked from each sitemap page,
        # rewritten to go through the BreezeWiki instance.
        hop1_urls = [self.breezewiki_url]

        for url in hop0_urls:
            response = requests.get(url)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")

            for item in soup.find_all("a"):
                href = item.get("href")
                if (
                    href
                    and href.startswith("/wiki")
                    and "Local_Sitemap" not in href
                    and "Special:" not in href
                ):
                    new_url = "{}{}".format(self.breezewiki_url, href)
                    hop1_urls.append(new_url)
                    console.print("[[bold]HOP 1[/bold]] {}".format(new_url))

        return hop1_urls

    def save_css(self):
        # Download every stylesheet linked from the BreezeWiki front page
        # and store it as proxy<N>.css in the site directory.
        response = requests.get(self.breezewiki_url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        css_pages = list()
        for css in soup.find_all("link"):
            if css.attrs.get("href"):
                css_url = urljoin(self.breezewiki_url, css.attrs.get("href"))
                css_pages.append(css_url)

        for index, page in enumerate(css_pages):
            response = requests.get(page)
            response.raise_for_status()

            css_filename = self.site_dir.joinpath("proxy{}.css".format(index))
            with open(css_filename, "wb") as outfile:
                outfile.write(response.content)

            console.print(
                "[[bold green]CSS[/bold green]] {}".format(css_filename)
            )

    def save_img(self, img_url: str):
        # Fandom image URLs carry a "/revision/..." suffix; strip it so the
        # local filename is just the original image name.
        filename = self.images_dir.joinpath(
            Path(img_url.split("/revision")[0]).name
        )
        if not filename.exists():
            response = requests.get(img_url, stream=True)
            response.raise_for_status()

            with open(filename, "wb") as outfile:
                for chunk in response.iter_content(chunk_size=8192):
                    outfile.write(chunk)

            console.print("[[bold green]IMG[/bold green]] {}".format(filename))
        else:
            console.print(
                "[[bold yellow]IMG (EXISTS)[/bold yellow]] {}".format(filename)
            )

    def fetch_all_images(self, page_url: str):
        # Collect the image URLs on a page, skipping BreezeWiki's own assets
        # and the wiki wordmark, then download each one.
        response = requests.get(page_url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        img_tags = soup.find_all("img")
        img_urls = [img["src"] for img in img_tags if "src" in img.attrs]
        clean_img_urls = [
            x
            for x in img_urls
            if "breezewiki" not in x and "Wordmark" not in x
        ]

        for img_url in clean_img_urls:
            self.save_img(img_url)

    def save_page(self, url: str):
        filename = self.site_dir.joinpath("{}.html".format(url.split("/")[-1]))
        if not filename.exists():
            response = requests.get(url)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")

            # Swap the remote stylesheets for the locally saved
            # proxy<N>.css files.
            stylesheet_count = 0
            for link in soup.find_all("link", {"rel": "stylesheet"}):
                stylesheet_count += 1
                link.decompose()

            for i in range(stylesheet_count):
                if soup.head:
                    soup.head.append(
                        soup.new_tag(
                            "link",
                            rel="stylesheet",
                            type="text/css",
                            href="proxy{}.css".format(i),
                        )
                    )

            self.fetch_all_images(url)

            # Remove the BreezeWiki top banner, if present.
            top_banner = soup.find("div", {"class": "bw-top-banner"})
            if top_banner:
                top_banner.extract()

            # Point internal wiki links at the local .html copies.
            for link in soup.find_all("a"):
                if link.get("href") and link.get("href").startswith(
                    "/{}/wiki".format(self.name)
                ):
                    link_basename = link.get("href").partition("/wiki/")[2]
                    link["href"] = "{}/{}.html".format(
                        self.site_dir, link_basename
                    )

            with open(filename, "w") as outfile:
                outfile.write(soup.prettify())

            console.print(
                "[[bold green]HTML[/bold green]] {}".format(filename)
            )
        else:
            console.print(
                "[[bold yellow]HTML (EXISTS)[/bold yellow]] {}".format(
                    filename
                )
            )

    def fetch_all_pages(self, hop1_urls: list):
        self.save_css()

        # Download pages concurrently; save_page skips files that already
        # exist, so an interrupted run can be resumed.
        with concurrent.futures.ThreadPoolExecutor() as executor:
            executor.map(self.save_page, hop1_urls)

    def archive(self):
        timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")

        # Pack the downloaded images into a .tar.xz inside the site
        # directory, then remove the images directory.
        img_files = [f for f in self.images_dir.iterdir() if f.is_file()]
        img_archive_filename = "{}-{}.tar.xz".format(
            self.images_dir, timestamp
        )

        with Progress() as progress:
            task = progress.add_task(
                "[cyan]Archiving images...", total=len(img_files)
            )

            with tarfile.open(img_archive_filename, "w:xz") as tar:
                for img_file in img_files:
                    tar.add(img_file, arcname=img_file.name)
                    progress.update(task, advance=1)

        shutil.rmtree(self.images_dir, ignore_errors=True)

        # Pack the HTML pages, stylesheets, and the image archive into the
        # final .tar.gz, then remove the site directory.
        web_files = [
            f for f in self.site_dir.iterdir() if f.is_file() or f.is_dir()
        ]
        web_archive_filename = "{}-{}.tar.gz".format(self.site_dir, timestamp)

        with Progress() as progress:
            task = progress.add_task(
                "[cyan]Archiving web files...", total=len(web_files)
            )

            with tarfile.open(web_archive_filename, "w:gz") as tar:
                for web_file in web_files:
                    tar.add(web_file, arcname=web_file.name)
                    progress.update(task, advance=1)

        shutil.rmtree(self.site_dir, ignore_errors=True)

        console.print("\nTotal web files scraped: {}".format(len(web_files)))
        console.print("Total images scraped: {}".format(len(img_files)))


def archive_site(name: str):
    site = FandomWiki(name)
    site.fetch_all_pages(site.get_hop1_urls(site.get_hop0_urls()))
    site.archive()


def usage_message():
    supported_wikis = [
        "cyberpunk",
        "dishonored",
        "dragonage",
        "forgottenrealms",
        "masseffect",
        "residentevil",
    ]
    wiki_tree = Tree("[green]Fandom Wikis")
    for wiki in supported_wikis:
        wiki_tree.add(wiki)

    console.print("Usage:\n\tarchive-fandom-wiki [[italic]name[/italic]]\n")
    console.print("Example:\n\tarchive-fandom-wiki dishonored\n")
    console.print(wiki_tree)


if __name__ == "__main__":
    if len(sys.argv) > 1:
        match sys.argv[1]:
            case "cyberpunk":
                archive_site("cyberpunk")
            case "dishonored":
                archive_site("dishonored")
            case "dragonage":
                archive_site("dragonage")
            case "forgottenrealms":
                archive_site("forgottenrealms")
            case "masseffect":
                archive_site("masseffect")
            case "residentevil":
                archive_site("residentevil")
            case _:
                usage_message()
    else:
        usage_message()
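
# Example invocation (a sketch; it assumes this script is installed on $PATH
# as `archive-fandom-wiki` and marked executable):
#
#   $ archive-fandom-wiki dishonored
#
# This walks https://dishonored.fandom.com/wiki/Local_Sitemap, fetches every
# listed page through the BreezeWiki instance, and saves the HTML, CSS, and
# images under ./dishonored.fandom.com/. When the crawl finishes, the images
# are packed into images-<timestamp>.tar.xz, and the whole snapshot is left
# as dishonored.fandom.com-<timestamp>.tar.gz in the current directory.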