#!/usr/bin/env python
"""archive-fandom-wiki

Usage:
    afw <fandom_wiki> <breezewiki_url>
    afw <fandom_wiki>

Options:
    -h --help     Show this help message.
    -v --version  Show version.

Examples:
    afw dishonored https://breezewiki.nirn.quest
    afw residentevil
"""

# This file is formatted with `black -l 79' to comply with PEP8 standards.

import concurrent.futures
import shutil
import sys
import tarfile
from datetime import datetime
from pathlib import Path
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from docopt import docopt
from rich.console import Console
from rich.progress import Progress

sys.tracebacklimit = 0

console = Console()
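
# Overview of the scrape, for orientation:
#
#   hop 0 - page through the wiki's Local_Sitemap on fandom.com and collect
#           the URL of every sitemap page.
#   hop 1 - collect every /wiki/... article link from those sitemap pages
#           and point it at the chosen BreezeWiki instance.
#
# Each hop-1 page is then downloaded through BreezeWiki along with its
# images and the instance's CSS, and the results are packed into
# timestamped tarballs.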


class FandomWiki:
    def __init__(self, name: str, breezewiki_url: str):
        self.name = name
        self.canonical_url = f"https://{name}.fandom.com"
        self.breezewiki_url = breezewiki_url
        self.site_dir = Path.cwd().joinpath(f"{name}.fandom.com")
        self.images_dir = self.site_dir.joinpath("images")

        try:
            response = requests.get(self.canonical_url)
            response.raise_for_status()
        except requests.HTTPError as http_err:
            console.print(
                "Oops. Something went wrong. Likely one of the following:\n"
            )
            console.print("(1) The wiki you requested does not exist.")
            console.print("(2) You typed the name of the wiki incorrectly.")
            console.print(
                "(3) The server hosting that wiki is down for some reason.\n"
            )
            console.print(f"HTTP error: {http_err}")
        else:
            if not self.site_dir.exists():
                self.site_dir.mkdir()

            if not self.images_dir.exists():
                self.images_dir.mkdir()

    def get_hop0_urls(self) -> list:
        # Walk the paginated Local_Sitemap, collecting the URL of each
        # sitemap page until there is no "Next page" link left.
        starting_url = f"{self.canonical_url}/wiki/Local_Sitemap"
        hop0_urls = list()

        while True:
            response = requests.get(starting_url)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            mw_allpages_nav = soup.find_all(
                "div", {"class": "mw-allpages-nav"}
            )[0]
            nav_links = mw_allpages_nav.find_all("a")

            if (
                len(nav_links) < 2
                and "Next page" not in nav_links[0].get_text()
            ):
                break
            elif len(nav_links) < 2:
                starting_url = (
                    f"{self.canonical_url}{nav_links[0].get('href')}"
                )
            else:
                starting_url = (
                    f"{self.canonical_url}{nav_links[1].get('href')}"
                )

            hop0_urls.append(starting_url)
            console.log(starting_url)

        return hop0_urls

    def get_hop1_urls(self, hop0_urls: list) -> list:
        # Collect every article link from the sitemap pages and point it at
        # the BreezeWiki instance.
        hop1_urls = [self.breezewiki_url]

        for url in hop0_urls:
            response = requests.get(url)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")

            for item in soup.find_all("a"):
                href = item.get("href")
                if (
                    href
                    and href.startswith("/wiki")
                    and "Local_Sitemap" not in href
                    and "Special:" not in href
                ):
                    new_url = f"{self.breezewiki_url}{href}"
                    hop1_urls.append(new_url)
                    console.log(new_url)

        return hop1_urls

    def save_css(self):
        response = requests.get(self.breezewiki_url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        css_pages = list()
        for css in soup.find_all("link"):
            href = css.attrs.get("href")
            if href and ".css" in href:
                css_pages.append(urljoin(self.breezewiki_url, href))

        for i, page in enumerate(css_pages):
            response = requests.get(page)
            response.raise_for_status()

            # Save each stylesheet under the name the rewritten pages link
            # to (proxy0.css, proxy1.css, ...); see save_page().
            css_filename = self.site_dir.joinpath(f"proxy{i}.css")
            with open(css_filename, "wb") as outfile:
                outfile.write(response.content)

            console.log(css_filename)

    def save_img(self, img_url: str):
        # Strip Fandom's "/revision/..." suffix to recover the original
        # image file name.
        filename = self.images_dir.joinpath(
            Path(img_url.split("/revision")[0]).name
        )
        if not filename.exists():
            response = requests.get(img_url, stream=True)
            response.raise_for_status()

            with open(filename, "wb") as outfile:
                for chunk in response.iter_content(chunk_size=8192):
                    outfile.write(chunk)

            console.log(filename)

    def fetch_all_images(self, page_url: str):
        response = requests.get(page_url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        img_tags = soup.find_all("img")
        img_urls = [img["src"] for img in img_tags if "src" in img.attrs]
        clean_img_urls = [
            x
            for x in img_urls
            if "breezewiki" not in x and "Wordmark" not in x
        ]

        for img_url in clean_img_urls:
            self.save_img(img_url)

    def save_page(self, url: str):
        filename = self.site_dir.joinpath(f"{url.split('/')[-1]}.html")
        if not filename.exists():
            response = requests.get(url)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")

            # Swap the instance's stylesheets for the locally saved
            # proxy*.css copies.
            stylesheet_count = 0
            for link in soup.find_all("link", {"rel": "stylesheet"}):
                stylesheet_count += 1
                link.decompose()

            for i in range(stylesheet_count):
                if soup.head:
                    soup.head.append(
                        soup.new_tag(
                            "link",
                            rel="stylesheet",
                            type="text/css",
                            href=f"proxy{i}.css",
                        )
                    )

            self.fetch_all_images(url)

            top_banner = soup.find("div", {"class": "bw-top-banner"})
            if top_banner:
                top_banner.extract()

            # Rewrite internal wiki links to point at the locally saved
            # HTML files.
            for link in soup.find_all("a"):
                if link.get("href") and link.get("href").startswith(
                    f"/{self.name}/wiki"
                ):
                    link_basename = link.get("href").partition("/wiki/")[2]
                    link["href"] = f"{self.site_dir}/{link_basename}.html"

            with open(filename, "w") as outfile:
                outfile.write(soup.prettify())

            console.log(filename)

    def fetch_all_pages(self, hop1_urls: list):
        with concurrent.futures.ThreadPoolExecutor() as executor:
            executor.map(self.save_page, hop1_urls)

    def archive(self):
        timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")

        img_files = [f for f in self.images_dir.iterdir() if f.is_file()]
        img_archive_filename = f"{self.images_dir}-{timestamp}.tar.xz"

        with Progress() as progress:
            task = progress.add_task(
                "[cyan]Archiving images...", total=len(img_files)
            )

            with tarfile.open(img_archive_filename, "w:xz") as tar:
                for img_file in img_files:
                    if progress.finished:
                        break
                    tar.add(img_file, arcname=img_file.name)
                    progress.update(task, advance=1)

            progress.stop()

        shutil.rmtree(self.images_dir, ignore_errors=True)

        web_files = [
            f for f in self.site_dir.iterdir() if f.is_file() or f.is_dir()
        ]
        web_archive_filename = f"{self.site_dir}-{timestamp}.tar.gz"

        with Progress() as progress:
            task = progress.add_task(
                "[cyan]Archiving web files...", total=len(web_files)
            )

            with tarfile.open(web_archive_filename, "w:gz") as tar:
                for web_file in web_files:
                    if progress.finished:
                        break
                    tar.add(web_file, arcname=web_file.name)
                    progress.update(task, advance=1)

            progress.stop()

        shutil.rmtree(self.site_dir, ignore_errors=True)

        console.log(f"\nTotal web files scraped: {len(web_files)}")
        console.log(f"Total images scraped: {len(img_files)}")


def archive_site(
    name: str, breezewiki_url: str = "https://breezewiki.nirn.quest"
):
    site = FandomWiki(name, breezewiki_url)

    with console.status("Fetching hop 0 URLs...", spinner="aesthetic"):
        hop0_urls = site.get_hop0_urls()

    with console.status("Fetching hop 1 URLs...", spinner="aesthetic"):
        hop1_urls = site.get_hop1_urls(hop0_urls)

    with console.status("Saving CSS files...", spinner="aesthetic"):
        site.save_css()

    with console.status(
        "Downloading images and web pages...", spinner="aesthetic"
    ):
        site.fetch_all_pages(hop1_urls)

    site.archive()


if __name__ == "__main__":
    args = docopt(__doc__, options_first=True, help=True, version="1.0.1")

    if args["<breezewiki_url>"]:
        archive_site(args["<fandom_wiki>"], args["<breezewiki_url>"])
    else:
        archive_site(args["<fandom_wiki>"])
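

# Programmatic use (a minimal sketch): archive_site() can also be called
# directly from another script instead of going through the docopt CLI
# above. The module name "archive_fandom_wiki" is an assumption based on
# this file's name; adjust the import to match where the file actually
# lives.
#
#     from archive_fandom_wiki import archive_site
#
#     # Use the default BreezeWiki instance...
#     archive_site("dishonored")
#
#     # ...or pass an explicitly chosen one.
#     archive_site("residentevil", "https://breezewiki.nirn.quest")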