archive-fandom-wiki/afw.py

#!/usr/bin/env python3

"""archive-fandom-wiki

Usage:
    afw -f <fandom> [-w <max_workers>] [-b <breezewiki_url>]
    afw -h

Options:
    -f <fandom>             The fandom to archive. (Required)
    -w <max_workers>        The maximum number of workers to use for concurrent threads. (Optional; Default is 4)
    -b <breezewiki_url>     The URL of the BreezeWiki instance to use. (Optional; Default is https://breezewiki.hyperreal.coffee)
    -h --help               Show this help message.

Examples:
    afw -f dishonored -w 16 -b https://breezewiki.hyperreal.coffee
    afw -f residentevil
"""

# This file is formatted with `black -l 79' to comply with PEP8 standards.
#
# urllib.request.urlopen is used instead of the requests library because I
# ran into URL quoting issues when using requests that are not a problem
# when using urlopen.

import concurrent.futures
import shutil
import sys

from docopt import docopt

# Suppress traceback frames so that an uncaught error prints only the
# exception type and message instead of a full stack trace.
sys.tracebacklimit = 0
from datetime import datetime
from pathlib import Path
from urllib.error import HTTPError, URLError
from urllib.parse import urljoin
from urllib.request import urlopen, urlretrieve

from bs4 import BeautifulSoup
from rich.console import Console

# Shared rich console used throughout this file for log lines, status
# spinners and user-facing messages.
console = Console()


class FandomWiki:
    """Archive one Fandom wiki as static HTML fetched through BreezeWiki.

    The page list is discovered on the canonical ``<name>.fandom.com`` site
    (via its ``Local_Sitemap`` pages); the page contents, CSS and images are
    then downloaded from a BreezeWiki instance and packed into ``.tar.gz``
    and ``.zip`` archives in the current working directory.

    Attributes:
        name: The wiki's short name, e.g. ``"dishonored"``.
        canonical_name: ``"<name>.fandom.com"``.
        canonical_url: ``"https://<name>.fandom.com"``.
        breezewiki_url: Base URL of the BreezeWiki instance to download from.
        archive_rootdir: Directory the archives are created in (the cwd at
            construction time).
        site_dir: Working directory for downloaded pages and CSS.
        images_dir: Working directory for downloaded images.
    """

    def __init__(self, name: str):
        """Derive all URLs and working paths from the wiki's short *name*."""
        self.name = name
        self.canonical_name = f"{name}.fandom.com"
        self.canonical_url = f"https://{self.canonical_name}"
        self.breezewiki_url = "https://breezewiki.hyperreal.coffee"
        self.archive_rootdir = Path.cwd()
        self.site_dir = self.archive_rootdir.joinpath(f"{self.canonical_name}")
        self.images_dir = self.site_dir.joinpath("images")

    def set_breezewiki_url(self, breezewiki_url: str):
        """Override the default BreezeWiki instance URL."""
        self.breezewiki_url = breezewiki_url

    @staticmethod
    def _fetch_soup(url: str):
        """Download *url*, decode it as UTF-8 and return the parsed HTML."""
        with urlopen(url) as response:
            response_body = response.read()
        return BeautifulSoup(response_body.decode("utf-8"), "html.parser")

    def get_hop0_urls(self) -> list:
        """Return the URLs of every ``Local_Sitemap`` page of the wiki.

        Starts at ``/wiki/Local_Sitemap`` and follows the "Next page" link
        in the ``mw-allpages-nav`` div until there is none left.

        Returns:
            The sitemap page URLs in crawl order, starting with the first
            sitemap page itself.
        """
        current_url = f"{self.canonical_url}/wiki/Local_Sitemap"
        # The first sitemap page lists articles too, so it has to be part
        # of the result; otherwise everything listed on it is never saved.
        hop0_urls = [current_url]
        console.log(current_url)

        while True:
            soup = self._fetch_soup(current_url)
            nav = soup.find("div", {"class": "mw-allpages-nav"})
            # A sitemap that fits on one page may have no nav div/links.
            nav_links = nav.find_all("a") if nav is not None else []

            if len(nav_links) >= 2:
                # Two links: the second one is taken as the next page.
                next_link = nav_links[1]
            elif nav_links and "Next page" in nav_links[0].get_text():
                # A single link that points forward.
                next_link = nav_links[0]
            else:
                # No forward link: this was the last page.
                break

            current_url = f"{self.canonical_url}{next_link.get('href')}"
            hop0_urls.append(current_url)
            console.log(current_url)

        return hop0_urls

    def get_hop1_urls(self, hop0_urls: list):
        """Collect the BreezeWiki URL of every article linked from sitemaps.

        Args:
            hop0_urls: Sitemap page URLs, as returned by `get_hop0_urls`.

        Returns:
            A list starting with the BreezeWiki base URL followed by one
            BreezeWiki URL per distinct ``/wiki...`` link found, in the
            order first seen. Links containing ``Local_Sitemap`` or
            ``Special:`` are ignored.
        """
        hop1_urls = [self.breezewiki_url]
        # The same link shows up on many sitemap pages; queueing it once
        # keeps worker threads from racing on the same output file.
        seen = set(hop1_urls)

        for url in hop0_urls:
            soup = self._fetch_soup(url)

            for anchor in soup.find_all("a"):
                href = anchor.get("href")
                if not href or not href.startswith("/wiki"):
                    continue
                if "Local_Sitemap" in href or "Special:" in href:
                    continue

                new_url = f"{self.breezewiki_url}/{self.name}{href}"
                if new_url in seen:
                    continue

                seen.add(new_url)
                hop1_urls.append(new_url)
                console.log(new_url)

        return hop1_urls

    def save_css(self):
        """Download the BreezeWiki stylesheets into `site_dir`.

        Every ``<link>`` on the BreezeWiki front page whose ``href``
        contains ``.css`` is saved as ``proxy<N>.css``, which is the name
        `save_page` writes into the archived pages.
        """
        soup = self._fetch_soup(self.breezewiki_url)
        css_hrefs = (link.attrs.get("href") for link in soup.find_all("link"))
        css_urls = [
            urljoin(self.breezewiki_url, href)
            for href in css_hrefs
            if href and ".css" in href
        ]

        for index, css_url in enumerate(css_urls):
            with urlopen(css_url) as response:
                css_text = response.read().decode("utf-8")

            # The ".css" suffix must match the href emitted by save_page.
            css_filename = self.site_dir.joinpath(f"proxy{index}.css")
            with open(css_filename, "w", encoding="utf-8") as outfile:
                outfile.write(css_text)

            console.log(css_filename)

    def save_img(self, img_url: str):
        """Download *img_url* into `images_dir` unless it already exists.

        The local file name is the last path component of the URL after
        cutting everything from ``/revision`` onwards.
        """
        filename = self.images_dir.joinpath(
            Path(img_url.split("/revision")[0]).name
        )
        if not filename.exists():
            urlretrieve(img_url, filename)
            console.log(filename)

    def _save_images(self, soup):
        """Save every ``<img src=...>`` of an already parsed page.

        URLs containing ``breezewiki`` or ``Wordmark`` are skipped. A
        failing download is logged and skipped so that one bad image does
        not prevent the page itself from being saved.
        """
        img_urls = [
            img["src"] for img in soup.find_all("img") if "src" in img.attrs
        ]

        for img_url in img_urls:
            if "breezewiki" in img_url or "Wordmark" in img_url:
                continue
            try:
                self.save_img(img_url)
            except (URLError, ValueError) as err:
                # URLError covers HTTP/network failures; ValueError is what
                # urllib raises for a URL without a usable scheme.
                console.log(f"Skipping image {img_url}: {err}")

    def fetch_all_images(self, page_url: str):
        """Download *page_url* and save all images it references."""
        self._save_images(self._fetch_soup(page_url))

    def save_page(self, url: str):
        """Download one BreezeWiki page and store it as static HTML.

        Does nothing if the target file already exists. Otherwise the page
        is fetched, its stylesheet links are replaced by links to the local
        ``proxy<N>.css`` files, its images are downloaded, the BreezeWiki
        top banner is removed, and links to other pages of the same wiki
        are rewritten to point at their ``.html`` files under `site_dir`.

        Args:
            url: BreezeWiki URL of the page; its last path component
                becomes the file name.
        """
        filename = self.site_dir.joinpath(f"{url.split('/')[-1]}.html")
        if filename.exists():
            return

        soup = self._fetch_soup(url)

        # Drop the remote stylesheets first, then add exactly one local
        # replacement per removed link.
        stylesheet_links = soup.find_all("link", {"rel": "stylesheet"})
        for link in stylesheet_links:
            link.decompose()

        if soup.head:
            for i in range(len(stylesheet_links)):
                soup.head.append(
                    soup.new_tag(
                        "link",
                        rel="stylesheet",
                        type="text/css",
                        href=f"proxy{i}.css",
                    )
                )

        # Reuse the tree we already have instead of downloading the page a
        # second time just to find its images.
        self._save_images(soup)

        # Not every page carries the banner (e.g. the instance front page).
        banner = soup.find("div", {"class": "bw-top-banner"})
        if banner is not None:
            banner.extract()

        wiki_prefix = f"/{self.name}/wiki"
        for link in soup.find_all("a"):
            href = link.get("href")
            if href and href.startswith(wiki_prefix):
                link_basename = href.partition("/wiki/")[2]
                link["href"] = f"{self.site_dir}/{link_basename}.html"

        with open(filename, "w", encoding="utf-8") as outfile:
            outfile.write(soup.prettify())

        console.log(filename)

    def fetch_all_pages(self, hop1_urls: list, max_workers=None):
        """Save all pages concurrently with a thread pool.

        Args:
            hop1_urls: Page URLs, as returned by `get_hop1_urls`.
            max_workers: Number of worker threads. When omitted, the
                ``-w`` command line option is used if the script's parsed
                ``args`` are available, otherwise 4.

        A page that fails to download is logged and skipped; it does not
        stop the remaining downloads.
        """
        if max_workers is None:
            # `args` only exists when the file runs as a script; fall back
            # to the default instead of raising NameError when imported.
            cli_args = globals().get("args") or {}
            max_workers = int(cli_args.get("-w") or 4)

        with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers
        ) as executor:
            futures = {
                executor.submit(self.save_page, url): url for url in hop1_urls
            }
            # Inspect every future so worker errors are reported rather
            # than silently discarded.
            for future in concurrent.futures.as_completed(futures):
                err = future.exception()
                if err is not None:
                    console.log(f"Failed to save {futures[future]}: {err}")

    def archive_site(self):
        """Run the whole archiving pipeline for this wiki.

        Checks that the wiki is reachable, discovers all page URLs,
        downloads CSS, pages and images into `site_dir`, packs that
        directory into ``<name>-<timestamp>.tar.gz`` and ``.zip`` in
        `archive_rootdir`, removes `site_dir` and logs the file totals.

        Raises:
            SystemExit: With status 1 when the wiki cannot be reached.
        """
        try:
            with urlopen(self.canonical_url) as response:
                response.read()
        except HTTPError as http_err:
            console.print("Oops. Something went wrong. Likely one of the following:\n")
            console.print("(1) The wiki you requested does not exist.")
            console.print("(2) You mistyped the name of the wiki.")
            console.print("(3) The server hosting that wiki is down for some reason.\n")
            console.print(f"HTTP Error: {http_err}")
            # Nothing can be archived; stop instead of crawling anyway.
            raise SystemExit(1)
        except URLError as url_err:
            console.print(f"URL Error: {url_err}")
            raise SystemExit(1)

        with console.status("Fetching hop 0 URLs...", spinner="aesthetic"):
            hop0_urls = self.get_hop0_urls()

        with console.status("Fetching hop 1 URLs...", spinner="aesthetic"):
            hop1_urls = self.get_hop1_urls(hop0_urls)

        # Creates the parent dirs: self.archive_rootdir > self.site_dir >
        # self.images_dir. exist_ok lets an interrupted run be resumed;
        # save_page/save_img skip files that are already present.
        self.images_dir.mkdir(parents=True, exist_ok=True)

        with console.status("Saving CSS files...", spinner="aesthetic"):
            self.save_css()

        with console.status(
            "Downloading web pages and/or images...", spinner="aesthetic"
        ):
            self.fetch_all_pages(hop1_urls)

        # Count before the working directory is removed below.
        total_web_files = sum(1 for x in self.site_dir.iterdir() if x.is_file())
        total_image_files = sum(
            1 for x in self.images_dir.iterdir() if x.is_file()
        )

        timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")

        with console.status("tar.gzipping downloaded files...", spinner="aesthetic"):
            shutil.make_archive(
                f"{self.name}-{timestamp}",
                "gztar",
                root_dir=self.archive_rootdir,
                base_dir=self.canonical_name,
            )

        with console.status("zipping downloaded files...", spinner="aesthetic"):
            shutil.make_archive(
                f"{self.name}-{timestamp}",
                "zip",
                root_dir=self.archive_rootdir,
                base_dir=self.canonical_name,
            )

        shutil.rmtree(self.site_dir)

        console.log(f"\nTotal web files archived: {total_web_files}")
        console.log(f"Total images archived: {total_image_files}")


if __name__ == "__main__":
    # Parse the command line against the usage text in the module docstring.
    # `args` has to stay a module-level name: FandomWiki.fetch_all_pages
    # reads the "-w" option from it.
    args = docopt(  # type: ignore
        __doc__,
        options_first=True,
        help=True,
        version="1.0.1",
    )

    wiki = FandomWiki(args["-f"])

    # Only replace the default BreezeWiki instance when -b was given.
    breezewiki_override = args["-b"]
    if breezewiki_override:
        wiki.set_breezewiki_url(breezewiki_override)

    wiki.archive_site()