archive-fandom-wiki/afw.py

#!/usr/bin/env python3

"""archive-fandom-wiki

Usage:
    afw <fandom> [<breezewiki_instance>]
    afw -h

Options:
    -h --help       Show this help message.

Examples:
    afw dishonored https://breezewiki.hyperreal.coffee
    afw residentevil
"""

# This file is formatted with `black -l 79' to comply with PEP8 standards.
#
# urllib.urlopen is used instead of the requests library because I ran
# into URL quoting issues when using requests that are not a problem when
# using urllib.urlopen.

import concurrent.futures
import shutil
import sys

from docopt import docopt

sys.tracebacklimit = 0
from datetime import datetime
from pathlib import Path
from urllib.error import HTTPError, URLError
from urllib.parse import urljoin
from urllib.request import urlopen, urlretrieve

from bs4 import BeautifulSoup
from rich.console import Console

console = Console()


class FandomWiki:
    def __init__(self, name: str):
        self.name = name
        self.canonical_url = f"https://{name}.fandom.com"
        self.breezewiki_url = "https://breezewiki.hyperreal.coffee"
        self.site_dir = Path(f"{name}.fandom.com")
        self.images_dir = self.site_dir.joinpath("images")

    def set_breezewiki_url(self, breezewiki_url: str):
        self.breezewiki_url = breezewiki_url

    def get_hop0_urls(self) -> list:
        starting_url = f"{self.canonical_url}/wiki/Local_Sitemap"
        hop0_urls = list()

        while True:
            with urlopen(starting_url) as response:
                response_body = response.read()
            decoded_body = response_body.decode("utf-8")
            soup = BeautifulSoup(decoded_body, "html.parser")
            mw_allpages_nav = soup.find_all("div", {"class": "mw-allpages-nav"})[0]

            if (
                len(mw_allpages_nav.find_all("a")) < 2
                and "Next page" not in mw_allpages_nav.find_all("a")[0].get_text()
            ):
                break
            else:
                if len(mw_allpages_nav.find_all("a")) < 2:
                    starting_url = f"{self.canonical_url}{mw_allpages_nav.find_all('a')[0].get('href')}"
                else:
                    starting_url = f"{self.canonical_url}{mw_allpages_nav.find_all('a')[1].get('href')}"

                hop0_urls.append(starting_url)
                console.log(starting_url)

        return hop0_urls

    def get_hop1_urls(self, hop0_urls: list):
        hop1_urls = [self.breezewiki_url]

        for url in hop0_urls:
            with urlopen(url) as response:
                response_body = response.read()
            decoded_body = response_body.decode("utf-8")
            soup = BeautifulSoup(decoded_body, "html.parser")

            for item in soup.find_all("a"):
                if item.get("href") and item.get("href").startswith("/wiki"):
                    if "Local_Sitemap" not in item.get(
                        "href"
                    ) and "Special:" not in item.get("href"):
                        new_url = f"{self.breezewiki_url}/{self.name}{item.get('href')}"
                        hop1_urls.append(new_url)
                        console.log(new_url)

        return hop1_urls

    def save_css(self):
        with urlopen(self.breezewiki_url) as response:
            response_body = response.read()
        decoded_body = response_body.decode("utf-8")
        soup = BeautifulSoup(response_body, "html.parser")
        css_pages = list()

        for css in soup.find_all("link"):
            if css.attrs.get("href") and ".css" in css.attrs.get("href"):
                css_url = urljoin(self.breezewiki_url, css.attrs.get("href"))
                css_pages.append(css_url)

        for page in css_pages:
            with urlopen(page) as response:
                response_body = response.read()
            decoded_body = response_body.decode("utf-8")

            css_filename = self.site_dir.joinpath(f"proxy{css_pages.index(page)}")
            with open(css_filename, "w") as outfile:
                outfile.write(decoded_body)

            console.log(css_filename)

    def save_img(self, img_url: str):
        filename = self.images_dir.joinpath(Path(img_url.split("/revision")[0]).name)
        if not filename.exists():
            urlretrieve(img_url, filename)
            console.log(filename)

    def fetch_all_images(self, page_url: str):
        with urlopen(page_url) as response:
            response_body = response.read()
        decoded_body = response_body.decode("utf-8")
        soup = BeautifulSoup(decoded_body, "html.parser")

        img_tags = soup.find_all("img")
        img_urls = [img["src"] for img in img_tags if "src" in img.attrs]
        clean_img_urls = [
            x for x in img_urls if "breezewiki" not in x and "Wordmark" not in x
        ]

        for img_url in clean_img_urls:
            self.save_img(img_url)

    def save_page(self, url: str):
        filename = self.site_dir.joinpath(f"{url.split('/')[-1]}.html")
        if not filename.exists():
            with urlopen(url) as response:
                response_body = response.read()
            decoded_body = response_body.decode("utf-8")
            soup = BeautifulSoup(decoded_body, "html.parser")

            stylesheet_count = 0
            for link in soup.find_all("link", {"rel": "stylesheet"}):
                stylesheet_count += 1
                link.decompose()

                for i in range(stylesheet_count):
                    if soup.head:
                        soup.head.append(
                            soup.new_tag(
                                "link",
                                rel="stylesheet",
                                type="text/css",
                                href=f"proxy{i}.css",
                            )
                        )

            self.fetch_all_images(url)

            soup.find("div", {"class": "bw-top-banner"}).extract()  # type: ignore

            for link in soup.find_all("a"):
                if link.get("href") and link.get("href").startswith(
                    f"/{self.name}/wiki"
                ):
                    link_basename = link.get("href").partition("/wiki/")[2]
                    link["href"] = f"{self.site_dir}/{link_basename}.html"

            with open(filename, "w") as outfile:
                outfile.write(soup.prettify())

            console.log(filename)

    def fetch_all_pages(self, hop1_urls: list):
        with concurrent.futures.ThreadPoolExecutor() as executor:
            executor.map(self.save_page, hop1_urls)

    def archive_site(self):
        try:
            with urlopen(self.canonical_url) as response:
                response.read()
        except HTTPError as http_err:
            console.print("Oops. Something went wrong. Likely one of the following:\n")
            console.print("(1) The wiki you requested does not exist.")
            console.print("(2) You mistyped the name of the wiki.")
            console.print("(3) The server hosting that wiki is down for some reason.\n")
            console.print(f"HTTP Error: {http_err}")
        except URLError as url_err:
            console.print(f"URL Error: {url_err}")

        with console.status("Fetching hop 0 URLs...", spinner="aesthetic"):
            hop0_urls = self.get_hop0_urls()

        with console.status("Fetching hop 1 URLs...", spinner="aesthetic"):
            hop1_urls = self.get_hop1_urls(hop0_urls)

        self.site_dir.mkdir()
        self.images_dir.mkdir()

        with console.status("Saving CSS files...", spinner="aesthetic"):
            self.save_css()

        with console.status(
            "Downloading web pages and/or images...", spinner="aesthetic"
        ):
            self.fetch_all_pages(hop1_urls)

        total_web_files = sum(1 for x in self.site_dir.iterdir() if x.is_file())
        total_image_files = sum(1 for x in self.images_dir.iterdir() if x.is_file())

        timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")

        with console.status("Archiving images...", spinner="aesthetic"):
            shutil.make_archive(f"images-{timestamp}", "xztar", self.images_dir)
            shutil.rmtree(self.images_dir)
            shutil.move(f"images-{timestamp}.tar.xz", self.site_dir)

        with console.status("Archiving web files...", spinner="aesthetic"):
            shutil.make_archive(f"{self.name}-{timestamp}", "gztar", self.site_dir)

        shutil.rmtree(self.site_dir)

        console.log(f"\nTotal web files archived: {total_web_files}")
        console.log(f"Total images archived: {total_image_files}")


if __name__ == "__main__":
    args = docopt(__doc__, options_first=True, help=True, version="1.0.1")

    site = FandomWiki(args["<fandom>"])
    if args["<breezewiki_instance>"]:
        site.set_breezewiki_url(args["<breezewiki_instance>"])

    site.archive_site()
Refactor 2024-07-27 18:39:12 +02:00			`#!/usr/bin/env python3`
Initial commit or whatever 2023-09-15 03:36:56 +02:00
Fix: allow to specify BreezeWiki instance URL 2024-04-25 23:37:18 +02:00			`"""archive-fandom-wiki`

			`Usage:`
Refactor 2024-07-18 09:40:18 +02:00			`afw <fandom> [<breezewiki_instance>]`
			`afw -h`
Fix: allow to specify BreezeWiki instance URL 2024-04-25 23:37:18 +02:00
			`Options:`
			`-h --help Show this help message.`

			`Examples:`
Refactor 2024-07-27 18:39:12 +02:00			`afw dishonored https://breezewiki.hyperreal.coffee`
Fix: allow to specify BreezeWiki instance URL 2024-04-25 23:37:18 +02:00			`afw residentevil`
			`"""`

Initial commit or whatever 2023-09-15 03:36:56 +02:00			# This file is formatted with `black -l 79' to comply with PEP8 standards.
Refactor 2024-07-18 09:40:18 +02:00			`#`
			`# urllib.urlopen is used instead of the requests library because I ran`
			`# into URL quoting issues when using requests that are not a problem when`
			`# using urllib.urlopen.`
Initial commit or whatever 2023-09-15 03:36:56 +02:00
			`import concurrent.futures`
			`import shutil`
			`import sys`
feat: support all wikis on fandom.com 2023-09-16 00:48:20 +02:00
Fix: allow to specify BreezeWiki instance URL 2024-04-25 23:37:18 +02:00			`from docopt import docopt`

feat: support all wikis on fandom.com 2023-09-16 00:48:20 +02:00			`sys.tracebacklimit = 0`
Initial commit or whatever 2023-09-15 03:36:56 +02:00			`from datetime import datetime`
			`from pathlib import Path`
Refactor 2024-07-18 09:40:18 +02:00			`from urllib.error import HTTPError, URLError`
Initial commit or whatever 2023-09-15 03:36:56 +02:00			`from urllib.parse import urljoin`
Refactor 2024-07-18 09:40:18 +02:00			`from urllib.request import urlopen, urlretrieve`
Initial commit or whatever 2023-09-15 03:36:56 +02:00
			`from bs4 import BeautifulSoup`
			`from rich.console import Console`

			`console = Console()`


			`class FandomWiki:`
Refactor 2024-07-18 09:40:18 +02:00			`def __init__(self, name: str):`
Initial commit or whatever 2023-09-15 03:36:56 +02:00			`self.name = name`
Use f-strings instead of .format() 2023-09-16 18:48:54 +02:00			`self.canonical_url = f"https://{name}.fandom.com"`
Refactor 2024-07-27 18:39:12 +02:00			`self.breezewiki_url = "https://breezewiki.hyperreal.coffee"`
Fix various bugs 2024-07-16 08:59:21 +02:00			`self.site_dir = Path(f"{name}.fandom.com")`
Initial commit or whatever 2023-09-15 03:36:56 +02:00			`self.images_dir = self.site_dir.joinpath("images")`

Refactor 2024-07-18 09:40:18 +02:00			`def set_breezewiki_url(self, breezewiki_url: str):`
			`self.breezewiki_url = breezewiki_url`
Use f-strings instead of .format() 2023-09-16 18:48:54 +02:00
Initial commit or whatever 2023-09-15 03:36:56 +02:00			`def get_hop0_urls(self) -> list:`
Use f-strings instead of .format() 2023-09-16 18:48:54 +02:00			`starting_url = f"{self.canonical_url}/wiki/Local_Sitemap"`
Initial commit or whatever 2023-09-15 03:36:56 +02:00			`hop0_urls = list()`

			`while True:`
Refactor 2024-07-18 09:40:18 +02:00			`with urlopen(starting_url) as response:`
			`response_body = response.read()`
			`decoded_body = response_body.decode("utf-8")`
			`soup = BeautifulSoup(decoded_body, "html.parser")`
feat: support all wikis on fandom.com 2023-09-16 00:43:09 +02:00			`mw_allpages_nav = soup.find_all("div", {"class": "mw-allpages-nav"})[0]`
Initial commit or whatever 2023-09-15 03:36:56 +02:00
			`if (`
			`len(mw_allpages_nav.find_all("a")) < 2`
feat: support all wikis on fandom.com 2023-09-16 00:43:09 +02:00			`and "Next page" not in mw_allpages_nav.find_all("a")[0].get_text()`
Initial commit or whatever 2023-09-15 03:36:56 +02:00			`):`
			`break`
			`else:`
			`if len(mw_allpages_nav.find_all("a")) < 2:`
Use f-strings instead of .format() 2023-09-16 18:48:54 +02:00			`starting_url = f"{self.canonical_url}{mw_allpages_nav.find_all('a')[0].get('href')}"`
Initial commit or whatever 2023-09-15 03:36:56 +02:00			`else:`
Use f-strings instead of .format() 2023-09-16 18:48:54 +02:00			`starting_url = f"{self.canonical_url}{mw_allpages_nav.find_all('a')[1].get('href')}"`
Initial commit or whatever 2023-09-15 03:36:56 +02:00
			`hop0_urls.append(starting_url)`
refactor: use rich.status and console.log 2023-09-16 22:41:09 +02:00			`console.log(starting_url)`
Initial commit or whatever 2023-09-15 03:36:56 +02:00
			`return hop0_urls`

			`def get_hop1_urls(self, hop0_urls: list):`
			`hop1_urls = [self.breezewiki_url]`

			`for url in hop0_urls:`
Refactor 2024-07-18 09:40:18 +02:00			`with urlopen(url) as response:`
			`response_body = response.read()`
			`decoded_body = response_body.decode("utf-8")`
			`soup = BeautifulSoup(decoded_body, "html.parser")`
Initial commit or whatever 2023-09-15 03:36:56 +02:00
			`for item in soup.find_all("a"):`
			`if item.get("href") and item.get("href").startswith("/wiki"):`
			`if "Local_Sitemap" not in item.get(`
			`"href"`
			`) and "Special:" not in item.get("href"):`
Fix various bugs 2024-07-16 08:59:21 +02:00			`new_url = f"{self.breezewiki_url}/{self.name}{item.get('href')}"`
Initial commit or whatever 2023-09-15 03:36:56 +02:00			`hop1_urls.append(new_url)`
refactor: use rich.status and console.log 2023-09-16 22:41:09 +02:00			`console.log(new_url)`
Initial commit or whatever 2023-09-15 03:36:56 +02:00
			`return hop1_urls`

			`def save_css(self):`
Refactor 2024-07-18 09:40:18 +02:00			`with urlopen(self.breezewiki_url) as response:`
			`response_body = response.read()`
			`decoded_body = response_body.decode("utf-8")`
			`soup = BeautifulSoup(response_body, "html.parser")`
Initial commit or whatever 2023-09-15 03:36:56 +02:00			`css_pages = list()`

			`for css in soup.find_all("link"):`
fix: check for .css file extension 2023-09-20 15:31:42 +02:00			`if css.attrs.get("href") and ".css" in css.attrs.get("href"):`
Initial commit or whatever 2023-09-15 03:36:56 +02:00			`css_url = urljoin(self.breezewiki_url, css.attrs.get("href"))`
			`css_pages.append(css_url)`

			`for page in css_pages:`
Refactor 2024-07-18 09:40:18 +02:00			`with urlopen(page) as response:`
			`response_body = response.read()`
			`decoded_body = response_body.decode("utf-8")`
Initial commit or whatever 2023-09-15 03:36:56 +02:00
Use f-strings instead of .format() 2023-09-16 18:48:54 +02:00			`css_filename = self.site_dir.joinpath(f"proxy{css_pages.index(page)}")`
Refactor 2024-07-18 09:40:18 +02:00			`with open(css_filename, "w") as outfile:`
			`outfile.write(decoded_body)`
Initial commit or whatever 2023-09-15 03:36:56 +02:00
refactor: use rich.status and console.log 2023-09-16 22:41:09 +02:00			`console.log(css_filename)`
Initial commit or whatever 2023-09-15 03:36:56 +02:00
			`def save_img(self, img_url: str):`
feat: support all wikis on fandom.com 2023-09-16 00:43:09 +02:00			`filename = self.images_dir.joinpath(Path(img_url.split("/revision")[0]).name)`
Initial commit or whatever 2023-09-15 03:36:56 +02:00			`if not filename.exists():`
Refactor 2024-07-18 09:40:18 +02:00			`urlretrieve(img_url, filename)`
refactor: use rich.status and console.log 2023-09-16 22:41:09 +02:00			`console.log(filename)`
Initial commit or whatever 2023-09-15 03:36:56 +02:00
			`def fetch_all_images(self, page_url: str):`
Refactor 2024-07-18 09:40:18 +02:00			`with urlopen(page_url) as response:`
			`response_body = response.read()`
			`decoded_body = response_body.decode("utf-8")`
			`soup = BeautifulSoup(decoded_body, "html.parser")`
Initial commit or whatever 2023-09-15 03:36:56 +02:00
			`img_tags = soup.find_all("img")`
			`img_urls = [img["src"] for img in img_tags if "src" in img.attrs]`
			`clean_img_urls = [`
feat: support all wikis on fandom.com 2023-09-16 00:43:09 +02:00			`x for x in img_urls if "breezewiki" not in x and "Wordmark" not in x`
Initial commit or whatever 2023-09-15 03:36:56 +02:00			`]`

			`for img_url in clean_img_urls:`
			`self.save_img(img_url)`

			`def save_page(self, url: str):`
Use f-strings instead of .format() 2023-09-16 18:48:54 +02:00			`filename = self.site_dir.joinpath(f"{url.split('/')[-1]}.html")`
Initial commit or whatever 2023-09-15 03:36:56 +02:00			`if not filename.exists():`
Refactor 2024-07-18 09:40:18 +02:00			`with urlopen(url) as response:`
			`response_body = response.read()`
			`decoded_body = response_body.decode("utf-8")`
			`soup = BeautifulSoup(decoded_body, "html.parser")`
Initial commit or whatever 2023-09-15 03:36:56 +02:00
			`stylesheet_count = 0`
			`for link in soup.find_all("link", {"rel": "stylesheet"}):`
			`stylesheet_count += 1`
			`link.decompose()`

			`for i in range(stylesheet_count):`
			`if soup.head:`
			`soup.head.append(`
			`soup.new_tag(`
			`"link",`
			`rel="stylesheet",`
			`type="text/css",`
Use f-strings instead of .format() 2023-09-16 18:48:54 +02:00			`href=f"proxy{i}.css",`
Initial commit or whatever 2023-09-15 03:36:56 +02:00			`)`
			`)`

			`self.fetch_all_images(url)`

			`soup.find("div", {"class": "bw-top-banner"}).extract() # type: ignore`

			`for link in soup.find_all("a"):`
			`if link.get("href") and link.get("href").startswith(`
Use f-strings instead of .format() 2023-09-16 18:48:54 +02:00			`f"/{self.name}/wiki"`
Initial commit or whatever 2023-09-15 03:36:56 +02:00			`):`
			`link_basename = link.get("href").partition("/wiki/")[2]`
Use f-strings instead of .format() 2023-09-16 18:48:54 +02:00			`link["href"] = f"{self.site_dir}/{link_basename}.html"`
Initial commit or whatever 2023-09-15 03:36:56 +02:00
			`with open(filename, "w") as outfile:`
			`outfile.write(soup.prettify())`

refactor: use rich.status and console.log 2023-09-16 22:41:09 +02:00			`console.log(filename)`
Initial commit or whatever 2023-09-15 03:36:56 +02:00
			`def fetch_all_pages(self, hop1_urls: list):`
			`with concurrent.futures.ThreadPoolExecutor() as executor:`
			`executor.map(self.save_page, hop1_urls)`

Refactor 2024-07-18 09:40:18 +02:00			`def archive_site(self):`
			`try:`
			`with urlopen(self.canonical_url) as response:`
			`response.read()`
			`except HTTPError as http_err:`
			`console.print("Oops. Something went wrong. Likely one of the following:\n")`
			`console.print("(1) The wiki you requested does not exist.")`
			`console.print("(2) You mistyped the name of the wiki.")`
			`console.print("(3) The server hosting that wiki is down for some reason.\n")`
			`console.print(f"HTTP Error: {http_err}")`
			`except URLError as url_err:`
			`console.print(f"URL Error: {url_err}")`
Initial commit or whatever 2023-09-15 03:36:56 +02:00
Refactor 2024-07-18 09:40:18 +02:00			`with console.status("Fetching hop 0 URLs...", spinner="aesthetic"):`
			`hop0_urls = self.get_hop0_urls()`
Initial commit or whatever 2023-09-15 03:36:56 +02:00
Refactor 2024-07-18 09:40:18 +02:00			`with console.status("Fetching hop 1 URLs...", spinner="aesthetic"):`
			`hop1_urls = self.get_hop1_urls(hop0_urls)`
Initial commit or whatever 2023-09-15 03:36:56 +02:00
Refactor 2024-07-18 09:40:18 +02:00			`self.site_dir.mkdir()`
			`self.images_dir.mkdir()`
Initial commit or whatever 2023-09-15 03:36:56 +02:00
Refactor 2024-07-18 09:40:18 +02:00			`with console.status("Saving CSS files...", spinner="aesthetic"):`
			`self.save_css()`
Initial commit or whatever 2023-09-15 03:36:56 +02:00
Refactor 2024-07-18 09:40:18 +02:00			`with console.status(`
			`"Downloading web pages and/or images...", spinner="aesthetic"`
			`):`
			`self.fetch_all_pages(hop1_urls)`
Initial commit or whatever 2023-09-15 03:36:56 +02:00
Refactor 2024-07-18 09:40:18 +02:00			`total_web_files = sum(1 for x in self.site_dir.iterdir() if x.is_file())`
			`total_image_files = sum(1 for x in self.images_dir.iterdir() if x.is_file())`
refactor: use rich.status and console.log 2023-09-16 22:41:09 +02:00
Refactor 2024-07-18 09:40:18 +02:00			`timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")`
refactor: use rich.status and console.log 2023-09-16 22:41:09 +02:00
Refactor 2024-07-18 09:40:18 +02:00			`with console.status("Archiving images...", spinner="aesthetic"):`
			`shutil.make_archive(f"images-{timestamp}", "xztar", self.images_dir)`
			`shutil.rmtree(self.images_dir)`
			`shutil.move(f"images-{timestamp}.tar.xz", self.site_dir)`
refactor: use rich.status and console.log 2023-09-16 22:41:09 +02:00
Refactor 2024-07-18 09:40:18 +02:00			`with console.status("Archiving web files...", spinner="aesthetic"):`
			`shutil.make_archive(f"{self.name}-{timestamp}", "gztar", self.site_dir)`
refactor: use rich.status and console.log 2023-09-16 22:41:09 +02:00
Refactor 2024-07-18 09:40:18 +02:00			`shutil.rmtree(self.site_dir)`
refactor: use rich.status and console.log 2023-09-16 22:41:09 +02:00
Refactor 2024-07-18 09:40:18 +02:00			`console.log(f"\nTotal web files archived: {total_web_files}")`
			`console.log(f"Total images archived: {total_image_files}")`
Initial commit or whatever 2023-09-15 03:36:56 +02:00

			`if __name__ == "__main__":`
Fix: allow to specify BreezeWiki instance URL 2024-04-25 23:37:18 +02:00			`args = docopt(__doc__, options_first=True, help=True, version="1.0.1")`
Refactor 2024-07-18 09:40:18 +02:00
			`site = FandomWiki(args["<fandom>"])`
Fix: allow to specify BreezeWiki instance URL 2024-04-25 23:37:18 +02:00			`if args["<breezewiki_instance>"]:`
Refactor 2024-07-27 18:39:12 +02:00			`site.set_breezewiki_url(args["<breezewiki_instance>"])`
Refactor 2024-07-18 09:40:18 +02:00
			`site.archive_site()`