archive-fandom-wiki/afw

#!/usr/bin/env python

"""archive-fandom-wiki

Usage:
    afw <fandom> <breezewiki_instance>
    afw <fandom>

Options:
    -h --help       Show this help message.
    -v --version    Show version.

Examples:
    afw dishonored https://breezewiki.nirn.quest
    afw residentevil
"""

# This file is formatted with `black -l 79' to comply with PEP8 standards.

import concurrent.futures
import shutil
import sys

from docopt import docopt

sys.tracebacklimit = 0
import tarfile
from datetime import datetime
from pathlib import Path
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from rich.console import Console
from rich.progress import Progress

console = Console()


class FandomWiki:
    """Download a Fandom wiki through a BreezeWiki mirror and archive it.

    Attributes set by ``__init__``:
        name: Subdomain of the wiki on fandom.com.
        canonical_url: ``https://<name>.fandom.com``.
        breezewiki_url: Base URL of the BreezeWiki instance, stored without
            a trailing slash.
        site_dir: Relative directory ``<name>.fandom.com`` that receives the
            saved pages and stylesheets.
        images_dir: ``site_dir / "images"``, which receives downloaded images.
    """

    def __init__(self, name: str, breezewiki_url: str):
        """Check that the wiki is reachable and create the output directories.

        Args:
            name: Subdomain of the wiki on fandom.com.
            breezewiki_url: Base URL of the BreezeWiki instance to scrape.

        Raises:
            requests.HTTPError: If the wiki's front page answers with an
                error status. A hint is printed before re-raising.
        """
        self.name = name
        self.canonical_url = f"https://{name}.fandom.com"
        # A trailing slash would put "//" into every URL built from this base.
        self.breezewiki_url = breezewiki_url.rstrip("/")
        self.site_dir = Path(f"{name}.fandom.com")
        self.images_dir = self.site_dir.joinpath("images")

        try:
            response = requests.get(self.canonical_url)
            response.raise_for_status()
        except requests.HTTPError as http_err:
            console.print("Oops. Something went wrong. Likely one of the following:\n")
            console.print("(1) The wiki you requested does not exist.")
            console.print("(2) You typed the name of the wiki incorrectly.")
            console.print("(3) The server hosting that wiki is down for some reason.\n")
            console.print(f"HTTP error: {http_err}")
            # No directories exist at this point, so every later step would
            # fail anyway; stop here instead of returning an unusable object.
            raise

        # images_dir lives inside site_dir, so this creates both.
        self.images_dir.mkdir(parents=True, exist_ok=True)

    def get_hop0_urls(self) -> list:
        """Return the URLs of all Local_Sitemap pages, first page included.

        Follows the pagination links in the first ``mw-allpages-nav`` div
        until a page offers no "Next page" link.

        Raises:
            requests.HTTPError: If a sitemap page answers with an error status.
        """
        next_url = f"{self.canonical_url}/wiki/Local_Sitemap"
        hop0_urls = []

        while True:
            # Record the page being visited so the first sitemap page (and
            # the articles it lists) is not skipped.
            hop0_urls.append(next_url)
            console.log(next_url)

            response = requests.get(next_url)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")

            nav = soup.find("div", {"class": "mw-allpages-nav"})
            if nav is None:
                # No pagination block: the sitemap fits on a single page.
                break

            nav_links = nav.find_all("a")
            if not nav_links:
                break

            if len(nav_links) < 2:
                # A lone link is either "Next page" (first page) or the link
                # back to the previous page (last page).
                if "Next page" not in nav_links[0].get_text():
                    break
                next_link = nav_links[0]
            else:
                # With two or more links the second one leads forward.
                next_link = nav_links[1]

            next_url = f"{self.canonical_url}{next_link.get('href')}"

        return hop0_urls

    def get_hop1_urls(self, hop0_urls: list):
        """Collect the BreezeWiki URL of every article linked from the sitemap.

        Args:
            hop0_urls: Sitemap page URLs, as returned by ``get_hop0_urls``.

        Returns:
            A list starting with the BreezeWiki base URL followed by one URL
            per distinct ``/wiki`` link, in order of first appearance.
            Sitemap and ``Special:`` links are skipped.

        Raises:
            requests.HTTPError: If a sitemap page answers with an error status.
        """
        hop1_urls = [self.breezewiki_url]
        seen = set(hop1_urls)

        for url in hop0_urls:
            response = requests.get(url)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")

            for anchor in soup.find_all("a"):
                href = anchor.get("href")
                if not href or not href.startswith("/wiki"):
                    continue
                if "Local_Sitemap" in href or "Special:" in href:
                    continue

                new_url = f"{self.breezewiki_url}/{self.name}{href}"
                if new_url in seen:
                    # The same article is often linked more than once.
                    continue
                seen.add(new_url)
                hop1_urls.append(new_url)
                console.log(new_url)

        return hop1_urls

    def save_css(self):
        """Download the BreezeWiki stylesheets into ``site_dir``.

        Each stylesheet is stored as ``proxy<N>.css``, the name that
        ``save_page`` writes into the rewritten ``<link>`` tags.

        Raises:
            requests.HTTPError: If the front page or a stylesheet answers
                with an error status.
        """
        response = requests.get(self.breezewiki_url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Resolve hrefs against the URL the document was actually served from.
        css_urls = [
            urljoin(response.url, link.attrs["href"])
            for link in soup.find_all("link")
            if link.attrs.get("href") and ".css" in link.attrs["href"]
        ]

        for index, css_url in enumerate(css_urls):
            css_response = requests.get(css_url)
            css_response.raise_for_status()

            css_filename = self.site_dir.joinpath(f"proxy{index}.css")
            with open(css_filename, "wb") as outfile:
                outfile.write(css_response.content)

            console.log(css_filename)

    def save_img(self, img_url: str):
        """Download one image into ``images_dir`` unless it already exists.

        The file name is the last path component of ``img_url`` after
        cutting everything from ``/revision`` onwards.

        Raises:
            requests.RequestException: If the download fails.
        """
        filename = self.images_dir.joinpath(Path(img_url.split("/revision")[0]).name)
        if filename.exists():
            return

        # The context manager releases the streamed connection even when
        # writing fails part-way through.
        with requests.get(img_url, stream=True) as response:
            response.raise_for_status()

            with open(filename, "wb") as outfile:
                for chunk in response.iter_content(chunk_size=8192):
                    outfile.write(chunk)

        console.log(filename)

    def fetch_all_images(self, page_url: str):
        """Fetch ``page_url`` and download the images it references.

        Raises:
            requests.HTTPError: If the page itself answers with an error
                status. Failures of individual images are logged, not raised.
        """
        response = requests.get(page_url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
        self._save_images_in(soup)

    def _save_images_in(self, soup):
        """Download every wanted ``<img>`` source found in a parsed page."""
        for img in soup.find_all("img"):
            src = img.attrs.get("src")
            if not src:
                continue
            # Skip images whose URL mentions BreezeWiki or "Wordmark".
            if "breezewiki" in src or "Wordmark" in src:
                continue
            try:
                self.save_img(src)
            except requests.RequestException as err:
                # One unreachable image must not cost us the whole page.
                console.log(f"Failed to save image {src}: {err}", markup=False)

    def save_page(self, url: str):
        """Save one BreezeWiki page, with its images, as a local HTML file.

        The page is skipped when its target file already exists. Stylesheet
        links are replaced by links to the local ``proxy<N>.css`` files, the
        BreezeWiki top banner is removed and intra-wiki links are pointed at
        the saved ``.html`` files.

        Raises:
            requests.HTTPError: If the page answers with an error status.
        """
        filename = self.site_dir.joinpath(f"{url.split('/')[-1]}.html")
        if filename.exists():
            return

        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Drop the remote stylesheets first, then add exactly one local
        # replacement per removed link.
        stylesheets = soup.find_all("link", {"rel": "stylesheet"})
        for link in stylesheets:
            link.decompose()

        if soup.head:
            for i in range(len(stylesheets)):
                soup.head.append(
                    soup.new_tag(
                        "link",
                        rel="stylesheet",
                        type="text/css",
                        href=f"proxy{i}.css",
                    )
                )

        # Reuse the page we already parsed instead of downloading it again.
        self._save_images_in(soup)

        banner = soup.find("div", {"class": "bw-top-banner"})
        if banner is not None:
            banner.extract()

        # NOTE(review): the rewritten href is prefixed with site_dir, so it
        # is relative to the directory that contains site_dir, not to the
        # saved page itself -- confirm that is the intended browsing root.
        wiki_prefix = f"/{self.name}/wiki"
        for link in soup.find_all("a"):
            href = link.get("href")
            if href and href.startswith(wiki_prefix):
                link_basename = href.partition("/wiki/")[2]
                link["href"] = f"{self.site_dir}/{link_basename}.html"

        # Be explicit about the encoding; the platform default may not be
        # able to represent every character of the page.
        with open(filename, "w", encoding="utf-8") as outfile:
            outfile.write(soup.prettify())

        console.log(filename)

    def fetch_all_pages(self, hop1_urls: list):
        """Save every page in ``hop1_urls`` using a thread pool.

        A page that fails is logged and skipped so the remaining pages are
        still archived.
        """
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = {
                executor.submit(self.save_page, url): url for url in hop1_urls
            }
            for future in concurrent.futures.as_completed(futures):
                try:
                    future.result()
                except Exception as err:
                    # Worker boundary: report the failure instead of letting
                    # it disappear inside the executor.
                    console.log(
                        f"Failed to save {futures[future]}: {err}",
                        markup=False,
                    )

    def archive(self):
        """Pack the images and the site directory into tarballs, then clean up.

        Images go into ``<images_dir>-<timestamp>.tar.xz`` and the contents
        of ``site_dir`` into ``<site_dir>-<timestamp>.tar.gz``. Both source
        directories are deleted afterwards.
        """
        timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")

        # NOTE(review): this path is inside site_dir, so the image archive
        # is bundled into the web archive below and removed together with
        # site_dir -- confirm that is intended.
        img_archive_filename = f"{self.images_dir}-{timestamp}.tar.xz"
        images = [img for img in self.images_dir.iterdir() if img.is_file()]
        num_of_imgs = len(images)

        with Progress() as progress:
            task = progress.add_task("[cyan]Archiving images...", total=num_of_imgs)

            with tarfile.open(img_archive_filename, "w:xz") as tar:
                # Keep the directory entry, then add the images one by one so
                # the progress bar reflects the real amount of work done.
                tar.add(self.images_dir, recursive=False)
                for img in images:
                    tar.add(img)
                    progress.update(task, advance=1)

        shutil.rmtree(self.images_dir, ignore_errors=True)

        # iterdir() already yields paths that include site_dir.
        web_files = [
            entry
            for entry in self.site_dir.iterdir()
            if entry.is_file() or entry.is_dir()
        ]

        web_archive_filename = f"{self.site_dir}-{timestamp}.tar.gz"

        with Progress() as progress:
            task = progress.add_task(
                "[cyan]Archiving web files...", total=len(web_files)
            )

            with tarfile.open(web_archive_filename, "w:gz") as tar:
                for web_file in web_files:
                    tar.add(web_file)
                    progress.update(task, advance=1)

        shutil.rmtree(self.site_dir, ignore_errors=True)

        console.log(f"\nTotal web files scraped: {len(web_files)}")
        console.log(f"Total images scraped: {num_of_imgs}")


def archive_site(name: str, breezewiki_url: str = "https://breezewiki.nirn.quest"):
    """Run the full pipeline for one wiki: discover, download, archive.

    Args:
        name: Subdomain of the wiki on fandom.com.
        breezewiki_url: Base URL of the BreezeWiki instance to scrape.
    """
    wiki = FandomWiki(name, breezewiki_url)
    spinner_style = "aesthetic"

    # Stage 1: walk the paginated sitemap.
    with console.status("Fetching hop 0 URLs...", spinner=spinner_style):
        sitemap_pages = wiki.get_hop0_urls()

    # Stage 2: turn the sitemap entries into page URLs.
    with console.status("Fetching hop 1 URLs...", spinner=spinner_style):
        page_urls = wiki.get_hop1_urls(sitemap_pages)

    # Stage 3: stylesheets first, then the pages and their images.
    with console.status("Saving CSS files...", spinner=spinner_style):
        wiki.save_css()

    with console.status("Downloading images and web pages...", spinner=spinner_style):
        wiki.fetch_all_pages(page_urls)

    # Stage 4: pack everything up; archive() draws its own progress bars.
    wiki.archive()


if __name__ == "__main__":
    # docopt builds the command line from the module docstring.
    cli_args = docopt(__doc__, options_first=True, help=True, version="1.0.1")
    fandom = cli_args["<fandom>"]
    instance = cli_args["<breezewiki_instance>"]

    if not instance:
        # No instance given: rely on archive_site's default BreezeWiki URL.
        archive_site(fandom)
    else:
        archive_site(fandom, instance)
Initial commit or whatever 2023-09-15 03:36:56 +02:00			`#!/usr/bin/env python`

Fix: allow to specify BreezeWiki instance URL 2024-04-25 23:37:18 +02:00			`"""archive-fandom-wiki`

			`Usage:`
			`afw <fandom> <breezewiki_instance>`
			`afw <fandom>`

			`Options:`
			`-h --help Show this help message.`
			`-v --version Show version.`

			`Examples:`
			`afw dishonored https://breezewiki.nirn.quest`
			`afw residentevil`
			`"""`

Initial commit or whatever 2023-09-15 03:36:56 +02:00			# This file is formatted with `black -l 79' to comply with PEP8 standards.

			`import concurrent.futures`
			`import shutil`
			`import sys`
feat: support all wikis on fandom.com 2023-09-16 00:48:20 +02:00
Fix: allow to specify BreezeWiki instance URL 2024-04-25 23:37:18 +02:00			`from docopt import docopt`

feat: support all wikis on fandom.com 2023-09-16 00:48:20 +02:00			`sys.tracebacklimit = 0`
Initial commit or whatever 2023-09-15 03:36:56 +02:00			`import tarfile`
			`from datetime import datetime`
			`from pathlib import Path`
			`from urllib.parse import urljoin`

			`import requests`
			`from bs4 import BeautifulSoup`
			`from rich.console import Console`
			`from rich.progress import Progress`

			`console = Console()`


			`class FandomWiki:`
Fix: allow to specify BreezeWiki instance URL 2024-04-25 23:37:18 +02:00			`def __init__(self, name: str, breezewiki_url: str):`
Initial commit or whatever 2023-09-15 03:36:56 +02:00			`self.name = name`
Use f-strings instead of .format() 2023-09-16 18:48:54 +02:00			`self.canonical_url = f"https://{name}.fandom.com"`
Fix: allow to specify BreezeWiki instance URL 2024-04-25 23:37:18 +02:00			`self.breezewiki_url = breezewiki_url`
Fix various bugs 2024-07-16 08:59:21 +02:00			`self.site_dir = Path(f"{name}.fandom.com")`
Initial commit or whatever 2023-09-15 03:36:56 +02:00			`self.images_dir = self.site_dir.joinpath("images")`

feat: support all wikis on fandom.com 2023-09-16 00:43:09 +02:00			`try:`
			`response = requests.get(self.canonical_url)`
			`response.raise_for_status()`
			`except requests.HTTPError as http_err:`
feat: support all wikis on fandom.com 2023-09-16 00:48:20 +02:00			`console.print("Oops. Something went wrong. Likely one of the following:\n")`
feat: support all wikis on fandom.com 2023-09-16 00:43:09 +02:00			`console.print("(1) The wiki you requested does not exist.")`
			`console.print("(2) You typed the name of the wiki incorrectly.")`
feat: support all wikis on fandom.com 2023-09-16 00:48:20 +02:00			`console.print("(3) The server hosting that wiki is down for some reason.\n")`
feat: support all wikis on fandom.com 2023-09-16 00:43:09 +02:00			`console.print(f"HTTP error: {http_err}")`
fix: put mkdir in else clause for init 2023-09-16 22:05:28 +02:00			`else:`
			`if not self.site_dir.exists():`
			`self.site_dir.mkdir()`
feat: support all wikis on fandom.com 2023-09-16 00:43:09 +02:00
fix: put mkdir in else clause for init 2023-09-16 22:05:28 +02:00			`if not self.images_dir.exists():`
			`self.images_dir.mkdir()`
Use f-strings instead of .format() 2023-09-16 18:48:54 +02:00
Initial commit or whatever 2023-09-15 03:36:56 +02:00			`def get_hop0_urls(self) -> list:`
Use f-strings instead of .format() 2023-09-16 18:48:54 +02:00			`starting_url = f"{self.canonical_url}/wiki/Local_Sitemap"`
Initial commit or whatever 2023-09-15 03:36:56 +02:00			`hop0_urls = list()`

			`while True:`
			`response = requests.get(starting_url)`
			`response.raise_for_status()`
			`soup = BeautifulSoup(response.text, "html.parser")`
feat: support all wikis on fandom.com 2023-09-16 00:43:09 +02:00			`mw_allpages_nav = soup.find_all("div", {"class": "mw-allpages-nav"})[0]`
Initial commit or whatever 2023-09-15 03:36:56 +02:00
			`if (`
			`len(mw_allpages_nav.find_all("a")) < 2`
feat: support all wikis on fandom.com 2023-09-16 00:43:09 +02:00			`and "Next page" not in mw_allpages_nav.find_all("a")[0].get_text()`
Initial commit or whatever 2023-09-15 03:36:56 +02:00			`):`
			`break`
			`else:`
			`if len(mw_allpages_nav.find_all("a")) < 2:`
Use f-strings instead of .format() 2023-09-16 18:48:54 +02:00			`starting_url = f"{self.canonical_url}{mw_allpages_nav.find_all('a')[0].get('href')}"`
Initial commit or whatever 2023-09-15 03:36:56 +02:00			`else:`
Use f-strings instead of .format() 2023-09-16 18:48:54 +02:00			`starting_url = f"{self.canonical_url}{mw_allpages_nav.find_all('a')[1].get('href')}"`
Initial commit or whatever 2023-09-15 03:36:56 +02:00
			`hop0_urls.append(starting_url)`
refactor: use rich.status and console.log 2023-09-16 22:41:09 +02:00			`console.log(starting_url)`
Initial commit or whatever 2023-09-15 03:36:56 +02:00
			`return hop0_urls`

			`def get_hop1_urls(self, hop0_urls: list):`
			`hop1_urls = [self.breezewiki_url]`

			`for url in hop0_urls:`
			`response = requests.get(url)`
			`response.raise_for_status()`
			`soup = BeautifulSoup(response.text, "html.parser")`

			`for item in soup.find_all("a"):`
			`if item.get("href") and item.get("href").startswith("/wiki"):`
			`if "Local_Sitemap" not in item.get(`
			`"href"`
			`) and "Special:" not in item.get("href"):`
Fix various bugs 2024-07-16 08:59:21 +02:00			`new_url = f"{self.breezewiki_url}/{self.name}{item.get('href')}"`
Initial commit or whatever 2023-09-15 03:36:56 +02:00			`hop1_urls.append(new_url)`
refactor: use rich.status and console.log 2023-09-16 22:41:09 +02:00			`console.log(new_url)`
Initial commit or whatever 2023-09-15 03:36:56 +02:00
			`return hop1_urls`

			`def save_css(self):`
			`response = requests.get(self.breezewiki_url)`
			`response.raise_for_status()`
			`soup = BeautifulSoup(response.text, "html.parser")`
			`css_pages = list()`

			`for css in soup.find_all("link"):`
fix: check for .css file extension 2023-09-20 15:31:42 +02:00			`if css.attrs.get("href") and ".css" in css.attrs.get("href"):`
Initial commit or whatever 2023-09-15 03:36:56 +02:00			`css_url = urljoin(self.breezewiki_url, css.attrs.get("href"))`
			`css_pages.append(css_url)`

			`for page in css_pages:`
			`response = requests.get(page)`
			`response.raise_for_status()`

Use f-strings instead of .format() 2023-09-16 18:48:54 +02:00			`css_filename = self.site_dir.joinpath(f"proxy{css_pages.index(page)}")`
Initial commit or whatever 2023-09-15 03:36:56 +02:00			`with open(css_filename, "wb") as outfile:`
			`outfile.write(response.content)`

refactor: use rich.status and console.log 2023-09-16 22:41:09 +02:00			`console.log(css_filename)`
Initial commit or whatever 2023-09-15 03:36:56 +02:00
			`def save_img(self, img_url: str):`
feat: support all wikis on fandom.com 2023-09-16 00:43:09 +02:00			`filename = self.images_dir.joinpath(Path(img_url.split("/revision")[0]).name)`
Initial commit or whatever 2023-09-15 03:36:56 +02:00			`if not filename.exists():`
			`response = requests.get(img_url, stream=True)`
			`response.raise_for_status()`

			`with open(filename, "wb") as outfile:`
			`for chunk in response.iter_content(chunk_size=8192):`
			`outfile.write(chunk)`

refactor: use rich.status and console.log 2023-09-16 22:41:09 +02:00			`console.log(filename)`
Initial commit or whatever 2023-09-15 03:36:56 +02:00
			`def fetch_all_images(self, page_url: str):`
			`response = requests.get(page_url)`
			`response.raise_for_status()`
			`soup = BeautifulSoup(response.content, "html.parser")`

			`img_tags = soup.find_all("img")`
			`img_urls = [img["src"] for img in img_tags if "src" in img.attrs]`
			`clean_img_urls = [`
feat: support all wikis on fandom.com 2023-09-16 00:43:09 +02:00			`x for x in img_urls if "breezewiki" not in x and "Wordmark" not in x`
Initial commit or whatever 2023-09-15 03:36:56 +02:00			`]`

			`for img_url in clean_img_urls:`
			`self.save_img(img_url)`

			`def save_page(self, url: str):`
Use f-strings instead of .format() 2023-09-16 18:48:54 +02:00			`filename = self.site_dir.joinpath(f"{url.split('/')[-1]}.html")`
Initial commit or whatever 2023-09-15 03:36:56 +02:00			`if not filename.exists():`
			`response = requests.get(url)`
			`response.raise_for_status()`
			`soup = BeautifulSoup(response.text, "html.parser")`

			`stylesheet_count = 0`
			`for link in soup.find_all("link", {"rel": "stylesheet"}):`
			`stylesheet_count += 1`
			`link.decompose()`

			`for i in range(stylesheet_count):`
			`if soup.head:`
			`soup.head.append(`
			`soup.new_tag(`
			`"link",`
			`rel="stylesheet",`
			`type="text/css",`
Use f-strings instead of .format() 2023-09-16 18:48:54 +02:00			`href=f"proxy{i}.css",`
Initial commit or whatever 2023-09-15 03:36:56 +02:00			`)`
			`)`

			`self.fetch_all_images(url)`

			`soup.find("div", {"class": "bw-top-banner"}).extract() # type: ignore`

			`for link in soup.find_all("a"):`
			`if link.get("href") and link.get("href").startswith(`
Use f-strings instead of .format() 2023-09-16 18:48:54 +02:00			`f"/{self.name}/wiki"`
Initial commit or whatever 2023-09-15 03:36:56 +02:00			`):`
			`link_basename = link.get("href").partition("/wiki/")[2]`
Use f-strings instead of .format() 2023-09-16 18:48:54 +02:00			`link["href"] = f"{self.site_dir}/{link_basename}.html"`
Initial commit or whatever 2023-09-15 03:36:56 +02:00
			`with open(filename, "w") as outfile:`
			`outfile.write(soup.prettify())`

refactor: use rich.status and console.log 2023-09-16 22:41:09 +02:00			`console.log(filename)`
Initial commit or whatever 2023-09-15 03:36:56 +02:00
			`def fetch_all_pages(self, hop1_urls: list):`
			`with concurrent.futures.ThreadPoolExecutor() as executor:`
			`executor.map(self.save_page, hop1_urls)`

			`def archive(self):`
			`timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")`

Use f-strings instead of .format() 2023-09-16 18:48:54 +02:00			`img_archive_filename = f"{self.images_dir}-{timestamp}.tar.xz"`
Fix various bugs 2024-07-16 08:59:21 +02:00			`num_of_imgs = sum(1 for img in self.images_dir.iterdir() if img.is_file())`
Initial commit or whatever 2023-09-15 03:36:56 +02:00
			`with Progress() as progress:`
Fix various bugs 2024-07-16 08:59:21 +02:00			`task = progress.add_task("[cyan]Archiving images...", total=num_of_imgs)`
Initial commit or whatever 2023-09-15 03:36:56 +02:00
			`with tarfile.open(img_archive_filename, "w:xz") as tar:`
Fix various bugs 2024-07-16 08:59:21 +02:00			`tar.add(self.images_dir)`
			`progress.update(task, advance=1)`
Initial commit or whatever 2023-09-15 03:36:56 +02:00
			`progress.stop()`

			`shutil.rmtree(self.images_dir, ignore_errors=True)`

			`web_files = [`
			`f`
			`for f in self.site_dir.iterdir()`
feat: support all wikis on fandom.com 2023-09-16 00:43:09 +02:00			`if self.site_dir.joinpath(f).is_file or self.site_dir.joinpath(f).is_dir()`
Initial commit or whatever 2023-09-15 03:36:56 +02:00			`]`

Use f-strings instead of .format() 2023-09-16 18:48:54 +02:00			`web_archive_filename = f"{self.site_dir}-{timestamp}.tar.gz"`
Initial commit or whatever 2023-09-15 03:36:56 +02:00
			`with Progress() as progress:`
			`task = progress.add_task(`
			`"[cyan]Archiving web files...", total=len(web_files)`
			`)`

			`with tarfile.open(web_archive_filename, "w:gz") as tar:`
			`for web_file in web_files:`
			`if progress.finished:`
			`break`
Fix various bugs 2024-07-16 08:59:21 +02:00			`tar.add(web_file, arcname=web_file)`
Initial commit or whatever 2023-09-15 03:36:56 +02:00			`progress.update(task, advance=1)`

			`progress.stop()`

			`shutil.rmtree(self.site_dir, ignore_errors=True)`

refactor: use rich.status and console.log 2023-09-16 22:41:09 +02:00			`console.log(f"\nTotal web files scraped: {len(web_files)}")`
Fix various bugs 2024-07-16 08:59:21 +02:00			`console.log(f"Total images scraped: {num_of_imgs}")`
Initial commit or whatever 2023-09-15 03:36:56 +02:00

Fix: allow to specify BreezeWiki instance URL 2024-04-25 23:37:18 +02:00			`def archive_site(name: str, breezewiki_url: str = "https://breezewiki.nirn.quest"):`
			`site = FandomWiki(name, breezewiki_url)`
refactor: use rich.status and console.log 2023-09-16 22:41:09 +02:00
			`with console.status("Fetching hop 0 URLs...", spinner="aesthetic"):`
			`hop0_urls = site.get_hop0_urls()`

			`with console.status("Fetching hop 1 URLs...", spinner="aesthetic"):`
			`hop1_urls = site.get_hop1_urls(hop0_urls)`

			`with console.status("Saving CSS files...", spinner="aesthetic"):`
			`site.save_css()`

			`with console.status("Downloading images and web pages...", spinner="aesthetic"):`
			`site.fetch_all_pages(hop1_urls)`

Initial commit or whatever 2023-09-15 03:36:56 +02:00			`site.archive()`


			`if __name__ == "__main__":`
Fix: allow to specify BreezeWiki instance URL 2024-04-25 23:37:18 +02:00			`args = docopt(__doc__, options_first=True, help=True, version="1.0.1")`
			`if args["<breezewiki_instance>"]:`
			`archive_site(args["<fandom>"], args["<breezewiki_instance>"])`
Initial commit or whatever 2023-09-15 03:36:56 +02:00			`else:`
Fix: allow to specify BreezeWiki instance URL 2024-04-25 23:37:18 +02:00			`archive_site(args["<fandom>"])`