Move archive-fandom-wiki to own repo

2023-09-14 20:26:26 -05:00 · 2023-09-14 20:26:26 -05:00 · 9c9d70b727
commit 9c9d70b727
parent 3dd3a0b6fa
1 changed files with 0 additions and 324 deletions
--- a/324
+++ b/324
@ -1,324 +0,0 @@
-#!/usr/bin/env python
-
-# This program archives the content of fandom wikis.
-#
-# It's pretty much feature-complete. I still have to add detailed comments to
-# describe what each significant piece of code is doing.
-#
-# This program doesn't scrape from the fandom.com wiki sites directly; rather,
-# it uses my BreezeWiki instance to avoid downloading unnecessary ads, images,
-# and other junk.
-#
-# Each resulting archive is self-contained, meaning one can extract the
-# contents and browse the wiki snapshot locally (offline). The URLs for CSS,
-# images, and links in each page are replaced by the file:/// URLs for their
-# corresponding pages on the local filesystem.
-#
-# This file is formatted with `black -l 79' to comply with PEP8 standards.
-
-import concurrent.futures
-import shutil
-import sys
-import tarfile
-from datetime import datetime
-from pathlib import Path
-from urllib.parse import urljoin
-
-import requests
-from bs4 import BeautifulSoup
-from rich.console import Console
-from rich.progress import Progress
-from rich.tree import Tree
-
-console = Console()
-
-
-class FandomWiki:
-    """Archive one fandom.com wiki by scraping it through BreezeWiki.
-
-    The list of pages is read from the wiki's ``Local_Sitemap`` on
-    fandom.com; the pages, stylesheets and images are downloaded from the
-    BreezeWiki instance. Everything is written below ``site_dir`` in the
-    current working directory and finally packed by ``archive``.
-
-    Attributes:
-        name: wiki identifier, used as the subdomain on fandom.com.
-        canonical_url: base URL of the wiki on fandom.com.
-        breezewiki_url: base URL of the wiki on the BreezeWiki instance.
-        site_dir: output directory for pages and stylesheets.
-        images_dir: output directory for images, inside ``site_dir``.
-    """
-
-    # Seconds to wait on a server before giving up. Without a timeout
-    # ``requests`` can block forever on a stalled connection.
-    request_timeout = 30
-
-    def __init__(self, name: str):
-        self.name = name
-        self.canonical_url = "https://{}.fandom.com".format(name)
-        self.breezewiki_url = "https://wiki.hyperreal.coffee/{}".format(name)
-        self.site_dir = Path.cwd().joinpath("{}.fandom.com".format(name))
-        self.images_dir = self.site_dir.joinpath("images")
-
-        # images_dir lives inside site_dir, so one call creates both;
-        # exist_ok avoids the check-then-create race of exists()/mkdir().
-        self.images_dir.mkdir(parents=True, exist_ok=True)
-
-    def _get(self, url: str, **kwargs):
-        """GET ``url``; raise ``requests.HTTPError`` on an error status."""
-        kwargs.setdefault("timeout", self.request_timeout)
-        response = requests.get(url, **kwargs)
-        response.raise_for_status()
-        return response
-
-    def get_hop0_urls(self) -> list:
-        """Return the URL of every ``Local_Sitemap`` page of the wiki.
-
-        The sitemap is paginated. Starting at the first page, the
-        "Next page" link of the ``mw-allpages-nav`` block is followed
-        until a page without such a link is reached.
-
-        Raises:
-            requests.RequestException: a sitemap page could not be fetched.
-        """
-        page_url = "{}/wiki/Local_Sitemap".format(self.canonical_url)
-        hop0_urls = []
-
-        while True:
-            # Record the page being visited, so the first sitemap page is
-            # part of the result too and its links are not lost.
-            hop0_urls.append(page_url)
-            console.print("[[bold]HOP 0[/bold]] {}".format(page_url))
-
-            soup = BeautifulSoup(self._get(page_url).text, "html.parser")
-            nav = soup.find("div", {"class": "mw-allpages-nav"})
-            if nav is None:
-                # No navigation block at all: there is nothing to follow.
-                break
-
-            next_href = None
-            for anchor in nav.find_all("a"):
-                if "Next page" in anchor.get_text() and anchor.get("href"):
-                    next_href = anchor.get("href")
-            if next_href is None:
-                # Only a "previous" link is left: this was the last page.
-                break
-
-            page_url = urljoin(self.canonical_url, next_href)
-
-        return hop0_urls
-
-    def get_hop1_urls(self, hop0_urls: list) -> list:
-        """Return the BreezeWiki URLs of the pages listed in the sitemap.
-
-        Args:
-            hop0_urls: sitemap page URLs, as returned by ``get_hop0_urls``.
-
-        Returns:
-            The BreezeWiki front page of the wiki followed by one URL per
-            distinct ``/wiki...`` link, in order of first appearance.
-            Links containing ``Local_Sitemap`` or ``Special:`` are skipped.
-
-        Raises:
-            requests.RequestException: a sitemap page could not be fetched.
-        """
-        # A dict serves as an insertion-ordered set: a URL saved twice
-        # would have two worker threads writing the same file.
-        hop1_urls = {self.breezewiki_url: None}
-
-        for url in hop0_urls:
-            soup = BeautifulSoup(self._get(url).text, "html.parser")
-
-            for anchor in soup.find_all("a"):
-                href = anchor.get("href")
-                if not href or not href.startswith("/wiki"):
-                    continue
-                if "Local_Sitemap" in href or "Special:" in href:
-                    continue
-
-                new_url = "{}{}".format(self.breezewiki_url, href)
-                if new_url not in hop1_urls:
-                    hop1_urls[new_url] = None
-                    console.print("[[bold]HOP 1[/bold]] {}".format(new_url))
-
-        return list(hop1_urls)
-
-    def save_css(self):
-        """Download the stylesheets of the BreezeWiki front page.
-
-        They are stored in ``site_dir`` as ``proxy0.css``, ``proxy1.css``,
-        ... in document order; ``save_page`` points its rewritten
-        ``<link>`` tags at these names.
-        """
-        response = self._get(self.breezewiki_url)
-        soup = BeautifulSoup(response.text, "html.parser")
-
-        # Same selector as save_page, so that icons and other
-        # non-stylesheet <link> tags cannot shift the numbering.
-        css_urls = [
-            urljoin(self.breezewiki_url, link.get("href"))
-            for link in soup.find_all("link", {"rel": "stylesheet"})
-            if link.get("href")
-        ]
-
-        for index, css_url in enumerate(css_urls):
-            css_filename = self.site_dir.joinpath(
-                "proxy{}.css".format(index)
-            )
-            css_filename.write_bytes(self._get(css_url).content)
-
-            console.print(
-                "[[bold green]CSS[/bold green]] {}".format(css_filename)
-            )
-
-    def save_img(self, img_url: str):
-        """Download one image into ``images_dir`` unless already present.
-
-        The file name is the last path component of ``img_url`` before
-        any ``/revision`` suffix.
-        """
-        filename = self.images_dir.joinpath(
-            Path(img_url.split("/revision")[0]).name
-        )
-        if filename.exists():
-            console.print(
-                "[[bold yellow]IMG (EXISTS)[/bold yellow]] {}".format(filename)
-            )
-            return
-
-        # Streamed in chunks so an image is never held in memory as a
-        # whole; the with-block closes the response when done.
-        with self._get(img_url, stream=True) as response:
-            with open(filename, "wb") as outfile:
-                for chunk in response.iter_content(chunk_size=8192):
-                    outfile.write(chunk)
-
-        console.print("[[bold green]IMG[/bold green]] {}".format(filename))
-
-    def _save_images(self, soup):
-        """Save the images referenced by an already parsed page.
-
-        Sources containing ``breezewiki`` or ``Wordmark`` are skipped.
-        """
-        for img in soup.find_all("img"):
-            src = img.get("src")
-            if src and "breezewiki" not in src and "Wordmark" not in src:
-                self.save_img(src)
-
-    def fetch_all_images(self, page_url: str):
-        """Fetch ``page_url`` and save the images it references."""
-        response = self._get(page_url)
-        self._save_images(BeautifulSoup(response.content, "html.parser"))
-
-    def save_page(self, url: str):
-        """Save one BreezeWiki page, with its images, for offline use.
-
-        The page is written to ``site_dir`` as the last path component of
-        ``url`` plus ``.html``; a page already on disk is skipped. Before
-        writing, the stylesheet links are replaced by links to the local
-        ``proxyN.css`` files, the ``bw-top-banner`` div is removed if
-        present, and links to other pages of this wiki are pointed at
-        their ``.html`` files in ``site_dir``.
-        """
-        filename = self.site_dir.joinpath("{}.html".format(url.split("/")[-1]))
-        if filename.exists():
-            console.print(
-                "[[bold yellow]HTML (EXISTS)[/bold yellow]] {}".format(
-                    filename
-                )
-            )
-            return
-
-        soup = BeautifulSoup(self._get(url).text, "html.parser")
-
-        # Remove every remote stylesheet first and only then add the
-        # local ones, exactly one per removed link.
-        stylesheets = soup.find_all("link", {"rel": "stylesheet"})
-        for link in stylesheets:
-            link.decompose()
-        if soup.head:
-            for i in range(len(stylesheets)):
-                soup.head.append(
-                    soup.new_tag(
-                        "link",
-                        rel="stylesheet",
-                        type="text/css",
-                        href="proxy{}.css".format(i),
-                    )
-                )
-
-        # Reuse the parsed page instead of downloading it a second time.
-        self._save_images(soup)
-
-        banner = soup.find("div", {"class": "bw-top-banner"})
-        if banner is not None:
-            banner.extract()
-
-        wiki_prefix = "/{}/wiki".format(self.name)
-        for link in soup.find_all("a"):
-            href = link.get("href")
-            if href and href.startswith(wiki_prefix):
-                link_basename = href.partition("/wiki/")[2]
-                link["href"] = "{}/{}.html".format(
-                    self.site_dir, link_basename
-                )
-
-        # Explicit encoding: the platform default cannot represent every
-        # character of every page on all systems.
-        with open(filename, "w", encoding="utf-8") as outfile:
-            outfile.write(soup.prettify())
-
-        console.print("[[bold green]HTML[/bold green]] {}".format(filename))
-
-    def fetch_all_pages(self, hop1_urls: list):
-        """Save the stylesheets, then every page of ``hop1_urls``.
-
-        Pages are saved concurrently on a thread pool. A page that fails
-        is reported and does not stop the remaining downloads.
-        """
-        self.save_css()
-
-        with concurrent.futures.ThreadPoolExecutor() as executor:
-            pending = {
-                executor.submit(self.save_page, url): url for url in hop1_urls
-            }
-            # Each future is inspected: an exception raised in a worker
-            # is otherwise dropped without any trace.
-            for future in concurrent.futures.as_completed(pending):
-                error = future.exception()
-                if error is not None:
-                    # markup=False: the message is arbitrary text and
-                    # must not be parsed as console markup.
-                    console.print(
-                        "ERROR {}: {}".format(pending[future], error),
-                        style="bold red",
-                        markup=False,
-                    )
-
-    def _pack(self, files: list, archive_name: str, mode: str, label: str):
-        """Add ``files`` to a new tar archive, showing a progress bar."""
-        with Progress() as progress:
-            task = progress.add_task(label, total=len(files))
-
-            with tarfile.open(archive_name, mode) as tar:
-                for path in files:
-                    tar.add(path, arcname=path)
-                    progress.update(task, advance=1)
-
-    def archive(self):
-        """Pack the downloaded wiki into tarballs and delete the work dirs.
-
-        The images go into ``images-<timestamp>.tar.xz`` inside
-        ``site_dir`` and ``images_dir`` is removed. Everything then left
-        in ``site_dir`` goes into ``<site_dir>-<timestamp>.tar.gz`` and
-        ``site_dir`` is removed. Finally the two file counts are printed.
-        """
-        timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
-
-        # iterdir() yields paths that already include the directory, so
-        # they can be tested and added as they are.
-        img_files = [f for f in self.images_dir.iterdir() if f.is_file()]
-        self._pack(
-            img_files,
-            "{}-{}.tar.xz".format(self.images_dir, timestamp),
-            "w:xz",
-            "[cyan]Archiving images...",
-        )
-        shutil.rmtree(self.images_dir, ignore_errors=True)
-
-        web_files = [
-            f for f in self.site_dir.iterdir() if f.is_file() or f.is_dir()
-        ]
-        self._pack(
-            web_files,
-            "{}-{}.tar.gz".format(self.site_dir, timestamp),
-            "w:gz",
-            "[cyan]Archiving web files...",
-        )
-        shutil.rmtree(self.site_dir, ignore_errors=True)
-
-        console.print("\nTotal web files scraped: {}".format(len(web_files)))
-        console.print("Total images scraped: {}".format(len(img_files)))
-
-
-def archive_site(name: str):
-    """Download the wiki called ``name`` and pack it into archives."""
-    wiki = FandomWiki(name)
-    sitemap_pages = wiki.get_hop0_urls()
-    page_urls = wiki.get_hop1_urls(sitemap_pages)
-    wiki.fetch_all_pages(page_urls)
-    wiki.archive()
-
-
-# Wiki names accepted on the command line. Kept in one place so the usage
-# text and the argument check below cannot drift apart.
-SUPPORTED_WIKIS = (
-    "cyberpunk",
-    "dishonored",
-    "dragonage",
-    "forgottenrealms",
-    "masseffect",
-    "residentevil",
-)
-
-
-def usage_message():
-    """Print the command synopsis, an example and the supported wikis."""
-    wiki_tree = Tree("[green]Fandom Wikis")
-    for wiki in SUPPORTED_WIKIS:
-        wiki_tree.add(wiki)
-
-    console.print("Usage:\n\tarchive-fandom-wiki [[italic]name[/italic]]\n")
-    console.print("Example:\n\tarchive-fandom-wiki dishonored\n")
-    console.print(wiki_tree)
-
-
-if __name__ == "__main__":
-    # Only the first argument is looked at; a missing or unsupported name
-    # gets the usage text.
-    if len(sys.argv) > 1 and sys.argv[1] in SUPPORTED_WIKIS:
-        archive_site(sys.argv[1])
-    else:
-        usage_message()