archive-fandom-wiki/afw.py

#!/usr/bin/env python
"""archive-fandom-wiki

Usage:
    afw <fandom> [<breezewiki_instance>]
    afw -h

Options:
    -h --help    Show this help message.

Examples:
    afw dishonored https://breezewiki.nirn.quest
    afw residentevil
"""
# This file is formatted with `black -l 79' to comply with PEP8 standards.
#
# urllib.request.urlopen is used instead of the requests library because I
# ran into URL quoting issues with requests that do not occur when using
# urllib.request.urlopen.
import concurrent.futures
import shutil
import sys
from datetime import datetime
from pathlib import Path
from urllib.error import HTTPError, URLError
from urllib.parse import urljoin
from urllib.request import urlopen, urlretrieve

from bs4 import BeautifulSoup
from docopt import docopt
from rich.console import Console

# Hide tracebacks so users only see the printed error messages.
sys.tracebacklimit = 0

console = Console()
class FandomWiki:
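    """A Fandom wiki mirrored locally through a BreezeWiki instance."""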
def __init__(self, name: str):
self.name = name
self.canonical_url = f"https://{name}.fandom.com"
self.breezewiki_url = "https://breezewiki.nirn.quest"
self.site_dir = Path(f"{name}.fandom.com")
self.images_dir = self.site_dir.joinpath("images")
def set_breezewiki_url(self, breezewiki_url: str):
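        """Point the archiver at a different BreezeWiki instance."""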
self.breezewiki_url = breezewiki_url
def get_hop0_urls(self) -> list:
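        """Walk the Local_Sitemap pagination and collect the page URLs."""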
starting_url = f"{self.canonical_url}/wiki/Local_Sitemap"
hop0_urls = list()
while True:
with urlopen(starting_url) as response:
response_body = response.read()
decoded_body = response_body.decode("utf-8")
soup = BeautifulSoup(decoded_body, "html.parser")
mw_allpages_nav = soup.find_all("div", {"class": "mw-allpages-nav"})[0]
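                # No "Next page" link left: this is the last sitemap page.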
if (
len(mw_allpages_nav.find_all("a")) < 2
and "Next page" not in mw_allpages_nav.find_all("a")[0].get_text()
):
break
else:
if len(mw_allpages_nav.find_all("a")) < 2:
starting_url = f"{self.canonical_url}{mw_allpages_nav.find_all('a')[0].get('href')}"
else:
starting_url = f"{self.canonical_url}{mw_allpages_nav.find_all('a')[1].get('href')}"
hop0_urls.append(starting_url)
console.log(starting_url)
return hop0_urls
def get_hop1_urls(self, hop0_urls: list):
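        """Collect article URLs, rewritten to use the BreezeWiki instance."""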
hop1_urls = [self.breezewiki_url]
for url in hop0_urls:
with urlopen(url) as response:
response_body = response.read()
decoded_body = response_body.decode("utf-8")
soup = BeautifulSoup(decoded_body, "html.parser")
for item in soup.find_all("a"):
if item.get("href") and item.get("href").startswith("/wiki"):
if "Local_Sitemap" not in item.get(
"href"
) and "Special:" not in item.get("href"):
new_url = f"{self.breezewiki_url}/{self.name}{item.get('href')}"
hop1_urls.append(new_url)
console.log(new_url)
return hop1_urls
    def save_css(self):
        """Save BreezeWiki's stylesheets so archived pages render offline."""
        with urlopen(self.breezewiki_url) as response:
            response_body = response.read()
            decoded_body = response_body.decode("utf-8")
        soup = BeautifulSoup(decoded_body, "html.parser")
        css_pages = list()
        for css in soup.find_all("link"):
            if css.attrs.get("href") and ".css" in css.attrs.get("href"):
                css_url = urljoin(self.breezewiki_url, css.attrs.get("href"))
                css_pages.append(css_url)
        for index, page in enumerate(css_pages):
            with urlopen(page) as response:
                response_body = response.read()
                decoded_body = response_body.decode("utf-8")
            # Save as proxy<N>.css so save_page() can link to it.
            css_filename = self.site_dir.joinpath(f"proxy{index}.css")
            with open(css_filename, "w") as outfile:
                outfile.write(decoded_body)
            console.log(css_filename)
def save_img(self, img_url: str):
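        """Download a single image unless it has already been saved."""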
filename = self.images_dir.joinpath(Path(img_url.split("/revision")[0]).name)
if not filename.exists():
urlretrieve(img_url, filename)
console.log(filename)
def fetch_all_images(self, page_url: str):
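        """Download every image on a page, skipping BreezeWiki assets."""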
with urlopen(page_url) as response:
response_body = response.read()
decoded_body = response_body.decode("utf-8")
soup = BeautifulSoup(decoded_body, "html.parser")
img_tags = soup.find_all("img")
img_urls = [img["src"] for img in img_tags if "src" in img.attrs]
clean_img_urls = [
x for x in img_urls if "breezewiki" not in x and "Wordmark" not in x
]
for img_url in clean_img_urls:
self.save_img(img_url)
def save_page(self, url: str):
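        """Save a page as local HTML with offline styles and links."""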
filename = self.site_dir.joinpath(f"{url.split('/')[-1]}.html")
if not filename.exists():
with urlopen(url) as response:
response_body = response.read()
decoded_body = response_body.decode("utf-8")
soup = BeautifulSoup(decoded_body, "html.parser")
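                # Swap remote stylesheets for the local proxy<N>.css copies.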
stylesheet_count = 0
for link in soup.find_all("link", {"rel": "stylesheet"}):
stylesheet_count += 1
link.decompose()
for i in range(stylesheet_count):
if soup.head:
soup.head.append(
soup.new_tag(
"link",
rel="stylesheet",
type="text/css",
href=f"proxy{i}.css",
)
)
self.fetch_all_images(url)
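                # Drop the BreezeWiki top banner from the saved page.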
soup.find("div", {"class": "bw-top-banner"}).extract() # type: ignore
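                # Point internal wiki links at the locally saved HTML files.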
for link in soup.find_all("a"):
if link.get("href") and link.get("href").startswith(
f"/{self.name}/wiki"
):
link_basename = link.get("href").partition("/wiki/")[2]
link["href"] = f"{self.site_dir}/{link_basename}.html"
with open(filename, "w") as outfile:
outfile.write(soup.prettify())
console.log(filename)
def fetch_all_pages(self, hop1_urls: list):
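        """Save all pages concurrently using a thread pool."""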
with concurrent.futures.ThreadPoolExecutor() as executor:
executor.map(self.save_page, hop1_urls)
def archive_site(self):
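        """Download the whole wiki and pack it into compressed archives."""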
        try:
            with urlopen(self.canonical_url) as response:
                response.read()
        except HTTPError as http_err:
            console.print(
                "Oops. Something went wrong. Likely one of the following:\n"
            )
            console.print("(1) The wiki you requested does not exist.")
            console.print("(2) You mistyped the name of the wiki.")
            console.print(
                "(3) The server hosting that wiki is down for some reason.\n"
            )
            console.print(f"HTTP Error: {http_err}")
            # The wiki is unreachable, so there is nothing to archive.
            sys.exit(1)
        except URLError as url_err:
            console.print(f"URL Error: {url_err}")
            sys.exit(1)
with console.status("Fetching hop 0 URLs...", spinner="aesthetic"):
hop0_urls = self.get_hop0_urls()
with console.status("Fetching hop 1 URLs...", spinner="aesthetic"):
hop1_urls = self.get_hop1_urls(hop0_urls)
self.site_dir.mkdir()
self.images_dir.mkdir()
with console.status("Saving CSS files...", spinner="aesthetic"):
self.save_css()
with console.status(
"Downloading web pages and/or images...", spinner="aesthetic"
):
self.fetch_all_pages(hop1_urls)
total_web_files = sum(1 for x in self.site_dir.iterdir() if x.is_file())
total_image_files = sum(1 for x in self.images_dir.iterdir() if x.is_file())
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
with console.status("Archiving images...", spinner="aesthetic"):
shutil.make_archive(f"images-{timestamp}", "xztar", self.images_dir)
shutil.rmtree(self.images_dir)
shutil.move(f"images-{timestamp}.tar.xz", self.site_dir)
with console.status("Archiving web files...", spinner="aesthetic"):
shutil.make_archive(f"{self.name}-{timestamp}", "gztar", self.site_dir)
shutil.rmtree(self.site_dir)
console.log(f"\nTotal web files archived: {total_web_files}")
console.log(f"Total images archived: {total_image_files}")
if __name__ == "__main__":
args = docopt(__doc__, options_first=True, help=True, version="1.0.1")
site = FandomWiki(args["<fandom>"])
if args["<breezewiki_instance>"]:
        site.set_breezewiki_url(args["<breezewiki_instance>"])
site.archive_site()