#!/usr/bin/env python3

"""archive-fandom-wiki

Usage:
    afw -f <fandom> [-w <max_workers>] [-b <breezewiki_url>]
    afw -h

Options:
    -f <fandom>          The fandom to archive. (Required)
    -w <max_workers>     The maximum number of workers to use for concurrent threads. (Optional; Default is 4)
    -b <breezewiki_url>  The URL of the BreezeWiki instance to use. (Optional; Default is https://breezewiki.hyperreal.coffee)
    -h --help            Show this help message.

Examples:
    afw -f dishonored -w 16 -b https://breezewiki.hyperreal.coffee
    afw -f residentevil
"""

# This file is formatted with `black -l 79' to comply with PEP8 standards.
#
# urllib.urlopen is used instead of the requests library because I ran
# into URL quoting issues when using requests that are not a problem when
# using urllib.urlopen.

import concurrent.futures
import shutil
import sys
from datetime import datetime
from pathlib import Path
from urllib.error import HTTPError, URLError
from urllib.parse import urljoin
from urllib.request import urlopen, urlretrieve

from bs4 import BeautifulSoup
from docopt import docopt
from rich.console import Console

sys.tracebacklimit = 0

console = Console()


class FandomWiki:
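    """Archive a Fandom wiki as static HTML via a BreezeWiki instance."""
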
    def __init__(self, name: str):
        self.name = name
        self.canonical_name = f"{name}.fandom.com"
        self.canonical_url = f"https://{self.canonical_name}"
        self.breezewiki_url = "https://breezewiki.hyperreal.coffee"
        self.archive_rootdir = Path.cwd()
        self.site_dir = self.archive_rootdir.joinpath(f"{self.canonical_name}")
        self.images_dir = self.site_dir.joinpath("images")

    def set_breezewiki_url(self, breezewiki_url: str):
        self.breezewiki_url = breezewiki_url

    def get_hop0_urls(self) -> list:
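        """Walk the wiki's Local_Sitemap pagination ("hop 0") and return the
        list of sitemap page URLs found by following the "Next page" links."""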
        starting_url = f"{self.canonical_url}/wiki/Local_Sitemap"
        hop0_urls = list()

        while True:
            with urlopen(starting_url) as response:
                response_body = response.read()
                decoded_body = response_body.decode("utf-8")
                soup = BeautifulSoup(decoded_body, "html.parser")
            mw_allpages_nav = soup.find_all("div", {"class": "mw-allpages-nav"})[0]

            if (
                len(mw_allpages_nav.find_all("a")) < 2
                and "Next page" not in mw_allpages_nav.find_all("a")[0].get_text()
            ):
                break
            else:
                if len(mw_allpages_nav.find_all("a")) < 2:
                    starting_url = f"{self.canonical_url}{mw_allpages_nav.find_all('a')[0].get('href')}"
                else:
                    starting_url = f"{self.canonical_url}{mw_allpages_nav.find_all('a')[1].get('href')}"

            hop0_urls.append(starting_url)
            console.log(starting_url)

        return hop0_urls

    def get_hop1_urls(self, hop0_urls: list):
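        """Collect the BreezeWiki article URLs ("hop 1") linked from each
        sitemap page, skipping Local_Sitemap and Special: links."""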
        hop1_urls = [self.breezewiki_url]

        for url in hop0_urls:
            with urlopen(url) as response:
                response_body = response.read()
                decoded_body = response_body.decode("utf-8")
                soup = BeautifulSoup(decoded_body, "html.parser")

            for item in soup.find_all("a"):
                if item.get("href") and item.get("href").startswith("/wiki"):
                    if "Local_Sitemap" not in item.get(
                        "href"
                    ) and "Special:" not in item.get("href"):
                        new_url = f"{self.breezewiki_url}/{self.name}{item.get('href')}"
                        hop1_urls.append(new_url)
                        console.log(new_url)

        return hop1_urls

    def save_css(self):
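        """Download the BreezeWiki instance's stylesheets into the site
        directory as proxy<N>.css files, which the saved pages reference."""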
        with urlopen(self.breezewiki_url) as response:
            response_body = response.read()
            decoded_body = response_body.decode("utf-8")
            soup = BeautifulSoup(decoded_body, "html.parser")

        css_pages = list()

        for css in soup.find_all("link"):
            if css.attrs.get("href") and ".css" in css.attrs.get("href"):
                css_url = urljoin(self.breezewiki_url, css.attrs.get("href"))
                css_pages.append(css_url)

        for page in css_pages:
            with urlopen(page) as response:
                response_body = response.read()
                decoded_body = response_body.decode("utf-8")

            # Name the file proxy<N>.css to match the hrefs written in save_page().
            css_filename = self.site_dir.joinpath(f"proxy{css_pages.index(page)}.css")
            with open(css_filename, "w") as outfile:
                outfile.write(decoded_body)

            console.log(css_filename)

    def save_img(self, img_url: str):
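        """Save a single image into the images directory, skipping files that
        already exist. The Fandom "/revision/..." suffix is stripped to
        recover the original filename."""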
        filename = self.images_dir.joinpath(Path(img_url.split("/revision")[0]).name)
        if not filename.exists():
            urlretrieve(img_url, filename)
            console.log(filename)

    def fetch_all_images(self, page_url: str):
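        """Download every image referenced by a page, ignoring BreezeWiki UI
        assets and wiki wordmark images."""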
        with urlopen(page_url) as response:
            response_body = response.read()
            decoded_body = response_body.decode("utf-8")
            soup = BeautifulSoup(decoded_body, "html.parser")

        img_tags = soup.find_all("img")
        img_urls = [img["src"] for img in img_tags if "src" in img.attrs]
        clean_img_urls = [
            x for x in img_urls if "breezewiki" not in x and "Wordmark" not in x
        ]

        for img_url in clean_img_urls:
            self.save_img(img_url)

    def save_page(self, url: str):
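        """Save one article as a local HTML file: swap its remote stylesheets
        for the local proxy<N>.css copies, download its images, drop the
        BreezeWiki top banner, and rewrite article links to point at the
        archived pages."""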
|
2023-09-16 18:48:54 +02:00
|
|
|
filename = self.site_dir.joinpath(f"{url.split('/')[-1]}.html")
|
2023-09-15 03:36:56 +02:00
|
|
|
if not filename.exists():
|
2024-07-18 09:40:18 +02:00
|
|
|
with urlopen(url) as response:
|
|
|
|
response_body = response.read()
|
|
|
|
decoded_body = response_body.decode("utf-8")
|
|
|
|
soup = BeautifulSoup(decoded_body, "html.parser")
|
2023-09-15 03:36:56 +02:00
|
|
|
|
|
|
|
stylesheet_count = 0
|
|
|
|
for link in soup.find_all("link", {"rel": "stylesheet"}):
|
|
|
|
stylesheet_count += 1
|
|
|
|
link.decompose()
|
|
|
|
|
|
|
|
for i in range(stylesheet_count):
|
|
|
|
if soup.head:
|
|
|
|
soup.head.append(
|
|
|
|
soup.new_tag(
|
|
|
|
"link",
|
|
|
|
rel="stylesheet",
|
|
|
|
type="text/css",
|
2023-09-16 18:48:54 +02:00
|
|
|
href=f"proxy{i}.css",
|
2023-09-15 03:36:56 +02:00
|
|
|
)
|
|
|
|
)
|
|
|
|
|
|
|
|
self.fetch_all_images(url)
|
|
|
|
|
|
|
|
soup.find("div", {"class": "bw-top-banner"}).extract() # type: ignore
|
|
|
|
|
|
|
|
for link in soup.find_all("a"):
|
|
|
|
if link.get("href") and link.get("href").startswith(
|
2023-09-16 18:48:54 +02:00
|
|
|
f"/{self.name}/wiki"
|
2023-09-15 03:36:56 +02:00
|
|
|
):
|
|
|
|
link_basename = link.get("href").partition("/wiki/")[2]
|
2023-09-16 18:48:54 +02:00
|
|
|
link["href"] = f"{self.site_dir}/{link_basename}.html"
|
2023-09-15 03:36:56 +02:00
|
|
|
|
|
|
|
with open(filename, "w") as outfile:
|
|
|
|
outfile.write(soup.prettify())
|
|
|
|
|
2023-09-16 22:41:09 +02:00
|
|
|
console.log(filename)

    def fetch_all_pages(self, hop1_urls: list):
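        """Save all hop 1 pages concurrently. The worker count comes from the
        module-level docopt args (-w) and defaults to 4."""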
        max_workers = int(args["-w"]) if args["-w"] else 4
        with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers
        ) as executor:
            executor.map(self.save_page, hop1_urls)

    def archive_site(self):
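        """Run the full pipeline: verify the wiki is reachable, gather the
        hop 0 and hop 1 URLs, download CSS, pages, and images, then pack
        everything into .tar.gz and .zip archives and remove the working
        directory."""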
        try:
            with urlopen(self.canonical_url) as response:
                response.read()
        except HTTPError as http_err:
            console.print("Oops. Something went wrong. Likely one of the following:\n")
            console.print("(1) The wiki you requested does not exist.")
            console.print("(2) You mistyped the name of the wiki.")
            console.print("(3) The server hosting that wiki is down for some reason.\n")
            console.print(f"HTTP Error: {http_err}")
            return  # Nothing to archive if the wiki cannot be reached.
        except URLError as url_err:
            console.print(f"URL Error: {url_err}")
            return

        with console.status("Fetching hop 0 URLs...", spinner="aesthetic"):
            hop0_urls = self.get_hop0_urls()

        with console.status("Fetching hop 1 URLs...", spinner="aesthetic"):
            hop1_urls = self.get_hop1_urls(hop0_urls)

        # Creates the parent dirs: self.archive_rootdir > self.site_dir > self.images_dir
        self.images_dir.mkdir(parents=True)

        with console.status("Saving CSS files...", spinner="aesthetic"):
            self.save_css()

        with console.status(
            "Downloading web pages and/or images...", spinner="aesthetic"
        ):
            self.fetch_all_pages(hop1_urls)

        total_web_files = sum(1 for x in self.site_dir.iterdir() if x.is_file())
        total_image_files = sum(1 for x in self.images_dir.iterdir() if x.is_file())

        timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")

        with console.status("tar.gzipping downloaded files...", spinner="aesthetic"):
            shutil.make_archive(
                f"{self.name}-{timestamp}",
                "gztar",
                root_dir=self.archive_rootdir,
                base_dir=self.canonical_name,
            )

        with console.status("zipping downloaded files...", spinner="aesthetic"):
            shutil.make_archive(
                f"{self.name}-{timestamp}",
                "zip",
                root_dir=self.archive_rootdir,
                base_dir=self.canonical_name,
            )

        shutil.rmtree(self.site_dir)

        console.log(f"\nTotal web files archived: {total_web_files}")
        console.log(f"Total images archived: {total_image_files}")


if __name__ == "__main__":
    args = docopt(__doc__, options_first=True, help=True, version="1.0.1")  # type: ignore

    site = FandomWiki(args["-f"])
    if args["-b"]:
        site.set_breezewiki_url(args["-b"])

    site.archive_site()