archive-fandom-wiki/afw

268 lines
9.0 KiB
Plaintext
Raw Normal View History

2023-09-15 03:36:56 +02:00
#!/usr/bin/env python
"""archive-fandom-wiki
Usage:
afw <fandom> <breezewiki_instance>
afw <fandom>
Options:
-h --help Show this help message.
-v --version Show version.
Examples:
afw dishonored https://breezewiki.nirn.quest
afw residentevil
"""
2023-09-15 03:36:56 +02:00
# This file is formatted with `black -l 79' to comply with PEP8 standards.
import concurrent.futures
import shutil
import sys
2023-09-16 00:48:20 +02:00
from docopt import docopt
2023-09-16 00:48:20 +02:00
sys.tracebacklimit = 0
2023-09-15 03:36:56 +02:00
import tarfile
from datetime import datetime
from pathlib import Path
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
from rich.console import Console
from rich.progress import Progress
console = Console()
class FandomWiki:
    """Scrape one Fandom wiki through a BreezeWiki instance and archive it.

    Attributes (all set in ``__init__``):
        name: Wiki name as used in ``https://<name>.fandom.com``.
        canonical_url: ``https://<name>.fandom.com``.
        breezewiki_url: Base URL of the BreezeWiki instance pages are
            downloaded from.
        site_dir: Local output directory, ``<name>.fandom.com``.
        images_dir: ``site_dir / "images"``, where images are stored.
    """

    def __init__(self, name: str, breezewiki_url: str):
        """Record the wiki's URLs/paths and create the output directories.

        The canonical Fandom URL is requested once as a reachability check.
        On an HTTP error status an explanation is printed and the output
        directories are NOT created.

        NOTE(review): the HTTP error is reported but not re-raised, so the
        caller keeps going with a half-initialised object -- confirm
        whether the program should stop here instead.
        """
        self.name = name
        self.canonical_url = f"https://{name}.fandom.com"
        self.breezewiki_url = breezewiki_url
        self.site_dir = Path(f"{name}.fandom.com")
        self.images_dir = self.site_dir.joinpath("images")

        try:
            response = requests.get(self.canonical_url)
            response.raise_for_status()
        except requests.HTTPError as http_err:
            console.print(
                "Oops. Something went wrong. "
                "Likely one of the following:\n"
            )
            console.print("(1) The wiki you requested does not exist.")
            console.print("(2) You typed the name of the wiki incorrectly.")
            console.print(
                "(3) The server hosting that wiki is down "
                "for some reason.\n"
            )
            console.print(f"HTTP error: {http_err}")
        else:
            # exist_ok avoids the check-then-create race of
            # `if not exists(): mkdir()`.
            self.site_dir.mkdir(exist_ok=True)
            self.images_dir.mkdir(exist_ok=True)

    def get_hop0_urls(self) -> list:
        """Return the URL of every ``Local_Sitemap`` page of the wiki.

        Starts at ``<canonical_url>/wiki/Local_Sitemap`` and follows the
        "Next page" link found in the ``mw-allpages-nav`` div until there
        is none. The first sitemap page is part of the result, so the
        articles it lists are not skipped.

        Raises:
            requests.HTTPError: if a sitemap page returns an error status.
        """
        current_url = f"{self.canonical_url}/wiki/Local_Sitemap"
        hop0_urls = []

        while True:
            hop0_urls.append(current_url)
            console.log(current_url)

            response = requests.get(current_url)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")

            nav = soup.find("div", {"class": "mw-allpages-nav"})
            nav_links = nav.find_all("a") if nav else []
            if not nav_links:
                # No pagination block: the sitemap has a single page.
                break

            if len(nav_links) >= 2:
                # Two links: the second one is taken as "Next page".
                next_link = nav_links[1]
            elif "Next page" in nav_links[0].get_text():
                next_link = nav_links[0]
            else:
                # The only link is not "Next page": this is the last page.
                break

            current_url = f"{self.canonical_url}{next_link.get('href')}"

        return hop0_urls

    def get_hop1_urls(self, hop0_urls: list):
        """Collect the BreezeWiki URLs of all articles on the sitemap pages.

        Every ``/wiki...`` link on each page in ``hop0_urls`` is mapped to
        ``<breezewiki_url>/<name><href>``, except links containing
        ``Local_Sitemap`` or ``Special:``. The list starts with
        ``breezewiki_url`` itself. Each URL appears only once, in first-seen
        order, so no two workers later write the same output file.

        Raises:
            requests.HTTPError: if a sitemap page returns an error status.
        """
        hop1_urls = [self.breezewiki_url]
        seen = set(hop1_urls)

        for url in hop0_urls:
            response = requests.get(url)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")

            for item in soup.find_all("a"):
                href = item.get("href")
                if not href or not href.startswith("/wiki"):
                    continue
                if "Local_Sitemap" in href or "Special:" in href:
                    continue

                new_url = f"{self.breezewiki_url}/{self.name}{href}"
                if new_url in seen:
                    continue
                seen.add(new_url)
                hop1_urls.append(new_url)
                console.log(new_url)

        return hop1_urls

    def save_css(self):
        """Download the BreezeWiki stylesheets into ``site_dir``.

        Every ``<link>`` on the BreezeWiki front page whose ``href``
        contains ``.css`` is saved as ``proxy<N>.css``, the file names
        that ``save_page`` writes into the archived pages.

        Raises:
            requests.HTTPError: if the front page or a stylesheet returns
                an error status.
        """
        response = requests.get(self.breezewiki_url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        css_pages = []
        for css in soup.find_all("link"):
            href = css.attrs.get("href")
            if href and ".css" in href:
                css_pages.append(urljoin(self.breezewiki_url, href))

        # enumerate() gives each stylesheet its own index even when the
        # same URL is listed twice (list.index() would reuse the first).
        for index, page in enumerate(css_pages):
            response = requests.get(page)
            response.raise_for_status()

            css_filename = self.site_dir.joinpath(f"proxy{index}.css")
            with open(css_filename, "wb") as outfile:
                outfile.write(response.content)

            console.log(css_filename)

    def save_img(self, img_url: str):
        """Download ``img_url`` into ``images_dir`` unless already present.

        The local file name is the last path component of the URL after
        cutting everything from ``/revision`` onwards.

        Raises:
            requests.HTTPError: if the image returns an error status.
        """
        image_name = Path(img_url.split("/revision")[0]).name
        filename = self.images_dir.joinpath(image_name)
        if filename.exists():
            return

        # The context manager releases the streamed connection even when
        # the download or the write fails.
        with requests.get(img_url, stream=True) as response:
            response.raise_for_status()
            with open(filename, "wb") as outfile:
                for chunk in response.iter_content(chunk_size=8192):
                    outfile.write(chunk)

        console.log(filename)

    def _save_images_from_soup(self, soup):
        """Save every ``<img src>`` in ``soup`` via ``save_img``.

        URLs containing ``breezewiki`` or ``Wordmark`` are skipped.
        """
        for img in soup.find_all("img"):
            img_url = img.attrs.get("src")
            if img_url is None:
                continue
            if "breezewiki" in img_url or "Wordmark" in img_url:
                continue
            self.save_img(img_url)

    def fetch_all_images(self, page_url: str):
        """Download ``page_url`` and save all images it references.

        Raises:
            requests.HTTPError: if the page or an image returns an error
                status.
        """
        response = requests.get(page_url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
        self._save_images_from_soup(soup)

    def save_page(self, url: str):
        """Save one BreezeWiki page as ``site_dir/<last URL segment>.html``.

        Does nothing if the file already exists. Otherwise the page's
        stylesheet links are replaced by links to ``proxy<N>.css``, its
        images are downloaded, the ``bw-top-banner`` div is removed when
        present, and links to other pages of this wiki are rewritten to
        local ``.html`` files.

        Raises:
            requests.HTTPError: if the page or an image returns an error
                status.
        """
        filename = self.site_dir.joinpath(f"{url.split('/')[-1]}.html")
        if filename.exists():
            return

        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        stylesheets = soup.find_all("link", {"rel": "stylesheet"})
        for link in stylesheets:
            link.decompose()

        if soup.head:
            for i in range(len(stylesheets)):
                soup.head.append(
                    soup.new_tag(
                        "link",
                        rel="stylesheet",
                        type="text/css",
                        href=f"proxy{i}.css",
                    )
                )

        # Reuse the parsed page rather than downloading it a second time
        # just to list its images.
        self._save_images_from_soup(soup)

        top_banner = soup.find("div", {"class": "bw-top-banner"})
        if top_banner is not None:
            top_banner.extract()

        wiki_prefix = f"/{self.name}/wiki"
        for link in soup.find_all("a"):
            href = link.get("href")
            if href and href.startswith(wiki_prefix):
                link_basename = href.partition("/wiki/")[2]
                # NOTE(review): pages are written inside site_dir, so a
                # relative href that starts with site_dir may not resolve
                # from a saved page -- confirm the intended layout.
                link["href"] = f"{self.site_dir}/{link_basename}.html"

        # Explicit UTF-8 so non-ASCII page text does not depend on the
        # platform's default encoding.
        with open(filename, "w", encoding="utf-8") as outfile:
            outfile.write(soup.prettify())

        console.log(filename)

    def fetch_all_pages(self, hop1_urls: list):
        """Run ``save_page`` for every URL on a thread pool.

        A page that fails is logged and skipped; the other pages are
        still processed. (A bare ``executor.map`` whose results are never
        read would drop those errors without any trace.)
        """
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = {
                executor.submit(self.save_page, url): url
                for url in hop1_urls
            }
            for future in concurrent.futures.as_completed(futures):
                error = future.exception()
                if error is not None:
                    # markup=False: URLs and error text may contain
                    # square brackets.
                    console.log(
                        f"Failed to save {futures[future]}: {error}",
                        markup=False,
                    )

    def archive(self):
        """Pack the scraped files into timestamped tarballs and clean up.

        1. ``images_dir`` is written to ``<images_dir>-<timestamp>.tar.xz``
           and then deleted.
        2. Everything left in ``site_dir`` is written to
           ``<site_dir>-<timestamp>.tar.gz`` and ``site_dir`` is deleted.

        The image archive's path is inside ``site_dir``, so it ends up in
        the web archive and is counted among the web files.
        """
        timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")

        img_archive_filename = f"{self.images_dir}-{timestamp}.tar.xz"
        image_entries = sorted(self.images_dir.iterdir())
        num_of_imgs = sum(1 for img in image_entries if img.is_file())

        with Progress() as progress:
            task = progress.add_task(
                "[cyan]Archiving images...", total=num_of_imgs
            )
            with tarfile.open(img_archive_filename, "w:xz") as tar:
                # Add the directory entry alone, then its children one
                # at a time so the bar advances per image.
                tar.add(self.images_dir, recursive=False)
                for entry in image_entries:
                    tar.add(entry)
                    if entry.is_file():
                        progress.update(task, advance=1)

        shutil.rmtree(self.images_dir, ignore_errors=True)

        # iterdir() already yields paths prefixed with site_dir.
        web_files = sorted(
            entry
            for entry in self.site_dir.iterdir()
            if entry.is_file() or entry.is_dir()
        )
        web_archive_filename = f"{self.site_dir}-{timestamp}.tar.gz"

        with Progress() as progress:
            task = progress.add_task(
                "[cyan]Archiving web files...", total=len(web_files)
            )
            with tarfile.open(web_archive_filename, "w:gz") as tar:
                for web_file in web_files:
                    tar.add(web_file, arcname=web_file)
                    progress.update(task, advance=1)

        shutil.rmtree(self.site_dir, ignore_errors=True)

        console.log(f"\nTotal web files scraped: {len(web_files)}")
        console.log(f"Total images scraped: {num_of_imgs}")
2023-09-15 03:36:56 +02:00
def archive_site(
    name: str, breezewiki_url: str = "https://breezewiki.nirn.quest"
):
    """Scrape the Fandom wiki ``name`` via ``breezewiki_url`` and archive it.

    Runs the pipeline in order: sitemap URLs, article URLs, stylesheets,
    pages with their images, and finally the tarballs. A console status
    spinner is shown for each scraping stage.
    """
    wiki = FandomWiki(name, breezewiki_url)

    # Stage 1: URLs of the sitemap pages.
    with console.status("Fetching hop 0 URLs...", spinner="aesthetic"):
        sitemap_urls = wiki.get_hop0_urls()

    # Stage 2: URLs of the articles linked from those pages.
    with console.status("Fetching hop 1 URLs...", spinner="aesthetic"):
        article_urls = wiki.get_hop1_urls(sitemap_urls)

    # Stage 3: stylesheets referenced by the saved pages.
    with console.status("Saving CSS files...", spinner="aesthetic"):
        wiki.save_css()

    # Stage 4: the pages themselves, together with their images.
    with console.status(
        "Downloading images and web pages...", spinner="aesthetic"
    ):
        wiki.fetch_all_pages(article_urls)

    # Stage 5: pack the results; archive() draws its own progress bars.
    wiki.archive()
if __name__ == "__main__":
    # Parse the command line against the usage text in the module
    # docstring.
    cli_args = docopt(
        __doc__, options_first=True, help=True, version="1.0.1"
    )
    fandom_name = cli_args["<fandom>"]
    instance_url = cli_args["<breezewiki_instance>"]

    if not instance_url:
        # No instance given: archive_site() uses its default.
        archive_site(fandom_name)
    else:
        archive_site(fandom_name, instance_url)