Jeffrey Serio 2024-07-18 02:40:18 -05:00
parent 310219bf35
commit b0498baf8a
3 changed files with 94 additions and 216 deletions

View File

@@ -15,6 +15,12 @@ source venv/bin/activate
pip install -r requirements.txt
#+end_src
If not using venv:
#+begin_src bash
pip install --user -r requirements.txt
ln -sf "$(pwd)/afw.py" ~/.local/bin/afw
#+end_src
** Usage
One may specify the URL of a BreezeWiki instance; if none is given, the default (my BreezeWiki instance, https://breezewiki.nirn.quest) is used.
#+begin_src bash
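# Illustrative invocations (not from the original README); the first relies on
# the default instance, the second mirrors the script's docstring example and
# supplies an explicit instance URL:
afw dishonored
afw dishonored https://breezewiki.nirn.quest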

View File

@@ -3,12 +3,11 @@
"""archive-fandom-wiki
Usage:
afw <fandom> <breezewiki_instance>
afw <fandom>
afw <fandom> [<breezewiki_instance>]
afw -h
Options:
-h --help Show this help message.
-v --version Show version.
Examples:
afw dishonored https://breezewiki.nirn.quest
@@ -16,6 +15,10 @@ Examples:
"""
# This file is formatted with `black -l 79' to comply with PEP8 standards.
#
# urllib.request.urlopen is used instead of the requests library because I
# ran into URL quoting issues with requests that do not occur with urlopen.
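#
# A minimal sketch (illustration only, not part of this file) of the
# fetch-and-parse pattern the methods below follow:
#
#     from urllib.request import urlopen
#     from bs4 import BeautifulSoup
#
#     with urlopen("https://dishonored.fandom.com/wiki/Local_Sitemap") as response:
#         soup = BeautifulSoup(response.read().decode("utf-8"), "html.parser")
#
# urlopen is handed the already-encoded URLs as-is, with no re-quoting.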
import concurrent.futures
import shutil
@@ -24,51 +27,38 @@ import sys
from docopt import docopt
sys.tracebacklimit = 0
import tarfile
from datetime import datetime
from pathlib import Path
from urllib.error import HTTPError, URLError
from urllib.parse import urljoin
from urllib.request import urlopen, urlretrieve
import requests
from bs4 import BeautifulSoup
from rich.console import Console
from rich.progress import Progress
console = Console()
class FandomWiki:
def __init__(self, name: str, breezewiki_url: str):
def __init__(self, name: str):
self.name = name
self.canonical_url = f"https://{name}.fandom.com"
self.breezewiki_url = breezewiki_url
self.breezewiki_url = "https://breezewiki.nirn.quest"
self.site_dir = Path(f"{name}.fandom.com")
self.images_dir = self.site_dir.joinpath("images")
try:
response = requests.get(self.canonical_url)
response.raise_for_status()
except requests.HTTPError as http_err:
console.print("Oops. Something went wrong. Likely one of the following:\n")
console.print("(1) The wiki you requested does not exist.")
console.print("(2) You typed the name of the wiki incorrectly.")
console.print("(3) The server hosting that wiki is down for some reason.\n")
console.print(f"HTTP error: {http_err}")
else:
if not self.site_dir.exists():
self.site_dir.mkdir()
if not self.images_dir.exists():
self.images_dir.mkdir()
def set_breezewiki_url(self, breezewiki_url: str):
self.breezewiki_url = breezewiki_url
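# Illustration (not part of this file): with this refactor the class is driven
# the same way the __main__ block below does it, and the instance override is
# optional:
#
#     site = FandomWiki("dishonored")
#     site.set_breezewiki_url("https://breezewiki.nirn.quest")  # optional
#     site.archive_site()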
def get_hop0_urls(self) -> list:
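# Hop 0: the wiki's Local_Sitemap pages. Each sitemap page links to a batch of
# articles, and the mw-allpages-nav block is used to step to the next one.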
starting_url = f"{self.canonical_url}/wiki/Local_Sitemap"
hop0_urls = list()
while True:
response = requests.get(starting_url)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
with urlopen(starting_url) as response:
response_body = response.read()
decoded_body = response_body.decode("utf-8")
soup = BeautifulSoup(decoded_body, "html.parser")
mw_allpages_nav = soup.find_all("div", {"class": "mw-allpages-nav"})[0]
if (
@@ -91,9 +81,10 @@ class FandomWiki:
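# Hop 1: the /wiki/... links collected from the hop-0 sitemap pages, rewritten
# to point at the BreezeWiki instance instead of fandom.com.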
hop1_urls = [self.breezewiki_url]
for url in hop0_urls:
response = requests.get(url)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
with urlopen(url) as response:
response_body = response.read()
decoded_body = response_body.decode("utf-8")
soup = BeautifulSoup(decoded_body, "html.parser")
for item in soup.find_all("a"):
if item.get("href") and item.get("href").startswith("/wiki"):
@@ -107,9 +98,10 @@ class FandomWiki:
return hop1_urls
def save_css(self):
response = requests.get(self.breezewiki_url)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
with urlopen(self.breezewiki_url) as response:
response_body = response.read()
decoded_body = response_body.decode("utf-8")
soup = BeautifulSoup(decoded_body, "html.parser")
css_pages = list()
for css in soup.find_all("link"):
@@ -118,31 +110,27 @@ class FandomWiki:
css_pages.append(css_url)
for page in css_pages:
response = requests.get(page)
response.raise_for_status()
with urlopen(page) as response:
response_body = response.read()
decoded_body = response_body.decode("utf-8")
css_filename = self.site_dir.joinpath(f"proxy{css_pages.index(page)}")
with open(css_filename, "wb") as outfile:
outfile.write(response.content)
with open(css_filename, "w") as outfile:
outfile.write(decoded_body)
console.log(css_filename)
def save_img(self, img_url: str):
filename = self.images_dir.joinpath(Path(img_url.split("/revision")[0]).name)
if not filename.exists():
response = requests.get(img_url, stream=True)
response.raise_for_status()
with open(filename, "wb") as outfile:
for chunk in response.iter_content(chunk_size=8192):
outfile.write(chunk)
urlretrieve(img_url, filename)
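# urlretrieve downloads the image and writes it straight to filename.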
console.log(filename)
def fetch_all_images(self, page_url: str):
response = requests.get(page_url)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
with urlopen(page_url) as response:
response_body = response.read()
decoded_body = response_body.decode("utf-8")
soup = BeautifulSoup(decoded_body, "html.parser")
img_tags = soup.find_all("img")
img_urls = [img["src"] for img in img_tags if "src" in img.attrs]
@@ -156,9 +144,10 @@ class FandomWiki:
def save_page(self, url: str):
filename = self.site_dir.joinpath(f"{url.split('/')[-1]}.html")
if not filename.exists():
response = requests.get(url)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
with urlopen(url) as response:
response_body = response.read()
decoded_body = response_body.decode("utf-8")
soup = BeautifulSoup(decoded_body, "html.parser")
stylesheet_count = 0
for link in soup.find_all("link", {"rel": "stylesheet"}):
@@ -196,72 +185,60 @@ class FandomWiki:
with concurrent.futures.ThreadPoolExecutor() as executor:
executor.map(self.save_page, hop1_urls)
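# save_page runs on the pool's worker threads; leaving the `with` block waits
# for every page download to finish.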
def archive(self):
def archive_site(self):
try:
with urlopen(self.canonical_url) as response:
response.read()
except HTTPError as http_err:
console.print("Oops. Something went wrong. Likely one of the following:\n")
console.print("(1) The wiki you requested does not exist.")
console.print("(2) You mistyped the name of the wiki.")
console.print("(3) The server hosting that wiki is down for some reason.\n")
console.print(f"HTTP Error: {http_err}")
except URLError as url_err:
console.print(f"URL Error: {url_err}")
with console.status("Fetching hop 0 URLs...", spinner="aesthetic"):
hop0_urls = self.get_hop0_urls()
with console.status("Fetching hop 1 URLs...", spinner="aesthetic"):
hop1_urls = self.get_hop1_urls(hop0_urls)
self.site_dir.mkdir()
self.images_dir.mkdir()
with console.status("Saving CSS files...", spinner="aesthetic"):
self.save_css()
with console.status(
"Downloading web pages and/or images...", spinner="aesthetic"
):
self.fetch_all_pages(hop1_urls)
total_web_files = sum(1 for x in self.site_dir.iterdir() if x.is_file())
total_image_files = sum(1 for x in self.images_dir.iterdir() if x.is_file())
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
img_archive_filename = f"{self.images_dir}-{timestamp}.tar.xz"
num_of_imgs = sum(1 for img in self.images_dir.iterdir() if img.is_file())
with console.status("Archiving images...", spinner="aesthetic"):
shutil.make_archive(f"images-{timestamp}", "xztar", self.images_dir)
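# make_archive appends the ".tar.xz" suffix itself, so this yields
# images-<timestamp>.tar.xz, which is then moved into the site dir.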
shutil.rmtree(self.images_dir)
shutil.move(f"images-{timestamp}.tar.xz", self.site_dir)
with Progress() as progress:
task = progress.add_task("[cyan]Archiving images...", total=num_of_imgs)
with console.status("Archiving web files...", spinner="aesthetic"):
shutil.make_archive(f"{self.name}-{timestamp}", "gztar", self.site_dir)
with tarfile.open(img_archive_filename, "w:xz") as tar:
tar.add(self.images_dir)
progress.update(task, advance=1)
shutil.rmtree(self.site_dir)
progress.stop()
shutil.rmtree(self.images_dir, ignore_errors=True)
web_files = [
f
for f in self.site_dir.iterdir()
if self.site_dir.joinpath(f).is_file or self.site_dir.joinpath(f).is_dir()
]
web_archive_filename = f"{self.site_dir}-{timestamp}.tar.gz"
with Progress() as progress:
task = progress.add_task(
"[cyan]Archiving web files...", total=len(web_files)
)
with tarfile.open(web_archive_filename, "w:gz") as tar:
for web_file in web_files:
if progress.finished:
break
tar.add(web_file, arcname=web_file)
progress.update(task, advance=1)
progress.stop()
shutil.rmtree(self.site_dir, ignore_errors=True)
console.log(f"\nTotal web files scraped: {len(web_files)}")
console.log(f"Total images scraped: {num_of_imgs}")
def archive_site(name: str, breezewiki_url: str = "https://breezewiki.nirn.quest"):
site = FandomWiki(name, breezewiki_url)
with console.status("Fetching hop 0 URLs...", spinner="aesthetic"):
hop0_urls = site.get_hop0_urls()
with console.status("Fetching hop 1 URLs...", spinner="aesthetic"):
hop1_urls = site.get_hop1_urls(hop0_urls)
with console.status("Saving CSS files...", spinner="aesthetic"):
site.save_css()
with console.status("Downloading images and web pages...", spinner="aesthetic"):
site.fetch_all_pages(hop1_urls)
site.archive()
console.log(f"\nTotal web files archived: {total_web_files}")
console.log(f"Total images archived: {total_image_files}")
if __name__ == "__main__":
args = docopt(__doc__, options_first=True, help=True, version="1.0.1")
site = FandomWiki(args["<fandom>"])
if args["<breezewiki_instance>"]:
archive_site(args["<fandom>"], args["<breezewiki_instance>"])
else:
archive_site(args["<fandom>"])
site.set_breezewiki_url(args["<breezewiki_instance>"])
site.archive_site()

View File

@@ -1,108 +1,3 @@
beautifulsoup4==4.12.2 ; python_version >= "3.11" and python_version < "4.0" \
--hash=sha256:492bbc69dca35d12daac71c4db1bfff0c876c00ef4a2ffacce226d4638eb72da \
--hash=sha256:bd2520ca0d9d7d12694a53d44ac482d181b4ec1888909b035a3dbf40d0f57d4a
bs4==0.0.1 ; python_version >= "3.11" and python_version < "4.0" \
--hash=sha256:36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a
certifi==2023.7.22 ; python_version >= "3.11" and python_version < "4.0" \
--hash=sha256:539cc1d13202e33ca466e88b2807e29f4c13049d6d87031a3c110744495cb082 \
--hash=sha256:92d6037539857d8206b8f6ae472e8b77db8058fec5937a1ef3f54304089edbb9
charset-normalizer==3.2.0 ; python_version >= "3.11" and python_version < "4.0" \
--hash=sha256:04e57ab9fbf9607b77f7d057974694b4f6b142da9ed4a199859d9d4d5c63fe96 \
--hash=sha256:09393e1b2a9461950b1c9a45d5fd251dc7c6f228acab64da1c9c0165d9c7765c \
--hash=sha256:0b87549028f680ca955556e3bd57013ab47474c3124dc069faa0b6545b6c9710 \
--hash=sha256:1000fba1057b92a65daec275aec30586c3de2401ccdcd41f8a5c1e2c87078706 \
--hash=sha256:1249cbbf3d3b04902ff081ffbb33ce3377fa6e4c7356f759f3cd076cc138d020 \
--hash=sha256:1920d4ff15ce893210c1f0c0e9d19bfbecb7983c76b33f046c13a8ffbd570252 \
--hash=sha256:193cbc708ea3aca45e7221ae58f0fd63f933753a9bfb498a3b474878f12caaad \
--hash=sha256:1a100c6d595a7f316f1b6f01d20815d916e75ff98c27a01ae817439ea7726329 \
--hash=sha256:1f30b48dd7fa1474554b0b0f3fdfdd4c13b5c737a3c6284d3cdc424ec0ffff3a \
--hash=sha256:203f0c8871d5a7987be20c72442488a0b8cfd0f43b7973771640fc593f56321f \
--hash=sha256:246de67b99b6851627d945db38147d1b209a899311b1305dd84916f2b88526c6 \
--hash=sha256:2dee8e57f052ef5353cf608e0b4c871aee320dd1b87d351c28764fc0ca55f9f4 \
--hash=sha256:2efb1bd13885392adfda4614c33d3b68dee4921fd0ac1d3988f8cbb7d589e72a \
--hash=sha256:2f4ac36d8e2b4cc1aa71df3dd84ff8efbe3bfb97ac41242fbcfc053c67434f46 \
--hash=sha256:3170c9399da12c9dc66366e9d14da8bf7147e1e9d9ea566067bbce7bb74bd9c2 \
--hash=sha256:3b1613dd5aee995ec6d4c69f00378bbd07614702a315a2cf6c1d21461fe17c23 \
--hash=sha256:3bb3d25a8e6c0aedd251753a79ae98a093c7e7b471faa3aa9a93a81431987ace \
--hash=sha256:3bb7fda7260735efe66d5107fb7e6af6a7c04c7fce9b2514e04b7a74b06bf5dd \
--hash=sha256:41b25eaa7d15909cf3ac4c96088c1f266a9a93ec44f87f1d13d4a0e86c81b982 \
--hash=sha256:45de3f87179c1823e6d9e32156fb14c1927fcc9aba21433f088fdfb555b77c10 \
--hash=sha256:46fb8c61d794b78ec7134a715a3e564aafc8f6b5e338417cb19fe9f57a5a9bf2 \
--hash=sha256:48021783bdf96e3d6de03a6e39a1171ed5bd7e8bb93fc84cc649d11490f87cea \
--hash=sha256:4957669ef390f0e6719db3613ab3a7631e68424604a7b448f079bee145da6e09 \
--hash=sha256:5e86d77b090dbddbe78867a0275cb4df08ea195e660f1f7f13435a4649e954e5 \
--hash=sha256:6339d047dab2780cc6220f46306628e04d9750f02f983ddb37439ca47ced7149 \
--hash=sha256:681eb3d7e02e3c3655d1b16059fbfb605ac464c834a0c629048a30fad2b27489 \
--hash=sha256:6c409c0deba34f147f77efaa67b8e4bb83d2f11c8806405f76397ae5b8c0d1c9 \
--hash=sha256:7095f6fbfaa55defb6b733cfeb14efaae7a29f0b59d8cf213be4e7ca0b857b80 \
--hash=sha256:70c610f6cbe4b9fce272c407dd9d07e33e6bf7b4aa1b7ffb6f6ded8e634e3592 \
--hash=sha256:72814c01533f51d68702802d74f77ea026b5ec52793c791e2da806a3844a46c3 \
--hash=sha256:7a4826ad2bd6b07ca615c74ab91f32f6c96d08f6fcc3902ceeedaec8cdc3bcd6 \
--hash=sha256:7c70087bfee18a42b4040bb9ec1ca15a08242cf5867c58726530bdf3945672ed \
--hash=sha256:855eafa5d5a2034b4621c74925d89c5efef61418570e5ef9b37717d9c796419c \
--hash=sha256:8700f06d0ce6f128de3ccdbc1acaea1ee264d2caa9ca05daaf492fde7c2a7200 \
--hash=sha256:89f1b185a01fe560bc8ae5f619e924407efca2191b56ce749ec84982fc59a32a \
--hash=sha256:8b2c760cfc7042b27ebdb4a43a4453bd829a5742503599144d54a032c5dc7e9e \
--hash=sha256:8c2f5e83493748286002f9369f3e6607c565a6a90425a3a1fef5ae32a36d749d \
--hash=sha256:8e098148dd37b4ce3baca71fb394c81dc5d9c7728c95df695d2dca218edf40e6 \
--hash=sha256:94aea8eff76ee6d1cdacb07dd2123a68283cb5569e0250feab1240058f53b623 \
--hash=sha256:95eb302ff792e12aba9a8b8f8474ab229a83c103d74a750ec0bd1c1eea32e669 \
--hash=sha256:9bd9b3b31adcb054116447ea22caa61a285d92e94d710aa5ec97992ff5eb7cf3 \
--hash=sha256:9e608aafdb55eb9f255034709e20d5a83b6d60c054df0802fa9c9883d0a937aa \
--hash=sha256:a103b3a7069b62f5d4890ae1b8f0597618f628b286b03d4bc9195230b154bfa9 \
--hash=sha256:a386ebe437176aab38c041de1260cd3ea459c6ce5263594399880bbc398225b2 \
--hash=sha256:a38856a971c602f98472050165cea2cdc97709240373041b69030be15047691f \
--hash=sha256:a401b4598e5d3f4a9a811f3daf42ee2291790c7f9d74b18d75d6e21dda98a1a1 \
--hash=sha256:a7647ebdfb9682b7bb97e2a5e7cb6ae735b1c25008a70b906aecca294ee96cf4 \
--hash=sha256:aaf63899c94de41fe3cf934601b0f7ccb6b428c6e4eeb80da72c58eab077b19a \
--hash=sha256:b0dac0ff919ba34d4df1b6131f59ce95b08b9065233446be7e459f95554c0dc8 \
--hash=sha256:baacc6aee0b2ef6f3d308e197b5d7a81c0e70b06beae1f1fcacffdbd124fe0e3 \
--hash=sha256:bf420121d4c8dce6b889f0e8e4ec0ca34b7f40186203f06a946fa0276ba54029 \
--hash=sha256:c04a46716adde8d927adb9457bbe39cf473e1e2c2f5d0a16ceb837e5d841ad4f \
--hash=sha256:c0b21078a4b56965e2b12f247467b234734491897e99c1d51cee628da9786959 \
--hash=sha256:c1c76a1743432b4b60ab3358c937a3fe1341c828ae6194108a94c69028247f22 \
--hash=sha256:c4983bf937209c57240cff65906b18bb35e64ae872da6a0db937d7b4af845dd7 \
--hash=sha256:c4fb39a81950ec280984b3a44f5bd12819953dc5fa3a7e6fa7a80db5ee853952 \
--hash=sha256:c57921cda3a80d0f2b8aec7e25c8aa14479ea92b5b51b6876d975d925a2ea346 \
--hash=sha256:c8063cf17b19661471ecbdb3df1c84f24ad2e389e326ccaf89e3fb2484d8dd7e \
--hash=sha256:ccd16eb18a849fd8dcb23e23380e2f0a354e8daa0c984b8a732d9cfaba3a776d \
--hash=sha256:cd6dbe0238f7743d0efe563ab46294f54f9bc8f4b9bcf57c3c666cc5bc9d1299 \
--hash=sha256:d62e51710986674142526ab9f78663ca2b0726066ae26b78b22e0f5e571238dd \
--hash=sha256:db901e2ac34c931d73054d9797383d0f8009991e723dab15109740a63e7f902a \
--hash=sha256:e03b8895a6990c9ab2cdcd0f2fe44088ca1c65ae592b8f795c3294af00a461c3 \
--hash=sha256:e1c8a2f4c69e08e89632defbfabec2feb8a8d99edc9f89ce33c4b9e36ab63037 \
--hash=sha256:e4b749b9cc6ee664a3300bb3a273c1ca8068c46be705b6c31cf5d276f8628a94 \
--hash=sha256:e6a5bf2cba5ae1bb80b154ed68a3cfa2fa00fde979a7f50d6598d3e17d9ac20c \
--hash=sha256:e857a2232ba53ae940d3456f7533ce6ca98b81917d47adc3c7fd55dad8fab858 \
--hash=sha256:ee4006268ed33370957f55bf2e6f4d263eaf4dc3cfc473d1d90baff6ed36ce4a \
--hash=sha256:eef9df1eefada2c09a5e7a40991b9fc6ac6ef20b1372abd48d2794a316dc0449 \
--hash=sha256:f058f6963fd82eb143c692cecdc89e075fa0828db2e5b291070485390b2f1c9c \
--hash=sha256:f25c229a6ba38a35ae6e25ca1264621cc25d4d38dca2942a7fce0b67a4efe918 \
--hash=sha256:f2a1d0fd4242bd8643ce6f98927cf9c04540af6efa92323e9d3124f57727bfc1 \
--hash=sha256:f7560358a6811e52e9c4d142d497f1a6e10103d3a6881f18d04dbce3729c0e2c \
--hash=sha256:f779d3ad205f108d14e99bb3859aa7dd8e9c68874617c72354d7ecaec2a054ac \
--hash=sha256:f87f746ee241d30d6ed93969de31e5ffd09a2961a051e60ae6bddde9ec3583aa
idna==3.4 ; python_version >= "3.11" and python_version < "4.0" \
--hash=sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4 \
--hash=sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2
markdown-it-py==3.0.0 ; python_version >= "3.11" and python_version < "4.0" \
--hash=sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1 \
--hash=sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb
mdurl==0.1.2 ; python_version >= "3.11" and python_version < "4.0" \
--hash=sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8 \
--hash=sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba
pygments==2.16.1 ; python_version >= "3.11" and python_version < "4.0" \
--hash=sha256:13fc09fa63bc8d8671a6d247e1eb303c4b343eaee81d861f3404db2935653692 \
--hash=sha256:1daff0494820c69bc8941e407aa20f577374ee88364ee10a98fdbe0aece96e29
requests==2.31.0 ; python_version >= "3.11" and python_version < "4.0" \
--hash=sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f \
--hash=sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1
rich==13.5.2 ; python_version >= "3.11" and python_version < "4.0" \
--hash=sha256:146a90b3b6b47cac4a73c12866a499e9817426423f57c5a66949c086191a8808 \
--hash=sha256:fb9d6c0a0f643c99eed3875b5377a184132ba9be4d61516a55273d3554d75a39
soupsieve==2.5 ; python_version >= "3.11" and python_version < "4.0" \
--hash=sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690 \
--hash=sha256:eaa337ff55a1579b6549dc679565eac1e3d000563bcb1c8ab0d0fefbc0c2cdc7
urllib3==2.0.4 ; python_version >= "3.11" and python_version < "4.0" \
--hash=sha256:8d22f86aae8ef5e410d4f539fde9ce6b2113a001bb4d189e0aed70642d602b11 \
--hash=sha256:de7df1803967d2c2a98e4b11bb7d6bd9210474c46e8a0401514e3a42a75ebde4
beautifulsoup4==4.12.3
docopt==0.6.2
rich==13.7.1