From b0498baf8a5410b23dc97bfa4267ee3b492307ce Mon Sep 17 00:00:00 2001
From: Jeffrey Serio <23226432+hyperreal64@users.noreply.github.com>
Date: Thu, 18 Jul 2024 02:40:18 -0500
Subject: [PATCH] Refactor

---
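Note: every requests.get() call in afw.py is replaced with the same urllib
fetch pattern, per the URL-quoting comment added at the top of the file. A
minimal standalone sketch of that pattern (the wiki URL is only an example):

    from urllib.request import urlopen

    from bs4 import BeautifulSoup

    # Fetch one page the way the refactored methods do: read the raw bytes,
    # decode them as UTF-8, then hand the text to BeautifulSoup.
    with urlopen("https://dishonored.fandom.com/wiki/Local_Sitemap") as response:
        decoded_body = response.read().decode("utf-8")

    soup = BeautifulSoup(decoded_body, "html.parser")
    print(soup.title)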
 README.org       |   6 ++
 afw => afw.py    | 193 +++++++++++++++++++++--------------------------
 requirements.txt | 111 +--------------------------
 3 files changed, 94 insertions(+), 216 deletions(-)
 rename afw => afw.py (56%)

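The CLI now builds a FandomWiki object and calls its methods directly instead
of going through a module-level archive_site() helper. Roughly, an invocation
such as `afw dishonored https://breezewiki.nirn.quest` corresponds to the
sketch below; "dishonored" and the instance URL are examples, and this assumes
afw.py is importable as a module:

    from afw import FandomWiki

    site = FandomWiki("dishonored")
    site.set_breezewiki_url("https://breezewiki.nirn.quest")  # optional; defaults to breezewiki.nirn.quest
    site.archive_site()
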
diff --git a/README.org b/README.org
index b20a621..9e26bf3 100644
--- a/README.org
+++ b/README.org
@@ -15,6 +15,12 @@ source venv/bin/activate
 pip install -r requirements.txt
 #+end_src
 
+If not using venv:
+#+BEGIN_SRC bash
+pip install --user -r requirements.txt
+ln -sf "$(pwd)/afw.py" ~/.local/bin/afw
+#+END_SRC
+
 ** Usage
 One may specify the BreezeWiki instance URL, or the default value (my BreezeWiki instance URL) will be used.
 #+begin_src bash
diff --git a/afw b/afw.py
similarity index 56%
rename from afw
rename to afw.py
index 375bcfb..4daf65c 100755
--- a/afw
+++ b/afw.py
@@ -3,12 +3,11 @@
 """archive-fandom-wiki
 
 Usage:
-    afw <fandom> <breezewiki_url>
-    afw <fandom>
+    afw <fandom> [<breezewiki_url>]
+    afw -h
 
 Options:
     -h --help      Show this help message.
-    -v --version   Show version.
 
 Examples:
     afw dishonored https://breezewiki.nirn.quest
@@ -16,6 +15,10 @@ Examples:
 """
 
 # This file is formatted with `black -l 79' to comply with PEP8 standards.
+#
+# urllib.urlopen is used instead of the requests library because I ran
+# into URL quoting issues when using requests that are not a problem when
+# using urllib.urlopen.
 
 import concurrent.futures
 import shutil
@@ -24,51 +27,38 @@ import sys
 from docopt import docopt
 
 sys.tracebacklimit = 0
-import tarfile
 from datetime import datetime
 from pathlib import Path
+from urllib.error import HTTPError, URLError
 from urllib.parse import urljoin
+from urllib.request import urlopen, urlretrieve
 
-import requests
 from bs4 import BeautifulSoup
 from rich.console import Console
-from rich.progress import Progress
 
 console = Console()
 
 
 class FandomWiki:
-    def __init__(self, name: str, breezewiki_url: str):
+    def __init__(self, name: str):
         self.name = name
         self.canonical_url = f"https://{name}.fandom.com"
-        self.breezewiki_url = breezewiki_url
+        self.breezewiki_url = "https://breezewiki.nirn.quest"
         self.site_dir = Path(f"{name}.fandom.com")
         self.images_dir = self.site_dir.joinpath("images")
 
-        try:
-            response = requests.get(self.canonical_url)
-            response.raise_for_status()
-        except requests.HTTPError as http_err:
-            console.print("Oops. Something went wrong. Likely one of the following:\n")
-            console.print("(1) The wiki you requested does not exist.")
-            console.print("(2) You typed the name of the wiki incorrectly.")
-            console.print("(3) The server hosting that wiki is down for some reason.\n")
-            console.print(f"HTTP error: {http_err}")
-        else:
-            if not self.site_dir.exists():
-                self.site_dir.mkdir()
-
-            if not self.images_dir.exists():
-                self.images_dir.mkdir()
+    def set_breezewiki_url(self, breezewiki_url: str):
+        self.breezewiki_url = breezewiki_url
 
     def get_hop0_urls(self) -> list:
         starting_url = f"{self.canonical_url}/wiki/Local_Sitemap"
         hop0_urls = list()
 
         while True:
-            response = requests.get(starting_url)
-            response.raise_for_status()
-            soup = BeautifulSoup(response.text, "html.parser")
+            with urlopen(starting_url) as response:
+                response_body = response.read()
+                decoded_body = response_body.decode("utf-8")
+                soup = BeautifulSoup(decoded_body, "html.parser")
             mw_allpages_nav = soup.find_all("div", {"class": "mw-allpages-nav"})[0]
 
             if (
files...", spinner="aesthetic"): - site.save_css() - - with console.status("Downloading images and web pages...", spinner="aesthetic"): - site.fetch_all_pages(hop1_urls) - - site.archive() + console.log(f"\nTotal web files archived: {total_web_files}") + console.log(f"Total images archived: {total_image_files}") if __name__ == "__main__": args = docopt(__doc__, options_first=True, help=True, version="1.0.1") + + site = FandomWiki(args[""]) if args[""]: - archive_site(args[""], args[""]) - else: - archive_site(args[""]) + site.set_breezewiki_instance(args[""]) + + site.archive_site() diff --git a/requirements.txt b/requirements.txt index 82cdce8..bab64ef 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,108 +1,3 @@ -beautifulsoup4==4.12.2 ; python_version >= "3.11" and python_version < "4.0" \ - --hash=sha256:492bbc69dca35d12daac71c4db1bfff0c876c00ef4a2ffacce226d4638eb72da \ - --hash=sha256:bd2520ca0d9d7d12694a53d44ac482d181b4ec1888909b035a3dbf40d0f57d4a -bs4==0.0.1 ; python_version >= "3.11" and python_version < "4.0" \ - --hash=sha256:36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a -certifi==2023.7.22 ; python_version >= "3.11" and python_version < "4.0" \ - --hash=sha256:539cc1d13202e33ca466e88b2807e29f4c13049d6d87031a3c110744495cb082 \ - --hash=sha256:92d6037539857d8206b8f6ae472e8b77db8058fec5937a1ef3f54304089edbb9 -charset-normalizer==3.2.0 ; python_version >= "3.11" and python_version < "4.0" \ - --hash=sha256:04e57ab9fbf9607b77f7d057974694b4f6b142da9ed4a199859d9d4d5c63fe96 \ - --hash=sha256:09393e1b2a9461950b1c9a45d5fd251dc7c6f228acab64da1c9c0165d9c7765c \ - --hash=sha256:0b87549028f680ca955556e3bd57013ab47474c3124dc069faa0b6545b6c9710 \ - --hash=sha256:1000fba1057b92a65daec275aec30586c3de2401ccdcd41f8a5c1e2c87078706 \ - --hash=sha256:1249cbbf3d3b04902ff081ffbb33ce3377fa6e4c7356f759f3cd076cc138d020 \ - --hash=sha256:1920d4ff15ce893210c1f0c0e9d19bfbecb7983c76b33f046c13a8ffbd570252 \ - --hash=sha256:193cbc708ea3aca45e7221ae58f0fd63f933753a9bfb498a3b474878f12caaad \ - --hash=sha256:1a100c6d595a7f316f1b6f01d20815d916e75ff98c27a01ae817439ea7726329 \ - --hash=sha256:1f30b48dd7fa1474554b0b0f3fdfdd4c13b5c737a3c6284d3cdc424ec0ffff3a \ - --hash=sha256:203f0c8871d5a7987be20c72442488a0b8cfd0f43b7973771640fc593f56321f \ - --hash=sha256:246de67b99b6851627d945db38147d1b209a899311b1305dd84916f2b88526c6 \ - --hash=sha256:2dee8e57f052ef5353cf608e0b4c871aee320dd1b87d351c28764fc0ca55f9f4 \ - --hash=sha256:2efb1bd13885392adfda4614c33d3b68dee4921fd0ac1d3988f8cbb7d589e72a \ - --hash=sha256:2f4ac36d8e2b4cc1aa71df3dd84ff8efbe3bfb97ac41242fbcfc053c67434f46 \ - --hash=sha256:3170c9399da12c9dc66366e9d14da8bf7147e1e9d9ea566067bbce7bb74bd9c2 \ - --hash=sha256:3b1613dd5aee995ec6d4c69f00378bbd07614702a315a2cf6c1d21461fe17c23 \ - --hash=sha256:3bb3d25a8e6c0aedd251753a79ae98a093c7e7b471faa3aa9a93a81431987ace \ - --hash=sha256:3bb7fda7260735efe66d5107fb7e6af6a7c04c7fce9b2514e04b7a74b06bf5dd \ - --hash=sha256:41b25eaa7d15909cf3ac4c96088c1f266a9a93ec44f87f1d13d4a0e86c81b982 \ - --hash=sha256:45de3f87179c1823e6d9e32156fb14c1927fcc9aba21433f088fdfb555b77c10 \ - --hash=sha256:46fb8c61d794b78ec7134a715a3e564aafc8f6b5e338417cb19fe9f57a5a9bf2 \ - --hash=sha256:48021783bdf96e3d6de03a6e39a1171ed5bd7e8bb93fc84cc649d11490f87cea \ - --hash=sha256:4957669ef390f0e6719db3613ab3a7631e68424604a7b448f079bee145da6e09 \ - --hash=sha256:5e86d77b090dbddbe78867a0275cb4df08ea195e660f1f7f13435a4649e954e5 \ - --hash=sha256:6339d047dab2780cc6220f46306628e04d9750f02f983ddb37439ca47ced7149 \ - 
@@ -196,72 +185,60 @@
         with concurrent.futures.ThreadPoolExecutor() as executor:
             executor.map(self.save_page, hop1_urls)
 
-    def archive(self):
+    def archive_site(self):
+        try:
+            with urlopen(self.canonical_url) as response:
+                response.read()
+        except HTTPError as http_err:
+            console.print("Oops. Something went wrong. Likely one of the following:\n")
+            console.print("(1) The wiki you requested does not exist.")
+            console.print("(2) You mistyped the name of the wiki.")
+            console.print("(3) The server hosting that wiki is down for some reason.\n")
+            console.print(f"HTTP Error: {http_err}")
+        except URLError as url_err:
+            console.print(f"URL Error: {url_err}")
+
+        with console.status("Fetching hop 0 URLs...", spinner="aesthetic"):
+            hop0_urls = self.get_hop0_urls()
+
+        with console.status("Fetching hop 1 URLs...", spinner="aesthetic"):
+            hop1_urls = self.get_hop1_urls(hop0_urls)
+
+        self.site_dir.mkdir()
+        self.images_dir.mkdir()
+
+        with console.status("Saving CSS files...", spinner="aesthetic"):
+            self.save_css()
+
+        with console.status(
+            "Downloading web pages and/or images...", spinner="aesthetic"
+        ):
+            self.fetch_all_pages(hop1_urls)
+
+        total_web_files = sum(1 for x in self.site_dir.iterdir() if x.is_file())
+        total_image_files = sum(1 for x in self.images_dir.iterdir() if x.is_file())
+
         timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
-        img_archive_filename = f"{self.images_dir}-{timestamp}.tar.xz"
-        num_of_imgs = sum(1 for img in self.images_dir.iterdir() if img.is_file())
 
+        with console.status("Archiving images...", spinner="aesthetic"):
+            shutil.make_archive(f"images-{timestamp}", "xztar", self.images_dir)
+            shutil.rmtree(self.images_dir)
+            shutil.move(f"images-{timestamp}.tar.xz", self.site_dir)
 
-        with Progress() as progress:
-            task = progress.add_task("[cyan]Archiving images...", total=num_of_imgs)
+        with console.status("Archiving web files...", spinner="aesthetic"):
+            shutil.make_archive(f"{self.name}-{timestamp}", "gztar", self.site_dir)
 
-            with tarfile.open(img_archive_filename, "w:xz") as tar:
-                tar.add(self.images_dir)
-                progress.update(task, advance=1)
+        shutil.rmtree(self.site_dir)
 
-            progress.stop()
-
-        shutil.rmtree(self.images_dir, ignore_errors=True)
-
-        web_files = [
-            f
-            for f in self.site_dir.iterdir()
-            if self.site_dir.joinpath(f).is_file or self.site_dir.joinpath(f).is_dir()
-        ]
-
-        web_archive_filename = f"{self.site_dir}-{timestamp}.tar.gz"
-
-        with Progress() as progress:
-            task = progress.add_task(
-                "[cyan]Archiving web files...", total=len(web_files)
-            )
-
-            with tarfile.open(web_archive_filename, "w:gz") as tar:
-                for web_file in web_files:
-                    if progress.finished:
-                        break
-                    tar.add(web_file, arcname=web_file)
-                    progress.update(task, advance=1)
-
-            progress.stop()
-
-        shutil.rmtree(self.site_dir, ignore_errors=True)
-
-        console.log(f"\nTotal web files scraped: {len(web_files)}")
-        console.log(f"Total images scraped: {num_of_imgs}")
-
-
-def archive_site(name: str, breezewiki_url: str = "https://breezewiki.nirn.quest"):
-    site = FandomWiki(name, breezewiki_url)
-
-    with console.status("Fetching hop 0 URLs...", spinner="aesthetic"):
-        hop0_urls = site.get_hop0_urls()
-
-    with console.status("Fetching hop 1 URLs...", spinner="aesthetic"):
-        hop1_urls = site.get_hop1_urls(hop0_urls)
-
-    with console.status("Saving CSS files...", spinner="aesthetic"):
-        site.save_css()
-
-    with console.status("Downloading images and web pages...", spinner="aesthetic"):
-        site.fetch_all_pages(hop1_urls)
-
-    site.archive()
+        console.log(f"\nTotal web files archived: {total_web_files}")
+        console.log(f"Total images archived: {total_image_files}")
 
 
 if __name__ == "__main__":
     args = docopt(__doc__, options_first=True, help=True, version="1.0.1")
+
+    site = FandomWiki(args["<fandom>"])
     if args["<breezewiki_url>"]:
-        archive_site(args["<fandom>"], args["<breezewiki_url>"])
-    else:
-        archive_site(args["<fandom>"])
+        site.set_breezewiki_url(args["<breezewiki_url>"])
+
+    site.archive_site()
diff --git a/requirements.txt b/requirements.txt
index 82cdce8..bab64ef 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,108 +1,3 @@
-beautifulsoup4==4.12.2 ; python_version >= "3.11" and python_version < "4.0" \
-    --hash=sha256:492bbc69dca35d12daac71c4db1bfff0c876c00ef4a2ffacce226d4638eb72da \
-    --hash=sha256:bd2520ca0d9d7d12694a53d44ac482d181b4ec1888909b035a3dbf40d0f57d4a
-bs4==0.0.1 ; python_version >= "3.11" and python_version < "4.0" \
-    --hash=sha256:36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a
-certifi==2023.7.22 ; python_version >= "3.11" and python_version < "4.0" \
-    --hash=sha256:539cc1d13202e33ca466e88b2807e29f4c13049d6d87031a3c110744495cb082 \
-    --hash=sha256:92d6037539857d8206b8f6ae472e8b77db8058fec5937a1ef3f54304089edbb9
-charset-normalizer==3.2.0 ; python_version >= "3.11" and python_version < "4.0" \
-    --hash=sha256:04e57ab9fbf9607b77f7d057974694b4f6b142da9ed4a199859d9d4d5c63fe96 \
-    --hash=sha256:09393e1b2a9461950b1c9a45d5fd251dc7c6f228acab64da1c9c0165d9c7765c \
-    --hash=sha256:0b87549028f680ca955556e3bd57013ab47474c3124dc069faa0b6545b6c9710 \
-    --hash=sha256:1000fba1057b92a65daec275aec30586c3de2401ccdcd41f8a5c1e2c87078706 \
-    --hash=sha256:1249cbbf3d3b04902ff081ffbb33ce3377fa6e4c7356f759f3cd076cc138d020 \
-    --hash=sha256:1920d4ff15ce893210c1f0c0e9d19bfbecb7983c76b33f046c13a8ffbd570252 \
-    --hash=sha256:193cbc708ea3aca45e7221ae58f0fd63f933753a9bfb498a3b474878f12caaad \
-    --hash=sha256:1a100c6d595a7f316f1b6f01d20815d916e75ff98c27a01ae817439ea7726329 \
-    --hash=sha256:1f30b48dd7fa1474554b0b0f3fdfdd4c13b5c737a3c6284d3cdc424ec0ffff3a \
-    --hash=sha256:203f0c8871d5a7987be20c72442488a0b8cfd0f43b7973771640fc593f56321f \
-    --hash=sha256:246de67b99b6851627d945db38147d1b209a899311b1305dd84916f2b88526c6 \
-    --hash=sha256:2dee8e57f052ef5353cf608e0b4c871aee320dd1b87d351c28764fc0ca55f9f4 \
-    --hash=sha256:2efb1bd13885392adfda4614c33d3b68dee4921fd0ac1d3988f8cbb7d589e72a \
-    --hash=sha256:2f4ac36d8e2b4cc1aa71df3dd84ff8efbe3bfb97ac41242fbcfc053c67434f46 \
-    --hash=sha256:3170c9399da12c9dc66366e9d14da8bf7147e1e9d9ea566067bbce7bb74bd9c2 \
-    --hash=sha256:3b1613dd5aee995ec6d4c69f00378bbd07614702a315a2cf6c1d21461fe17c23 \
-    --hash=sha256:3bb3d25a8e6c0aedd251753a79ae98a093c7e7b471faa3aa9a93a81431987ace \
-    --hash=sha256:3bb7fda7260735efe66d5107fb7e6af6a7c04c7fce9b2514e04b7a74b06bf5dd \
-    --hash=sha256:41b25eaa7d15909cf3ac4c96088c1f266a9a93ec44f87f1d13d4a0e86c81b982 \
-    --hash=sha256:45de3f87179c1823e6d9e32156fb14c1927fcc9aba21433f088fdfb555b77c10 \
-    --hash=sha256:46fb8c61d794b78ec7134a715a3e564aafc8f6b5e338417cb19fe9f57a5a9bf2 \
-    --hash=sha256:48021783bdf96e3d6de03a6e39a1171ed5bd7e8bb93fc84cc649d11490f87cea \
-    --hash=sha256:4957669ef390f0e6719db3613ab3a7631e68424604a7b448f079bee145da6e09 \
-    --hash=sha256:5e86d77b090dbddbe78867a0275cb4df08ea195e660f1f7f13435a4649e954e5 \
-    --hash=sha256:6339d047dab2780cc6220f46306628e04d9750f02f983ddb37439ca47ced7149 \
-    --hash=sha256:681eb3d7e02e3c3655d1b16059fbfb605ac464c834a0c629048a30fad2b27489 \
-    --hash=sha256:6c409c0deba34f147f77efaa67b8e4bb83d2f11c8806405f76397ae5b8c0d1c9 \
-    --hash=sha256:7095f6fbfaa55defb6b733cfeb14efaae7a29f0b59d8cf213be4e7ca0b857b80 \
-    --hash=sha256:70c610f6cbe4b9fce272c407dd9d07e33e6bf7b4aa1b7ffb6f6ded8e634e3592 \
-    --hash=sha256:72814c01533f51d68702802d74f77ea026b5ec52793c791e2da806a3844a46c3 \
-    --hash=sha256:7a4826ad2bd6b07ca615c74ab91f32f6c96d08f6fcc3902ceeedaec8cdc3bcd6 \
-    --hash=sha256:7c70087bfee18a42b4040bb9ec1ca15a08242cf5867c58726530bdf3945672ed \
-    --hash=sha256:855eafa5d5a2034b4621c74925d89c5efef61418570e5ef9b37717d9c796419c \
-    --hash=sha256:8700f06d0ce6f128de3ccdbc1acaea1ee264d2caa9ca05daaf492fde7c2a7200 \
-    --hash=sha256:89f1b185a01fe560bc8ae5f619e924407efca2191b56ce749ec84982fc59a32a \
-    --hash=sha256:8b2c760cfc7042b27ebdb4a43a4453bd829a5742503599144d54a032c5dc7e9e \
-    --hash=sha256:8c2f5e83493748286002f9369f3e6607c565a6a90425a3a1fef5ae32a36d749d \
-    --hash=sha256:8e098148dd37b4ce3baca71fb394c81dc5d9c7728c95df695d2dca218edf40e6 \
-    --hash=sha256:94aea8eff76ee6d1cdacb07dd2123a68283cb5569e0250feab1240058f53b623 \
-    --hash=sha256:95eb302ff792e12aba9a8b8f8474ab229a83c103d74a750ec0bd1c1eea32e669 \
-    --hash=sha256:9bd9b3b31adcb054116447ea22caa61a285d92e94d710aa5ec97992ff5eb7cf3 \
-    --hash=sha256:9e608aafdb55eb9f255034709e20d5a83b6d60c054df0802fa9c9883d0a937aa \
-    --hash=sha256:a103b3a7069b62f5d4890ae1b8f0597618f628b286b03d4bc9195230b154bfa9 \
-    --hash=sha256:a386ebe437176aab38c041de1260cd3ea459c6ce5263594399880bbc398225b2 \
-    --hash=sha256:a38856a971c602f98472050165cea2cdc97709240373041b69030be15047691f \
-    --hash=sha256:a401b4598e5d3f4a9a811f3daf42ee2291790c7f9d74b18d75d6e21dda98a1a1 \
-    --hash=sha256:a7647ebdfb9682b7bb97e2a5e7cb6ae735b1c25008a70b906aecca294ee96cf4 \
-    --hash=sha256:aaf63899c94de41fe3cf934601b0f7ccb6b428c6e4eeb80da72c58eab077b19a \
-    --hash=sha256:b0dac0ff919ba34d4df1b6131f59ce95b08b9065233446be7e459f95554c0dc8 \
-    --hash=sha256:baacc6aee0b2ef6f3d308e197b5d7a81c0e70b06beae1f1fcacffdbd124fe0e3 \
-    --hash=sha256:bf420121d4c8dce6b889f0e8e4ec0ca34b7f40186203f06a946fa0276ba54029 \
-    --hash=sha256:c04a46716adde8d927adb9457bbe39cf473e1e2c2f5d0a16ceb837e5d841ad4f \
-    --hash=sha256:c0b21078a4b56965e2b12f247467b234734491897e99c1d51cee628da9786959 \
-    --hash=sha256:c1c76a1743432b4b60ab3358c937a3fe1341c828ae6194108a94c69028247f22 \
-    --hash=sha256:c4983bf937209c57240cff65906b18bb35e64ae872da6a0db937d7b4af845dd7 \
-    --hash=sha256:c4fb39a81950ec280984b3a44f5bd12819953dc5fa3a7e6fa7a80db5ee853952 \
-    --hash=sha256:c57921cda3a80d0f2b8aec7e25c8aa14479ea92b5b51b6876d975d925a2ea346 \
-    --hash=sha256:c8063cf17b19661471ecbdb3df1c84f24ad2e389e326ccaf89e3fb2484d8dd7e \
-    --hash=sha256:ccd16eb18a849fd8dcb23e23380e2f0a354e8daa0c984b8a732d9cfaba3a776d \
-    --hash=sha256:cd6dbe0238f7743d0efe563ab46294f54f9bc8f4b9bcf57c3c666cc5bc9d1299 \
-    --hash=sha256:d62e51710986674142526ab9f78663ca2b0726066ae26b78b22e0f5e571238dd \
-    --hash=sha256:db901e2ac34c931d73054d9797383d0f8009991e723dab15109740a63e7f902a \
-    --hash=sha256:e03b8895a6990c9ab2cdcd0f2fe44088ca1c65ae592b8f795c3294af00a461c3 \
-    --hash=sha256:e1c8a2f4c69e08e89632defbfabec2feb8a8d99edc9f89ce33c4b9e36ab63037 \
-    --hash=sha256:e4b749b9cc6ee664a3300bb3a273c1ca8068c46be705b6c31cf5d276f8628a94 \
-    --hash=sha256:e6a5bf2cba5ae1bb80b154ed68a3cfa2fa00fde979a7f50d6598d3e17d9ac20c \
-    --hash=sha256:e857a2232ba53ae940d3456f7533ce6ca98b81917d47adc3c7fd55dad8fab858 \
-    --hash=sha256:ee4006268ed33370957f55bf2e6f4d263eaf4dc3cfc473d1d90baff6ed36ce4a \
-    --hash=sha256:eef9df1eefada2c09a5e7a40991b9fc6ac6ef20b1372abd48d2794a316dc0449 \
-    --hash=sha256:f058f6963fd82eb143c692cecdc89e075fa0828db2e5b291070485390b2f1c9c \
-    --hash=sha256:f25c229a6ba38a35ae6e25ca1264621cc25d4d38dca2942a7fce0b67a4efe918 \
-    --hash=sha256:f2a1d0fd4242bd8643ce6f98927cf9c04540af6efa92323e9d3124f57727bfc1 \
-    --hash=sha256:f7560358a6811e52e9c4d142d497f1a6e10103d3a6881f18d04dbce3729c0e2c \
-    --hash=sha256:f779d3ad205f108d14e99bb3859aa7dd8e9c68874617c72354d7ecaec2a054ac \
-    --hash=sha256:f87f746ee241d30d6ed93969de31e5ffd09a2961a051e60ae6bddde9ec3583aa
-idna==3.4 ; python_version >= "3.11" and python_version < "4.0" \
-    --hash=sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4 \
-    --hash=sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2
-markdown-it-py==3.0.0 ; python_version >= "3.11" and python_version < "4.0" \
-    --hash=sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1 \
-    --hash=sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb
-mdurl==0.1.2 ; python_version >= "3.11" and python_version < "4.0" \
-    --hash=sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8 \
-    --hash=sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba
-pygments==2.16.1 ; python_version >= "3.11" and python_version < "4.0" \
-    --hash=sha256:13fc09fa63bc8d8671a6d247e1eb303c4b343eaee81d861f3404db2935653692 \
-    --hash=sha256:1daff0494820c69bc8941e407aa20f577374ee88364ee10a98fdbe0aece96e29
-requests==2.31.0 ; python_version >= "3.11" and python_version < "4.0" \
-    --hash=sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f \
-    --hash=sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1
-rich==13.5.2 ; python_version >= "3.11" and python_version < "4.0" \
-    --hash=sha256:146a90b3b6b47cac4a73c12866a499e9817426423f57c5a66949c086191a8808 \
-    --hash=sha256:fb9d6c0a0f643c99eed3875b5377a184132ba9be4d61516a55273d3554d75a39
-soupsieve==2.5 ; python_version >= "3.11" and python_version < "4.0" \
-    --hash=sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690 \
-    --hash=sha256:eaa337ff55a1579b6549dc679565eac1e3d000563bcb1c8ab0d0fefbc0c2cdc7
-urllib3==2.0.4 ; python_version >= "3.11" and python_version < "4.0" \
-    --hash=sha256:8d22f86aae8ef5e410d4f539fde9ce6b2113a001bb4d189e0aed70642d602b11 \
-    --hash=sha256:de7df1803967d2c2a98e4b11bb7d6bd9210474c46e8a0401514e3a42a75ebde4
+beautifulsoup4==4.12.3
+docopt==0.6.2
+rich==13.7.1
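
The two-stage archiving that replaces the old tarfile/Progress code boils down
to the following standalone sketch; "example.fandom.com" and the timestamp are
illustrative, and no wiki content is involved:

    import shutil
    from datetime import datetime
    from pathlib import Path

    site_dir = Path("example.fandom.com")
    images_dir = site_dir / "images"
    images_dir.mkdir(parents=True, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")

    # 1. Pack the images into an xz tarball, drop the raw directory, and move
    #    the tarball inside the site directory so it travels with the pages.
    shutil.make_archive(f"images-{timestamp}", "xztar", images_dir)
    shutil.rmtree(images_dir)
    shutil.move(f"images-{timestamp}.tar.xz", site_dir)

    # 2. Pack the whole site directory (pages, CSS, image tarball) as .tar.gz,
    #    then remove the working directory.
    shutil.make_archive(f"example-{timestamp}", "gztar", site_dir)
    shutil.rmtree(site_dir)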