Fix: allow specifying the BreezeWiki instance URL

This commit is contained in:
Jeffrey Serio 2024-04-25 16:37:18 -05:00
parent 14d0f4c725
commit 1a06ff4632
4 changed files with 31 additions and 57 deletions

View File

@ -1,40 +0,0 @@
# archive-fandom-wiki
This program archives the content of fandom wikis. It doesn't scrape from the fandom.com wiki sites directly; rather, it uses my [BreezeWiki](https://breezewiki.hyperreal.coffee) instance to avoid downloading unnecessary ads, images, and other junk.
Each resulting archive is self-contained, meaning one can extract the contents and browse the wiki snapshot locally (offline). The URLs for CSS, images, and links in each page are replaced by the relative `file:///` URLs for their corresponding pages on the local filesystem.
## Installation
Make sure Python and Pip are installed. Then run:
``` bash
git clone https://git.sr.ht/~hyperreal/archive-fandom-wiki
cd archive-fandom-wiki
python -m venv venv
source venv/bin/activate
pip install -r requirements.txt
```
## Usage
``` bash
archive-fandom-wiki dishonored
```
## Podman/Docker
There is also a Containerfile, also known as a Dockerfile.
``` bash
git clone https://git.sr.ht/~hyperreal/archive-fandom-wiki
cd archive-fandom-wiki
podman build -t localhost/archive-fandom-wiki:latest .
```
To run the container image:
``` bash
podman run --name archive-fandom-wiki --rm -v "${HOME}/archives:/output:Z" localhost/archive-fandom-wiki dishonored
```

View File

@ -8,7 +8,7 @@ Each resulting archive is self-contained, meaning one can extract the contents a
Make sure Python and Pip are installed. Then run: Make sure Python and Pip are installed. Then run:
#+begin_src bash #+begin_src bash
git clone https://git.sr.ht/~hyperreal/archive-fandom-wiki.git git clone https://codeberg.org/hyperreal/archive-fandom-wiki.git
cd archive-fandom-wiki cd archive-fandom-wiki
python -m venv venv python -m venv venv
source venv/bin/activate source venv/bin/activate
@ -16,14 +16,16 @@ pip install -r requirements.txt
#+end_src #+end_src
** Usage ** Usage
Optionally pass a BreezeWiki instance URL as the second argument; if it is omitted, the default value (my BreezeWiki instance URL) is used.
#+begin_src bash #+begin_src bash
archive-fandom-wiki dishonored afw dishonored https://breezewiki.instance.url
afw dishonored
#+end_src #+end_src
** Podman/Docker ** Podman/Docker
There is also a Containerfile, also known as a Dockerfile. There is also a Containerfile, also known as a Dockerfile.
#+begin_src bash #+begin_src bash
git clone https://git.sr.ht/~hyperreal/archive-fandom-wiki git clone https://codeberg.org/hyperreal/archive-fandom-wiki
cd archive-fandom-wiki cd archive-fandom-wiki
podman build -t localhost/archive-fandom-wiki:latest . podman build -t localhost/archive-fandom-wiki:latest .
#+end_src #+end_src

View File

@ -1,11 +1,28 @@
#!/usr/bin/env python #!/usr/bin/env python
"""archive-fandom-wiki
Usage:
afw <fandom> <breezewiki_instance>
afw <fandom>
Options:
-h --help Show this help message.
-v --version Show version.
Examples:
afw dishonored https://breezewiki.nirn.quest
afw residentevil
"""
# This file is formatted with `black -l 79' to comply with PEP8 standards. # This file is formatted with `black -l 79' to comply with PEP8 standards.
import concurrent.futures import concurrent.futures
import shutil import shutil
import sys import sys
from docopt import docopt
sys.tracebacklimit = 0 sys.tracebacklimit = 0
import tarfile import tarfile
from datetime import datetime from datetime import datetime
@ -21,10 +38,10 @@ console = Console()
class FandomWiki: class FandomWiki:
def __init__(self, name: str): def __init__(self, name: str, breezewiki_url: str):
self.name = name self.name = name
self.canonical_url = f"https://{name}.fandom.com" self.canonical_url = f"https://{name}.fandom.com"
self.breezewiki_url = f"https://breezewiki.hyperreal.coffee/{name}" self.breezewiki_url = breezewiki_url
self.site_dir = Path.cwd().joinpath(f"{name}.fandom.com") self.site_dir = Path.cwd().joinpath(f"{name}.fandom.com")
self.images_dir = self.site_dir.joinpath("images") self.images_dir = self.site_dir.joinpath("images")
@ -234,8 +251,8 @@ class FandomWiki:
console.log(f"Total images scraped: {len(img_files)}") console.log(f"Total images scraped: {len(img_files)}")
def archive_site(name: str): def archive_site(name: str, breezewiki_url: str = "https://breezewiki.nirn.quest"):
site = FandomWiki(name) site = FandomWiki(name, breezewiki_url)
with console.status("Fetching hop 0 URLs...", spinner="aesthetic"): with console.status("Fetching hop 0 URLs...", spinner="aesthetic"):
hop0_urls = site.get_hop0_urls() hop0_urls = site.get_hop0_urls()
@ -252,14 +269,9 @@ def archive_site(name: str):
site.archive() site.archive()
def usage_message():
console.print("Usage:\n\tarchive-fandom-wiki [[italic]name[/italic]]\n")
console.print("Example:\n\tarchive-fandom-wiki dishonored\n")
console.print("All wikis on fandom.com are supported.")
if __name__ == "__main__": if __name__ == "__main__":
if len(sys.argv) > 1: args = docopt(__doc__, options_first=True, help=True, version="1.0.1")
archive_site(sys.argv[1]) if args["<breezewiki_instance>"]:
archive_site(args["<fandom>"], args["<breezewiki_instance>"])
else: else:
usage_message() archive_site(args["<fandom>"])

View File

@ -4,4 +4,4 @@ set -e
. /venv/bin/activate . /venv/bin/activate
exec /archive-fandom-wiki "$@" exec /afw "$@"