#!/usr/bin/env python3
#
# This little script makes it easy to Archive All The Things.
# It prints a list of URLs from fandom wiki pages to stdout.
#
# Stdout can then be redirected to a plaintext file with, e.g.:
# `get-fandom-wiki-urls cyberpunk > ~/downloads/cyberpunk-wiki-urls.txt`
#
# These URLs can then be imported directly into ArchiveBox. Each URL is a
# page listed in the local sitemap. The local sitemap is a list of wiki
# pages in alphabetical order. Importing the URLs scraped by the script
# into ArchiveBox with a depth of '1' will pull every URL one hop away, so
# every wiki page listed in the local sitemap will be archived (an example
# import command is sketched just before the code below).
#
# This script wouldn't be necessary if there were a way to view the entire
# local sitemap in one HTML page. Then all you'd have to do is import the
# URL for the local sitemap into ArchiveBox with a depth of '1'. As far as
# I know there is no way to get this view of the local sitemap; for some
# unknown reason the Fandom wiki site developers didn't design the frontend
# to enable that.
#
# LICENSE
#
# This is free and unencumbered software released into the public domain.
#
# Anyone is free to copy, modify, publish, use, compile, sell, or
# distribute this software, either in source code form or as a compiled
# binary, for any purpose, commercial or non-commercial, and by any
# means.
#
# In jurisdictions that recognize copyright laws, the author or authors
# of this software dedicate any and all copyright interest in the
# software to the public domain. We make this dedication for the benefit
# of the public at large and to the detriment of our heirs and
# successors. We intend this dedication to be an overt act of
# relinquishment in perpetuity of all present and future rights to this
# software under copyright law.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
#
# For more information, please refer to <https://unlicense.org>
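#
# As an illustration of the import step described above, the file written by
# this script can be handed to ArchiveBox in one go. This is only a sketch:
# the exact `archivebox add` flags (and whether your version reads URLs from
# stdin) may differ, so check `archivebox add --help` on your install first.
#
#   archivebox add --depth=1 < ~/downloads/cyberpunk-wiki-urls.txt
#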
import sys

import requests
from bs4 import BeautifulSoup


def get_hop0_urls(fandom: str) -> list:
    """Collect the URL of every page of the wiki's Local_Sitemap listing."""
    starting_url = "https://" + fandom + ".fandom.com/wiki/Local_Sitemap"
    hop0_urls = [starting_url]

    while True:
        reqs = requests.get(starting_url)
        soup = BeautifulSoup(reqs.text, "html.parser")
        # The pagination links ("Previous page" / "Next page") live in the
        # first div with the class "mw-allpages-nav".
        mw_allpages_nav = soup.find_all("div", {"class": "mw-allpages-nav"})[0]
        nav_links = mw_allpages_nav.find_all("a")

        # Stop once the nav holds fewer than two links and the remaining one
        # is not a "Next page" link: this is the last sitemap page.
        if len(nav_links) < 2 and "Next page" not in nav_links[0].get_text():
            break

        # Otherwise follow the "Next page" link: it is the only link on the
        # first sitemap page and the second link on every page after that.
        if len(nav_links) < 2:
            starting_url = (
                "https://" + fandom + ".fandom.com" + nav_links[0].get("href")
            )
        else:
            starting_url = (
                "https://" + fandom + ".fandom.com" + nav_links[1].get("href")
            )
        hop0_urls.append(starting_url)

    return hop0_urls


def get_hop1_urls(hop0_urls: list) -> list:
    """Collect every /wiki link found on the sitemap pages in hop0_urls."""
    hop1_urls = list()

    for url in hop0_urls:
        reqs = requests.get(url)
        soup = BeautifulSoup(reqs.text, "html.parser")
        # Everything before "/wiki" is the wiki's base URL,
        # e.g. "https://cyberpunk.fandom.com".
        base_url = url.split(sep="/wiki")[0]

        for item in soup.find_all("a"):
            if item.get("href") and item.get("href").startswith("/wiki"):
                hop1_urls.append(base_url + item.get("href"))

    return hop1_urls


def help_message():
    supported_wikis = [
        "cyberpunk",
        "dishonored",
        "dragonage",
        "forgottenrealms",
        "masseffect",
        "residentevil",
    ]
    print("Supply a fandom wiki name as arg1.\n")
    print("Currently supported wikis:")
    for wiki in supported_wikis:
        print("- %s" % wiki)


if __name__ == "__main__":
    if len(sys.argv) > 1:
        match sys.argv[1]:
            case "cyberpunk":
                urls = get_hop1_urls(get_hop0_urls("cyberpunk"))
            case "dishonored":
                urls = get_hop1_urls(get_hop0_urls("dishonored"))
            case "dragonage":
                urls = get_hop1_urls(get_hop0_urls("dragonage"))
            case "forgottenrealms":
                urls = get_hop1_urls(get_hop0_urls("forgottenrealms"))
            case "masseffect":
                urls = get_hop1_urls(get_hop0_urls("masseffect"))
            case "residentevil":
                urls = get_hop1_urls(get_hop0_urls("residentevil"))
            case _:
                # Unknown wiki: show the help text and exit before the print
                # loop below, which would otherwise hit an undefined `urls`.
                help_message()
                sys.exit(1)
        for url in urls:
            print(url)
    else:
        help_message()