#!/usr/bin/env python3
#
# This little script makes it easy to Archive All The Things.
# It prints a list of URLs from fandom wiki pages to stdout.
#
# Stdout can then be redirected to a plaintext file with e.g.:
# `get_fandom_wiki_urls cyberpunk > ~/downloads/cyberpunk_wiki_urls.txt`
#
# These URLs can then be imported directly into ArchiveBox.
#
# LICENSE
#
# This is free and unencumbered software released into the public domain.
#
# Anyone is free to copy, modify, publish, use, compile, sell, or
# distribute this software, either in source code form or as a compiled
# binary, for any purpose, commercial or non-commercial, and by any
# means.
#
# In jurisdictions that recognize copyright laws, the author or authors
# of this software dedicate any and all copyright interest in the
# software to the public domain. We make this dedication for the benefit
# of the public at large and to the detriment of our heirs and
# successors. We intend this dedication to be an overt act of
# relinquishment in perpetuity of all present and future rights to this
# software under copyright law.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
#
# For more information, please refer to <https://unlicense.org>

import sys

import requests
from bs4 import BeautifulSoup


def get_urls(fandom: str) -> list[str]:
    """Return the URL of every Local_Sitemap page of the given fandom wiki."""
    starting_url = "https://" + fandom + ".fandom.com/wiki/Local_Sitemap"
    urls = [starting_url]
    while True:
        reqs = requests.get(starting_url)
        soup = BeautifulSoup(reqs.text, "html.parser")
        # The "Previous page" / "Next page" pagination links live in the first
        # div with class "mw-allpages-nav".
        mw_allpages_nav = soup.find_all("div", {"class": "mw-allpages-nav"})[0]
        nav_links = mw_allpages_nav.find_all("a")
        if not nav_links:
            # No pagination links at all (e.g. a wiki small enough to fit on
            # a single sitemap page): nothing more to follow.
            break
        if len(nav_links) < 2 and "Next page" not in nav_links[0].get_text():
            # Only a "Previous page" link remains, so this is the last page.
            break
        if len(nav_links) < 2:
            # First page: the only link is "Next page".
            starting_url = (
                "https://" + fandom + ".fandom.com" + nav_links[0].get("href")
            )
        else:
            # Middle pages: "Previous page" comes first, "Next page" second.
            starting_url = (
                "https://" + fandom + ".fandom.com" + nav_links[1].get("href")
            )
        urls.append(starting_url)
    return urls


if __name__ == "__main__":
    if len(sys.argv) > 1:
        match sys.argv[1]:
            case "cyberpunk":
                urls = get_urls("cyberpunk")
            case "dishonored":
                urls = get_urls("dishonored")
            case "dragonage":
                urls = get_urls("dragonage")
            case "masseffect":
                urls = get_urls("masseffect")
            case "residentevil":
                urls = get_urls("residentevil")
            case _:
                print("Enter a supported fandom wiki to scrape URLs from.")
                sys.exit(1)
        for url in urls:
            print(url)
    else:
        print("Please supply a fandom wiki name as arg1.")
        print("Supported wikis:")
        print("- cyberpunk")
        print("- dishonored")
        print("- dragonage")
        print("- masseffect")
        print("- residentevil")
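
# ---------------------------------------------------------------------------
# Example end-to-end workflow (a sketch, kept here as comments only). The
# header above notes that the printed URLs can be imported into ArchiveBox;
# one way to do that is to pipe the saved list into `archivebox add`, which
# accepts URLs on stdin. The output filename and the exact ArchiveBox
# invocation below are assumptions; check the ArchiveBox documentation for
# your version before relying on them.
#
#   get_fandom_wiki_urls masseffect > masseffect_wiki_urls.txt
#   archivebox add < masseffect_wiki_urls.txt
# ---------------------------------------------------------------------------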