From 06c32469893239985de910c6b572df1531d2a84f Mon Sep 17 00:00:00 2001 From: Jeffrey Serio <23226432+hyperreal64@users.noreply.github.com> Date: Fri, 1 Sep 2023 16:07:07 -0500 Subject: [PATCH] Add get-fandom-wiki-urls --- get-fandom-wiki-urls | 103 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) create mode 100755 get-fandom-wiki-urls diff --git a/get-fandom-wiki-urls b/get-fandom-wiki-urls new file mode 100755 index 0000000..29fdc6e --- /dev/null +++ b/get-fandom-wiki-urls @@ -0,0 +1,103 @@ +#!/usr/bin/env python3 +# +# This little script makes it easy to Archive All The Things. +# It prints a list of URLs from fandom wiki pages to stdout. +# +# Stdout can then be redirected to a plaintext file with e.g.: +# `get-fandom-wiki-urls cyberpunk > ~/downloads/cyberpunk_wiki_urls.txt` +# +# These URLs can then be imported directly into ArchiveBox. +# +# LICENSE +# This is free and unencumbered software released into the public domain. +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. + +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.

# For more information, please refer to <https://unlicense.org>

import sys

import requests
from bs4 import BeautifulSoup

# Wikis this script accepts as its first command-line argument.
SUPPORTED_WIKIS = (
    "cyberpunk",
    "dishonored",
    "dragonage",
    "masseffect",
    "residentevil",
)

# Seconds to wait for each HTTP response; without a timeout a stalled
# connection would hang the script indefinitely.
REQUEST_TIMEOUT = 30


def _next_page_href(links: list) -> str | None:
    """Return the href of the pagination link to follow, or None to stop.

    ``links`` is the list of ``<a>`` tags found inside the sitemap's
    ``mw-allpages-nav`` block. With two or more links the second one is
    followed; a single link is followed only when its text contains
    "Next page"; with no links there is nothing to follow.
    """
    if not links:
        return None
    if len(links) >= 2:
        return links[1].get("href")

    only_link = links[0]
    if "Next page" in only_link.get_text():
        return only_link.get("href")
    return None


def get_urls(fandom: str) -> list[str]:
    """Collect the URLs of a fandom wiki's Local_Sitemap pages.

    Starts at ``https://<fandom>.fandom.com/wiki/Local_Sitemap`` and keeps
    following the pagination link found in the first ``mw-allpages-nav``
    block of each page until no further page is offered.

    Args:
        fandom: Subdomain of the wiki on fandom.com, e.g. ``"cyberpunk"``.

    Returns:
        The sitemap page URLs in the order they were visited, beginning
        with the starting URL.

    Raises:
        requests.RequestException: If a request fails, times out, or the
            server answers with an HTTP error status.
    """
    base_url = f"https://{fandom}.fandom.com"
    page_url = f"{base_url}/wiki/Local_Sitemap"
    urls = [page_url]

    while True:
        response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # A page without the navigation block has no pagination at all;
        # stop instead of failing on a missing element.
        nav = soup.find("div", {"class": "mw-allpages-nav"})
        if nav is None:
            break

        next_href = _next_page_href(nav.find_all("a"))
        if not next_href:
            break

        page_url = base_url + next_href
        urls.append(page_url)

    return urls


def main(argv: list[str]) -> int:
    """Run the command-line interface and return the process exit status.

    Diagnostics are written to stderr so that stdout carries only URLs and
    can be redirected safely to a file.

    Args:
        argv: Full argument vector, with the program name at index 0.

    Returns:
        0 after printing the URLs; 1 when the wiki name is missing or is
        not one of ``SUPPORTED_WIKIS``.
    """
    if len(argv) < 2:
        print("Please supply a fandom wiki name as arg1.", file=sys.stderr)
        print("Supported wikis:", file=sys.stderr)
        for wiki in SUPPORTED_WIKIS:
            print(f"- {wiki}", file=sys.stderr)
        return 1

    wiki = argv[1]
    if wiki not in SUPPORTED_WIKIS:
        # Return here: falling through would try to print URLs that were
        # never fetched.
        print("Enter a fandom wiki to scrape URLs from.", file=sys.stderr)
        return 1

    for url in get_urls(wiki):
        print(url)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))