bin/get-fandom-wiki-urls

#!/usr/bin/env python3
#
# This little script makes it easy to Archive All The Things.
# It prints a list of URLs from fandom wiki pages to stdout.
#
# Stdout can then be redirected to a plaintext file with e.g:
# `get-fandom-wiki-urls cyberpunk > ~/downloads/cyberpunk_wiki_urls.txt`
#
# These URLs can then be imported directly into ArchiveBox.
#
# LICENSE
# This is free and unencumbered software released into the public domain.
# Anyone is free to copy, modify, publish, use, compile, sell, or
# distribute this software, either in source code form or as a compiled
# binary, for any purpose, commercial or non-commercial, and by any
# means.

# In jurisdictions that recognize copyright laws, the author or authors
# of this software dedicate any and all copyright interest in the
# software to the public domain. We make this dedication for the benefit
# of the public at large and to the detriment of our heirs and
# successors. We intend this dedication to be an overt act of
# relinquishment in perpetuity of all present and future rights to this
# software under copyright law.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.

# For more information, please refer to <https://unlicense.org>

import sys
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


def get_urls(fandom: str) -> list[str]:
    """Collect the URLs of all Local_Sitemap pages of a Fandom wiki.

    Starts at ``https://<fandom>.fandom.com/wiki/Local_Sitemap`` and keeps
    following the pagination link found in the ``mw-allpages-nav`` block
    until the page offers no further link.

    Args:
        fandom: Subdomain of the wiki on fandom.com, e.g. ``"cyberpunk"``.

    Returns:
        The sitemap page URLs in the order they were visited, beginning with
        the first sitemap page.

    Raises:
        requests.RequestException: If a page cannot be fetched (connection
            error, timeout, or an HTTP error status).
    """
    base_url = "https://" + fandom + ".fandom.com"
    page_url = base_url + "/wiki/Local_Sitemap"
    urls = [page_url]
    # Guards against looping forever should the pagination ever point back
    # at a page that was already visited.
    visited = {page_url}

    while True:
        # Without a timeout a stalled connection would hang the script.
        response = requests.get(page_url, timeout=30)
        # Fail loudly instead of parsing an error page as if it were a sitemap.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        nav = soup.find("div", {"class": "mw-allpages-nav"})
        if nav is None:
            # No pagination block at all: there is nothing more to follow.
            break

        links = nav.find_all("a")
        if len(links) >= 2:
            # With two links the second one is followed (presumably
            # "Previous page" comes first and "Next page" second).
            next_link = links[1]
        elif links and "Next page" in links[0].get_text():
            # A single link is only followed when it is the "Next page" link.
            next_link = links[0]
        else:
            # Either no link or a lone link that is not "Next page".
            break

        href = next_link.get("href")
        if not href:
            break

        # urljoin handles both site-relative and absolute hrefs.
        page_url = urljoin(base_url, href)
        if page_url in visited:
            break

        visited.add(page_url)
        urls.append(page_url)

    return urls


if __name__ == "__main__":
    # Command-line entry point: print the sitemap URLs of the wiki named in
    # argv[1] to stdout, one per line.
    #
    # All diagnostics go to stderr and end with a non-zero exit status, so
    # that a redirected stdout file only ever contains URLs.
    supported_wikis = (
        "cyberpunk",
        "dishonored",
        "dragonage",
        "masseffect",
        "residentevil",
    )

    if len(sys.argv) > 1:
        wiki = sys.argv[1]

        if wiki not in supported_wikis:
            # Exit here: there is no URL list to print for an unknown wiki.
            print("Enter a fandom wiki to scrape URLs from.", file=sys.stderr)
            sys.exit(1)

        for url in get_urls(wiki):
            print(url)
    else:
        print("Please supply a fandom wiki name as arg1.", file=sys.stderr)
        print("Supported wikis:", file=sys.stderr)
        for wiki in supported_wikis:
            print("- " + wiki, file=sys.stderr)
        sys.exit(1)