#!/usr/bin/env python3
#
# This little script makes it easy to Archive All The Things.
# It prints a list of URLs from fandom wiki pages to stdout.
#
# Stdout can then be redirected to a plaintext file with, e.g.:
# `get-fandom-wiki-urls cyberpunk > ~/downloads/cyberpunk-wiki-urls.txt`
#
# These URLs can then be imported directly into ArchiveBox. Each URL will
# be a page of the local sitemap. The local sitemap is a list of wiki pages
# in alphabetical order. Importing the URLs scraped by the script into
# ArchiveBox with a depth of '1' will pull every URL one hop away, so every
# wiki page listed in the local sitemap will be archived.
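#
# A sketch of that import step (assuming an initialised ArchiveBox data
# directory and a CLI version whose `add` subcommand reads URLs from stdin;
# check `archivebox add --help` for the exact flags before relying on it):
# `archivebox add --depth=1 < ~/downloads/cyberpunk-wiki-urls.txt`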
#
# This script wouldn't be necessary if there were a way to view the entire
# local sitemap on one HTML page. Then all you'd have to do is import the
# URL for the local sitemap into ArchiveBox with a depth of '1'. As far as I
# know there is no way to get this view of the local sitemap. For some
# unknown reason the Fandom wiki site developers didn't design the frontend
# to enable that.
#
# LICENSE
# This is free and unencumbered software released into the public domain.
# Anyone is free to copy, modify, publish, use, compile, sell, or
# distribute this software, either in source code form or as a compiled
# binary, for any purpose, commercial or non-commercial, and by any
# means.
#
# In jurisdictions that recognize copyright laws, the author or authors
# of this software dedicate any and all copyright interest in the
# software to the public domain. We make this dedication for the benefit
# of the public at large and to the detriment of our heirs and
# successors. We intend this dedication to be an overt act of
# relinquishment in perpetuity of all present and future rights to this
# software under copyright law.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
#
# For more information, please refer to <https://unlicense.org>

import sys

import requests
from bs4 import BeautifulSoup


def get_hop0_urls(fandom: str) -> list:
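    """Collect the URL of every Local_Sitemap page for the given fandom.

    Starts at https://<fandom>.fandom.com/wiki/Local_Sitemap and follows the
    'Next page' link in the mw-allpages-nav element until none is left.
    """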
    starting_url = "https://" + fandom + ".fandom.com/wiki/Local_Sitemap"
    hop0_urls = [starting_url]

    while True:
        reqs = requests.get(starting_url)
        soup = BeautifulSoup(reqs.text, "html.parser")
        mw_allpages_nav = soup.find_all("div", {"class": "mw-allpages-nav"})[0]
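
        # The allpages nav carries a 'Previous page' and/or a 'Next page'
        # link. Stop when the only link left is not 'Next page', i.e. the
        # last sitemap page has been reached.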
        if (
            len(mw_allpages_nav.find_all("a")) < 2
            and "Next page" not in mw_allpages_nav.find_all("a")[0].get_text()
        ):
            break
        else:
            if len(mw_allpages_nav.find_all("a")) < 2:
                starting_url = (
                    "https://"
                    + fandom
                    + ".fandom.com"
                    + mw_allpages_nav.find_all("a")[0].get("href")
                )
            else:
                starting_url = (
                    "https://"
                    + fandom
                    + ".fandom.com"
                    + mw_allpages_nav.find_all("a")[1].get("href")
                )

        hop0_urls.append(starting_url)

    return hop0_urls


def get_hop1_urls(hop0_urls: list) -> list:
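    """Return every on-wiki link found on the given Local_Sitemap pages.

    Keeps only hrefs that start with '/wiki' and prefixes them with the
    wiki's base URL, so the result is a flat list of absolute page URLs.
    """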
    hop1_urls = list()

    for url in hop0_urls:
        reqs = requests.get(url)
        soup = BeautifulSoup(reqs.text, "html.parser")
        fandom = url.split(sep="/wiki")[0]
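
        # Keep every anchor whose href starts with '/wiki'; anchors without
        # an href and links that point off the wiki are skipped.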
        for item in soup.find_all("a"):
            if item.get("href") and item.get("href").startswith("/wiki"):
                hop1_urls.append(fandom + item.get("href"))

    return hop1_urls


def help_message():
    supported_wikis = [
        "cyberpunk",
        "dishonored",
        "dragonage",
        "forgottenrealms",
        "masseffect",
        "residentevil",
    ]
    print("Supply a fandom wiki name as arg1.\n")
    print("Currently supported wikis:")
    for wiki in supported_wikis:
        print("- %s" % wiki)


if __name__ == "__main__":
    if len(sys.argv) > 1:
        match sys.argv[1]:
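            # Every supported wiki runs the same two-step scrape; only the
            # fandom subdomain differs.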
            case "cyberpunk":
                urls = get_hop1_urls(get_hop0_urls("cyberpunk"))
            case "dishonored":
                urls = get_hop1_urls(get_hop0_urls("dishonored"))
            case "dragonage":
                urls = get_hop1_urls(get_hop0_urls("dragonage"))
            case "forgottenrealms":
                urls = get_hop1_urls(get_hop0_urls("forgottenrealms"))
            case "masseffect":
                urls = get_hop1_urls(get_hop0_urls("masseffect"))
            case "residentevil":
                urls = get_hop1_urls(get_hop0_urls("residentevil"))
            case _:
                # Unknown wiki: show the help text and bail out before the
                # print loop below tries to use an undefined 'urls'.
                help_message()
                sys.exit(1)

        for url in urls:
            print(url)
    else:
        help_message()