#!/usr/bin/env python3
#
# This little script makes it easy to Archive All The Things.
# It prints a list of URLs from fandom wiki pages to stdout.
#
# Stdout can then be redirected to a plaintext file with, e.g.:
# `get-fandom-wiki-urls cyberpunk > ~/downloads/cyberpunk-wiki-urls.txt`
#
# These URLs can then be imported directly into ArchiveBox. Each URL will
# be a page of the local sitemap. The local sitemap is a list of wiki pages
# in alphabetical order. Importing the URLs scraped by the script into
# ArchiveBox with a depth of '1' will pull every URL one hop away, so every
# wiki page listed in the local sitemap will be archived.
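#
# A sketch of that import step (assuming an initialised ArchiveBox data
# directory and a CLI version whose `add` subcommand reads URLs from stdin;
# check `archivebox add --help` for the exact flags before relying on it):
# `archivebox add --depth=1 < ~/downloads/cyberpunk-wiki-urls.txt`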
#
# This script wouldn't be necessary if there were a way to view the entire
# local sitemap on one HTML page. Then all you'd have to do is import the
# URL for the local sitemap into ArchiveBox with a depth of '1'. As far as I
# know there is no way to get this view of the local sitemap. For some
# unknown reason the Fandom wiki site developers didn't design the frontend
# to enable that.
#
# LICENSE
# This is free and unencumbered software released into the public domain.
# Anyone is free to copy, modify, publish, use, compile, sell, or
# distribute this software, either in source code form or as a compiled
# binary, for any purpose, commercial or non-commercial, and by any
# means.
#
# In jurisdictions that recognize copyright laws, the author or authors
# of this software dedicate any and all copyright interest in the
# software to the public domain. We make this dedication for the benefit
# of the public at large and to the detriment of our heirs and
# successors. We intend this dedication to be an overt act of
# relinquishment in perpetuity of all present and future rights to this
# software under copyright law.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
#
# For more information, please refer to <https://unlicense.org>

import sys

import requests
from bs4 import BeautifulSoup


def get_hop0_urls(fandom: str) -> list:
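    """Collect the URL of every Local_Sitemap page for the given fandom.

    Starts at https://<fandom>.fandom.com/wiki/Local_Sitemap and follows the
    'Next page' link in the mw-allpages-nav element until none is left.
    """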
    starting_url = "https://" + fandom + ".fandom.com/wiki/Local_Sitemap"
    hop0_urls = [starting_url]

    while True:
        reqs = requests.get(starting_url)
        soup = BeautifulSoup(reqs.text, "html.parser")
        mw_allpages_nav = soup.find_all("div", {"class": "mw-allpages-nav"})[0]
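
        # The allpages nav carries a 'Previous page' and/or a 'Next page'
        # link. Stop when the only link left is not 'Next page', i.e. the
        # last sitemap page has been reached.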
        if (
            len(mw_allpages_nav.find_all("a")) < 2
            and "Next page" not in mw_allpages_nav.find_all("a")[0].get_text()
        ):
            break
        else:
            if len(mw_allpages_nav.find_all("a")) < 2:
                starting_url = (
                    "https://"
                    + fandom
                    + ".fandom.com"
                    + mw_allpages_nav.find_all("a")[0].get("href")
                )
            else:
                starting_url = (
                    "https://"
                    + fandom
                    + ".fandom.com"
                    + mw_allpages_nav.find_all("a")[1].get("href")
                )

        hop0_urls.append(starting_url)

    return hop0_urls


def get_hop1_urls(hop0_urls: list) -> list:
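    """Return every on-wiki link found on the given Local_Sitemap pages.

    Keeps only hrefs that start with '/wiki' and prefixes them with the
    wiki's base URL, so the result is a flat list of absolute page URLs.
    """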
    hop1_urls = list()

    for url in hop0_urls:
        reqs = requests.get(url)
        soup = BeautifulSoup(reqs.text, "html.parser")
        fandom = url.split(sep="/wiki")[0]
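
        # Keep every anchor whose href starts with '/wiki'; anchors without
        # an href and links that point off the wiki are skipped.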
        for item in soup.find_all("a"):
            if item.get("href") and item.get("href").startswith("/wiki"):
                hop1_urls.append(fandom + item.get("href"))

    return hop1_urls


def help_message():
    supported_wikis = [
        "cyberpunk",
        "dishonored",
        "dragonage",
        "forgottenrealms",
        "masseffect",
        "residentevil",
    ]
    print("Supply a fandom wiki name as arg1.\n")
    print("Currently supported wikis:")
    for wiki in supported_wikis:
        print("- %s" % wiki)


if __name__ == "__main__":
    if len(sys.argv) > 1:
        match sys.argv[1]:
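            # Every supported wiki runs the same two-step scrape; only the
            # fandom subdomain differs.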
            case "cyberpunk":
                urls = get_hop1_urls(get_hop0_urls("cyberpunk"))
            case "dishonored":
                urls = get_hop1_urls(get_hop0_urls("dishonored"))
            case "dragonage":
                urls = get_hop1_urls(get_hop0_urls("dragonage"))
            case "forgottenrealms":
                urls = get_hop1_urls(get_hop0_urls("forgottenrealms"))
            case "masseffect":
                urls = get_hop1_urls(get_hop0_urls("masseffect"))
            case "residentevil":
                urls = get_hop1_urls(get_hop0_urls("residentevil"))
            case _:
                # Unknown wiki: show the help text and bail out before the
                # print loop below tries to use an undefined 'urls'.
                help_message()
                sys.exit(1)

        for url in urls:
            print(url)
    else:
        help_message()