mirror of
https://codeberg.org/hyperreal/bin
synced 2024-11-01 16:43:08 +01:00
Add get-fandom-wiki-urls
This commit is contained in:
parent
7d4037cd48
commit
06c3246989
103
get-fandom-wiki-urls
Executable file
103
get-fandom-wiki-urls
Executable file
@ -0,0 +1,103 @@
|
||||
#!/usr/bin/env python3
|
||||
#
|
||||
# This little script makes it easy to Archive All The Things.
|
||||
# It prints the URLs of a fandom wiki's Local_Sitemap pages to stdout.
|
||||
#
|
||||
# Stdout can then be redirected to a plaintext file with e.g:
|
||||
# `get-fandom-wiki-urls cyberpunk > ~/downloads/cyberpunk_wiki_urls.txt`
|
||||
#
|
||||
# These URLs can then be imported directly into ArchiveBox.
|
||||
#
|
||||
# LICENSE
|
||||
# This is free and unencumbered software released into the public domain.
|
||||
# Anyone is free to copy, modify, publish, use, compile, sell, or
|
||||
# distribute this software, either in source code form or as a compiled
|
||||
# binary, for any purpose, commercial or non-commercial, and by any
|
||||
# means.
|
||||
|
||||
# In jurisdictions that recognize copyright laws, the author or authors
|
||||
# of this software dedicate any and all copyright interest in the
|
||||
# software to the public domain. We make this dedication for the benefit
|
||||
# of the public at large and to the detriment of our heirs and
|
||||
# successors. We intend this dedication to be an overt act of
|
||||
# relinquishment in perpetuity of all present and future rights to this
|
||||
# software under copyright law.
|
||||
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
# OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
# For more information, please refer to <https://unlicense.org>
|
||||
|
||||
import sys
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
def get_urls(fandom: str) -> list[str]:
    """Collect the URLs of all Local_Sitemap pages of a Fandom wiki.

    Starting at ``https://<fandom>.fandom.com/wiki/Local_Sitemap``, follow the
    pagination link found in the ``mw-allpages-nav`` div of each page until no
    further page is available.

    Args:
        fandom: Subdomain of the wiki on fandom.com, e.g. ``"cyberpunk"``.

    Returns:
        The sitemap page URLs in the order they were visited, beginning with
        the initial Local_Sitemap URL.

    Raises:
        requests.RequestException: If a page cannot be fetched, the request
            times out, or the server answers with an HTTP error status.
    """
    base_url = "https://" + fandom + ".fandom.com"
    current_url = base_url + "/wiki/Local_Sitemap"
    urls = [current_url]
    # Guards against navigation that loops back to a page already visited.
    seen = {current_url}

    while True:
        # Without a timeout a stalled connection would hang the script forever.
        response = requests.get(current_url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # A sitemap that fits on one page may have no navigation div at all.
        nav = soup.find("div", {"class": "mw-allpages-nav"})
        anchors = nav.find_all("a") if nav is not None else []
        if not anchors:
            break

        if len(anchors) < 2:
            # A lone link is either "Next page" (first page) or something
            # else (presumably "Previous page" on the last page): stop then.
            next_link = anchors[0]
            if "Next page" not in next_link.get_text():
                break
        else:
            # With two or more links, the second one is taken as the next page.
            next_link = anchors[1]

        href = next_link.get("href")
        if not href:
            break

        current_url = base_url + href
        if current_url in seen:
            break
        seen.add(current_url)
        urls.append(current_url)

    return urls
|
||||
|
||||
|
||||
# Wikis this script accepts as its first command-line argument.
SUPPORTED_WIKIS = (
    "cyberpunk",
    "dishonored",
    "dragonage",
    "masseffect",
    "residentevil",
)


def main(argv: list[str]) -> int:
    """Run the command-line interface and return the process exit status.

    Args:
        argv: Full argument vector; ``argv[1]`` is the wiki name.

    Returns:
        0 after printing the URLs or the usage text, 1 when the requested
        wiki is not one of ``SUPPORTED_WIKIS``.
    """
    if len(argv) < 2:
        print("Please supply a fandom wiki name as arg1.")
        print("Supported wikis:")
        for wiki in SUPPORTED_WIKIS:
            print("- " + wiki)
        return 0

    wiki = argv[1]
    if wiki not in SUPPORTED_WIKIS:
        # Return right away: there are no URLs to print for an unknown wiki
        # (falling through used to raise NameError on the unbound `urls`).
        print("Enter a fandom wiki to scrape URLs from.")
        return 1

    for url in get_urls(wiki):
        print(url)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
|
Loading…
Reference in New Issue
Block a user