feat: support all wikis on fandom.com

Jeffrey Serio 2023-09-15 17:43:09 -05:00
parent 6794d04d32
commit aefa0db24b


@@ -33,6 +33,16 @@ class FandomWiki:
         if not self.images_dir.exists():
             self.images_dir.mkdir()
 
+        try:
+            response = requests.get(self.canonical_url)
+            response.raise_for_status()
+        except requests.HTTPError as http_err:
+            console.print("Oops. Something went wrong. Likely one of the following:")
+            console.print("(1) The wiki you requested does not exist.")
+            console.print("(2) You typed the name of the wiki incorrectly.")
+            console.print("(3) The server hosting that wiki is down for some reason.")
+            console.print(f"HTTP error: {http_err}")
+
     def get_hop0_urls(self) -> list:
         starting_url = "{}/wiki/Local_Sitemap".format(self.canonical_url)
         hop0_urls = list()
@@ -41,14 +51,11 @@ class FandomWiki:
             response = requests.get(starting_url)
             response.raise_for_status()
             soup = BeautifulSoup(response.text, "html.parser")
-            mw_allpages_nav = soup.find_all(
-                "div", {"class": "mw-allpages-nav"}
-            )[0]
+            mw_allpages_nav = soup.find_all("div", {"class": "mw-allpages-nav"})[0]
 
             if (
                 len(mw_allpages_nav.find_all("a")) < 2
-                and "Next page"
-                not in mw_allpages_nav.find_all("a")[0].get_text()
+                and "Next page" not in mw_allpages_nav.find_all("a")[0].get_text()
             ):
                 break
             else:
@@ -81,13 +88,9 @@ class FandomWiki:
                 if "Local_Sitemap" not in item.get(
                     "href"
                 ) and "Special:" not in item.get("href"):
-                    new_url = "{}{}".format(
-                        self.breezewiki_url, item.get("href")
-                    )
+                    new_url = "{}{}".format(self.breezewiki_url, item.get("href"))
                     hop1_urls.append(new_url)
-                    console.print(
-                        "[[bold]HOP 1[/bold]] {}".format(new_url)
-                    )
+                    console.print("[[bold]HOP 1[/bold]] {}".format(new_url))
 
         return hop1_urls
@@ -112,14 +115,10 @@ class FandomWiki:
         with open(css_filename, "wb") as outfile:
             outfile.write(response.content)
-        console.print(
-            "[[bold green]CSS[/bold green]] {}".format(css_filename)
-        )
+        console.print("[[bold green]CSS[/bold green]] {}".format(css_filename))
 
     def save_img(self, img_url: str):
-        filename = self.images_dir.joinpath(
-            Path(img_url.split("/revision")[0]).name
-        )
+        filename = self.images_dir.joinpath(Path(img_url.split("/revision")[0]).name)
         if not filename.exists():
             response = requests.get(img_url, stream=True)
             response.raise_for_status()
@@ -142,9 +141,7 @@ class FandomWiki:
         img_tags = soup.find_all("img")
         img_urls = [img["src"] for img in img_tags if "src" in img.attrs]
         clean_img_urls = [
-            x
-            for x in img_urls
-            if "breezewiki" not in x and "Wordmark" not in x
+            x for x in img_urls if "breezewiki" not in x and "Wordmark" not in x
         ]
 
         for img_url in clean_img_urls:
@@ -182,21 +179,15 @@ class FandomWiki:
                     "/{}/wiki".format(self.name)
                 ):
                     link_basename = link.get("href").partition("/wiki/")[2]
-                    link["href"] = "{}/{}.html".format(
-                        self.site_dir, link_basename
-                    )
+                    link["href"] = "{}/{}.html".format(self.site_dir, link_basename)
 
             with open(filename, "w") as outfile:
                 outfile.write(soup.prettify())
-            console.print(
-                "[[bold green]HTML[/bold green]] {}".format(filename)
-            )
+            console.print("[[bold green]HTML[/bold green]] {}".format(filename))
         else:
             console.print(
-                "[[bold yellow]HTML (EXISTS)[/bold yellow]] {}".format(
-                    filename
-                )
+                "[[bold yellow]HTML (EXISTS)[/bold yellow]] {}".format(filename)
             )
 
     def fetch_all_pages(self, hop1_urls: list):
@@ -214,14 +205,10 @@ class FandomWiki:
             if self.images_dir.joinpath(f).is_file()
         ]
-        img_archive_filename = "{}-{}.tar.xz".format(
-            self.images_dir, timestamp
-        )
+        img_archive_filename = "{}-{}.tar.xz".format(self.images_dir, timestamp)
 
         with Progress() as progress:
-            task = progress.add_task(
-                "[cyan]Archiving images...", total=len(img_files)
-            )
+            task = progress.add_task("[cyan]Archiving images...", total=len(img_files))
 
             with tarfile.open(img_archive_filename, "w:xz") as tar:
                 for img_file in img_files:
@@ -238,8 +225,7 @@ class FandomWiki:
         web_files = [
             f
             for f in self.site_dir.iterdir()
-            if self.site_dir.joinpath(f).is_file
-            or self.site_dir.joinpath(f).is_dir()
+            if self.site_dir.joinpath(f).is_file or self.site_dir.joinpath(f).is_dir()
         ]
 
         web_archive_filename = "{}-{}.tar.gz".format(self.site_dir, timestamp)
@@ -272,39 +258,13 @@ def archive_site(name: str):
 
 def usage_message():
-    supported_wikis = [
-        "cyberpunk",
-        "dishonored",
-        "dragonage",
-        "forgottenrealms",
-        "masseffect",
-        "residentevil",
-    ]
-    wiki_tree = Tree("[green]Fandom Wikis")
-    for wiki in supported_wikis:
-        wiki_tree.add(wiki)
-
     console.print("Usage:\n\tarchive-fandom-wiki [[italic]name[/italic]]\n")
     console.print("Example:\n\tarchive-fandom-wiki dishonored\n")
-    console.print(wiki_tree)
+    console.print("All wikis on fandom.com are supported.")
 
 
 if __name__ == "__main__":
     if len(sys.argv) > 1:
-        match sys.argv[1]:
-            case "cyberpunk":
-                archive_site("cyberpunk")
-            case "dishonored":
-                archive_site("dishonored")
-            case "dragonage":
-                archive_site("dragonage")
-            case "forgottenrealms":
-                archive_site("forgottenrealms")
-            case "masseffect":
-                archive_site("masseffect")
-            case "residentevil":
-                archive_site("residentevil")
-            case _:
-                usage_message()
+        archive_site(sys.argv[1])
     else:
         usage_message()
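
For context, a minimal, self-contained sketch of the simplified dispatch this commit introduces: the wiki-name whitelist and match statement are gone, and whatever name is passed on the command line goes straight to archive_site(). The FandomWiki constructor is not part of this diff, so the https://<name>.fandom.com URL scheme below is an assumption about how the name is resolved:

import sys

def usage_message():
    # Mirrors the new help text from the diff.
    print("Usage:\n\tarchive-fandom-wiki [name]")
    print("Example:\n\tarchive-fandom-wiki dishonored")
    print("All wikis on fandom.com are supported.")

def archive_site(name: str):
    # Assumed scheme: each wiki lives at a fandom.com subdomain.
    canonical_url = "https://{}.fandom.com".format(name)
    print("Archiving", canonical_url)

if __name__ == "__main__":
    if len(sys.argv) > 1:
        archive_site(sys.argv[1])  # any wiki name now works, e.g. "starwars"
    else:
        usage_message()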