mirror of
https://codeberg.org/hyperreal/archive-fandom-wiki
synced 2024-11-01 16:43:07 +01:00
feat: support all wikis on fandom.com
This commit is contained in:
parent
6794d04d32
commit
aefa0db24b
@ -33,6 +33,16 @@ class FandomWiki:
|
|||||||
if not self.images_dir.exists():
|
if not self.images_dir.exists():
|
||||||
self.images_dir.mkdir()
|
self.images_dir.mkdir()
|
||||||
|
|
||||||
|
try:
|
||||||
|
response = requests.get(self.canonical_url)
|
||||||
|
response.raise_for_status()
|
||||||
|
except requests.HTTPError as http_err:
|
||||||
|
console.print("Oops. Something went wrong. Likely one of the following:")
|
||||||
|
console.print("(1) The wiki you requested does not exist.")
|
||||||
|
console.print("(2) You typed the name of the wiki incorrectly.")
|
||||||
|
console.print("(3) The server hosting that wiki is down for some reason.")
|
||||||
|
console.print(f"HTTP error: {http_err}")
|
||||||
|
|
||||||
def get_hop0_urls(self) -> list:
|
def get_hop0_urls(self) -> list:
|
||||||
starting_url = "{}/wiki/Local_Sitemap".format(self.canonical_url)
|
starting_url = "{}/wiki/Local_Sitemap".format(self.canonical_url)
|
||||||
hop0_urls = list()
|
hop0_urls = list()
|
||||||
@ -41,14 +51,11 @@ class FandomWiki:
|
|||||||
response = requests.get(starting_url)
|
response = requests.get(starting_url)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
soup = BeautifulSoup(response.text, "html.parser")
|
soup = BeautifulSoup(response.text, "html.parser")
|
||||||
mw_allpages_nav = soup.find_all(
|
mw_allpages_nav = soup.find_all("div", {"class": "mw-allpages-nav"})[0]
|
||||||
"div", {"class": "mw-allpages-nav"}
|
|
||||||
)[0]
|
|
||||||
|
|
||||||
if (
|
if (
|
||||||
len(mw_allpages_nav.find_all("a")) < 2
|
len(mw_allpages_nav.find_all("a")) < 2
|
||||||
and "Next page"
|
and "Next page" not in mw_allpages_nav.find_all("a")[0].get_text()
|
||||||
not in mw_allpages_nav.find_all("a")[0].get_text()
|
|
||||||
):
|
):
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
@ -81,13 +88,9 @@ class FandomWiki:
|
|||||||
if "Local_Sitemap" not in item.get(
|
if "Local_Sitemap" not in item.get(
|
||||||
"href"
|
"href"
|
||||||
) and "Special:" not in item.get("href"):
|
) and "Special:" not in item.get("href"):
|
||||||
new_url = "{}{}".format(
|
new_url = "{}{}".format(self.breezewiki_url, item.get("href"))
|
||||||
self.breezewiki_url, item.get("href")
|
|
||||||
)
|
|
||||||
hop1_urls.append(new_url)
|
hop1_urls.append(new_url)
|
||||||
console.print(
|
console.print("[[bold]HOP 1[/bold]] {}".format(new_url))
|
||||||
"[[bold]HOP 1[/bold]] {}".format(new_url)
|
|
||||||
)
|
|
||||||
|
|
||||||
return hop1_urls
|
return hop1_urls
|
||||||
|
|
||||||
@ -112,14 +115,10 @@ class FandomWiki:
|
|||||||
with open(css_filename, "wb") as outfile:
|
with open(css_filename, "wb") as outfile:
|
||||||
outfile.write(response.content)
|
outfile.write(response.content)
|
||||||
|
|
||||||
console.print(
|
console.print("[[bold green]CSS[/bold green]] {}".format(css_filename))
|
||||||
"[[bold green]CSS[/bold green]] {}".format(css_filename)
|
|
||||||
)
|
|
||||||
|
|
||||||
def save_img(self, img_url: str):
|
def save_img(self, img_url: str):
|
||||||
filename = self.images_dir.joinpath(
|
filename = self.images_dir.joinpath(Path(img_url.split("/revision")[0]).name)
|
||||||
Path(img_url.split("/revision")[0]).name
|
|
||||||
)
|
|
||||||
if not filename.exists():
|
if not filename.exists():
|
||||||
response = requests.get(img_url, stream=True)
|
response = requests.get(img_url, stream=True)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
@ -142,9 +141,7 @@ class FandomWiki:
|
|||||||
img_tags = soup.find_all("img")
|
img_tags = soup.find_all("img")
|
||||||
img_urls = [img["src"] for img in img_tags if "src" in img.attrs]
|
img_urls = [img["src"] for img in img_tags if "src" in img.attrs]
|
||||||
clean_img_urls = [
|
clean_img_urls = [
|
||||||
x
|
x for x in img_urls if "breezewiki" not in x and "Wordmark" not in x
|
||||||
for x in img_urls
|
|
||||||
if "breezewiki" not in x and "Wordmark" not in x
|
|
||||||
]
|
]
|
||||||
|
|
||||||
for img_url in clean_img_urls:
|
for img_url in clean_img_urls:
|
||||||
@ -182,21 +179,15 @@ class FandomWiki:
|
|||||||
"/{}/wiki".format(self.name)
|
"/{}/wiki".format(self.name)
|
||||||
):
|
):
|
||||||
link_basename = link.get("href").partition("/wiki/")[2]
|
link_basename = link.get("href").partition("/wiki/")[2]
|
||||||
link["href"] = "{}/{}.html".format(
|
link["href"] = "{}/{}.html".format(self.site_dir, link_basename)
|
||||||
self.site_dir, link_basename
|
|
||||||
)
|
|
||||||
|
|
||||||
with open(filename, "w") as outfile:
|
with open(filename, "w") as outfile:
|
||||||
outfile.write(soup.prettify())
|
outfile.write(soup.prettify())
|
||||||
|
|
||||||
console.print(
|
console.print("[[bold green]HTML[/bold green]] {}".format(filename))
|
||||||
"[[bold green]HTML[/bold green]] {}".format(filename)
|
|
||||||
)
|
|
||||||
else:
|
else:
|
||||||
console.print(
|
console.print(
|
||||||
"[[bold yellow]HTML (EXISTS)[/bold yellow]] {}".format(
|
"[[bold yellow]HTML (EXISTS)[/bold yellow]] {}".format(filename)
|
||||||
filename
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
|
|
||||||
def fetch_all_pages(self, hop1_urls: list):
|
def fetch_all_pages(self, hop1_urls: list):
|
||||||
@ -214,14 +205,10 @@ class FandomWiki:
|
|||||||
if self.images_dir.joinpath(f).is_file()
|
if self.images_dir.joinpath(f).is_file()
|
||||||
]
|
]
|
||||||
|
|
||||||
img_archive_filename = "{}-{}.tar.xz".format(
|
img_archive_filename = "{}-{}.tar.xz".format(self.images_dir, timestamp)
|
||||||
self.images_dir, timestamp
|
|
||||||
)
|
|
||||||
|
|
||||||
with Progress() as progress:
|
with Progress() as progress:
|
||||||
task = progress.add_task(
|
task = progress.add_task("[cyan]Archiving images...", total=len(img_files))
|
||||||
"[cyan]Archiving images...", total=len(img_files)
|
|
||||||
)
|
|
||||||
|
|
||||||
with tarfile.open(img_archive_filename, "w:xz") as tar:
|
with tarfile.open(img_archive_filename, "w:xz") as tar:
|
||||||
for img_file in img_files:
|
for img_file in img_files:
|
||||||
@ -238,8 +225,7 @@ class FandomWiki:
|
|||||||
web_files = [
|
web_files = [
|
||||||
f
|
f
|
||||||
for f in self.site_dir.iterdir()
|
for f in self.site_dir.iterdir()
|
||||||
if self.site_dir.joinpath(f).is_file
|
if self.site_dir.joinpath(f).is_file or self.site_dir.joinpath(f).is_dir()
|
||||||
or self.site_dir.joinpath(f).is_dir()
|
|
||||||
]
|
]
|
||||||
|
|
||||||
web_archive_filename = "{}-{}.tar.gz".format(self.site_dir, timestamp)
|
web_archive_filename = "{}-{}.tar.gz".format(self.site_dir, timestamp)
|
||||||
@ -272,39 +258,13 @@ def archive_site(name: str):
|
|||||||
|
|
||||||
|
|
||||||
def usage_message():
|
def usage_message():
|
||||||
supported_wikis = [
|
|
||||||
"cyberpunk",
|
|
||||||
"dishonored",
|
|
||||||
"dragonage",
|
|
||||||
"forgottenrealms",
|
|
||||||
"masseffect",
|
|
||||||
"residentevil",
|
|
||||||
]
|
|
||||||
wiki_tree = Tree("[green]Fandom Wikis")
|
|
||||||
for wiki in supported_wikis:
|
|
||||||
wiki_tree.add(wiki)
|
|
||||||
|
|
||||||
console.print("Usage:\n\tarchive-fandom-wiki [[italic]name[/italic]]\n")
|
console.print("Usage:\n\tarchive-fandom-wiki [[italic]name[/italic]]\n")
|
||||||
console.print("Example:\n\tarchive-fandom-wiki dishonored\n")
|
console.print("Example:\n\tarchive-fandom-wiki dishonored\n")
|
||||||
console.print(wiki_tree)
|
console.print("All wikis on fandom.com are supported.")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
if len(sys.argv) > 1:
|
if len(sys.argv) > 1:
|
||||||
match sys.argv[1]:
|
archive_site(sys.argv[1])
|
||||||
case "cyberpunk":
|
|
||||||
archive_site("cyberpunk")
|
|
||||||
case "dishonored":
|
|
||||||
archive_site("dishonored")
|
|
||||||
case "dragonage":
|
|
||||||
archive_site("dragonage")
|
|
||||||
case "forgottenrealms":
|
|
||||||
archive_site("forgottenrealms")
|
|
||||||
case "masseffect":
|
|
||||||
archive_site("masseffect")
|
|
||||||
case "residentevil":
|
|
||||||
archive_site("residentevil")
|
|
||||||
case _:
|
|
||||||
usage_message()
|
|
||||||
else:
|
else:
|
||||||
usage_message()
|
usage_message()
|
||||||
|
Loading…
Reference in New Issue
Block a user