Mirror of https://codeberg.org/hyperreal/archive-fandom-wiki (synced 2024-11-25 09:33:41 +01:00)
Use f-strings instead of .format()
Commit ef29e741f9 (parent 35b5438371)
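
For reference, the change applied throughout this commit swaps str.format() calls for Python f-strings, which interpolate expressions directly into the string literal. A minimal sketch of the before/after pattern (the wiki name below is a hypothetical example, not taken from the diff):

    # Both forms build the same URL; the f-string is evaluated in place.
    name = "dishonored"  # hypothetical wiki name, for illustration only
    old_style = "https://{}.fandom.com".format(name)
    new_style = f"https://{name}.fandom.com"
    assert old_style == new_style
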
@@ -16,7 +16,6 @@ import requests
 from bs4 import BeautifulSoup
 from rich.console import Console
 from rich.progress import Progress
-from rich.tree import Tree
 
 console = Console()
 
@@ -24,17 +23,11 @@ console = Console()
 class FandomWiki:
     def __init__(self, name: str):
         self.name = name
-        self.canonical_url = "https://{}.fandom.com".format(name)
-        self.breezewiki_url = "https://wiki.hyperreal.coffee/{}".format(name)
-        self.site_dir = Path.cwd().joinpath("{}.fandom.com".format(name))
+        self.canonical_url = f"https://{name}.fandom.com"
+        self.breezewiki_url = f"https://wiki.hyperreal.coffee/{name}"
+        self.site_dir = Path.cwd().joinpath(f"{name}.fandom.com")
         self.images_dir = self.site_dir.joinpath("images")
 
-        if not self.site_dir.exists():
-            self.site_dir.mkdir()
-
-        if not self.images_dir.exists():
-            self.images_dir.mkdir()
-
         try:
             response = requests.get(self.canonical_url)
             response.raise_for_status()
@@ -45,8 +38,14 @@ class FandomWiki:
             console.print("(3) The server hosting that wiki is down for some reason.\n")
             console.print(f"HTTP error: {http_err}")
 
+        if not self.site_dir.exists():
+            self.site_dir.mkdir()
+
+        if not self.images_dir.exists():
+            self.images_dir.mkdir()
+
     def get_hop0_urls(self) -> list:
-        starting_url = "{}/wiki/Local_Sitemap".format(self.canonical_url)
+        starting_url = f"{self.canonical_url}/wiki/Local_Sitemap"
         hop0_urls = list()
 
         while True:
@@ -62,18 +61,12 @@ class FandomWiki:
                 break
             else:
                 if len(mw_allpages_nav.find_all("a")) < 2:
-                    starting_url = "{}{}".format(
-                        self.canonical_url,
-                        mw_allpages_nav.find_all("a")[0].get("href"),
-                    )
+                    starting_url = f"{self.canonical_url}{mw_allpages_nav.find_all('a')[0].get('href')}"
                 else:
-                    starting_url = "{}{}".format(
-                        self.canonical_url,
-                        mw_allpages_nav.find_all("a")[1].get("href"),
-                    )
+                    starting_url = f"{self.canonical_url}{mw_allpages_nav.find_all('a')[1].get('href')}"
 
                 hop0_urls.append(starting_url)
-                console.print("[[bold]HOP 0[/bold]] {}".format(starting_url))
+                console.print(f"[[bold]HOP 0[/bold]] {starting_url}")
 
         return hop0_urls
 
@@ -90,9 +83,9 @@ class FandomWiki:
                 if "Local_Sitemap" not in item.get(
                     "href"
                 ) and "Special:" not in item.get("href"):
-                    new_url = "{}{}".format(self.breezewiki_url, item.get("href"))
+                    new_url = f"{self.breezewiki_url}{item.get('href')}"
                     hop1_urls.append(new_url)
-                    console.print("[[bold]HOP 1[/bold]] {}".format(new_url))
+                    console.print(f"[[bold]HOP 1[/bold]] {new_url}")
 
         return hop1_urls
 
@@ -111,13 +104,11 @@ class FandomWiki:
             response = requests.get(page)
             response.raise_for_status()
 
-            css_filename = self.site_dir.joinpath(
-                "proxy{}.css".format(css_pages.index(page))
-            )
+            css_filename = self.site_dir.joinpath(f"proxy{css_pages.index(page)}.css")
             with open(css_filename, "wb") as outfile:
                 outfile.write(response.content)
 
-            console.print("[[bold green]CSS[/bold green]] {}".format(css_filename))
+            console.print(f"[[bold green]CSS[/bold green]] {css_filename}")
 
     def save_img(self, img_url: str):
         filename = self.images_dir.joinpath(Path(img_url.split("/revision")[0]).name)
@@ -129,11 +120,9 @@ class FandomWiki:
                 for chunk in response.iter_content(chunk_size=8192):
                     outfile.write(chunk)
 
-            console.print("[[bold green]IMG[/bold green]] {}".format(filename))
+            console.print(f"[[bold green]IMG[/bold green]] {filename}")
         else:
-            console.print(
-                "[[bold yellow]IMG (EXISTS)[/bold yellow]] {}".format(filename)
-            )
+            console.print(f"[[bold yellow]IMG (EXISTS)[/bold yellow]] {filename}")
 
     def fetch_all_images(self, page_url: str):
         response = requests.get(page_url)
@@ -150,7 +139,7 @@ class FandomWiki:
                 self.save_img(img_url)
 
     def save_page(self, url: str):
-        filename = self.site_dir.joinpath("{}.html".format(url.split("/")[-1]))
+        filename = self.site_dir.joinpath(f"{url.split('/')[-1]}.html")
         if not filename.exists():
             response = requests.get(url)
             response.raise_for_status()
@@ -168,7 +157,7 @@ class FandomWiki:
                         "link",
                         rel="stylesheet",
                         type="text/css",
-                        href="proxy{}.css".format(i),
+                        href=f"proxy{i}.css",
                     )
                 )
 
@@ -178,19 +167,17 @@ class FandomWiki:
 
             for link in soup.find_all("a"):
                 if link.get("href") and link.get("href").startswith(
-                    "/{}/wiki".format(self.name)
+                    f"/{self.name}/wiki"
                 ):
                     link_basename = link.get("href").partition("/wiki/")[2]
-                    link["href"] = "{}/{}.html".format(self.site_dir, link_basename)
+                    link["href"] = f"{self.site_dir}/{link_basename}.html"
 
             with open(filename, "w") as outfile:
                 outfile.write(soup.prettify())
 
-            console.print("[[bold green]HTML[/bold green]] {}".format(filename))
+            console.print(f"[[bold green]HTML[/bold green]] {filename}")
         else:
-            console.print(
-                "[[bold yellow]HTML (EXISTS)[/bold yellow]] {}".format(filename)
-            )
+            console.print(f"[[bold yellow]HTML (EXISTS)[/bold yellow]] {filename}")
 
     def fetch_all_pages(self, hop1_urls: list):
         self.save_css()
@@ -207,7 +194,7 @@ class FandomWiki:
             if self.images_dir.joinpath(f).is_file()
         ]
 
-        img_archive_filename = "{}-{}.tar.xz".format(self.images_dir, timestamp)
+        img_archive_filename = f"{self.images_dir}-{timestamp}.tar.xz"
 
         with Progress() as progress:
             task = progress.add_task("[cyan]Archiving images...", total=len(img_files))
@@ -230,7 +217,7 @@ class FandomWiki:
             if self.site_dir.joinpath(f).is_file or self.site_dir.joinpath(f).is_dir()
         ]
 
-        web_archive_filename = "{}-{}.tar.gz".format(self.site_dir, timestamp)
+        web_archive_filename = f"{self.site_dir}-{timestamp}.tar.gz"
 
         with Progress() as progress:
             task = progress.add_task(
@@ -249,8 +236,8 @@ class FandomWiki:
 
         shutil.rmtree(self.site_dir, ignore_errors=True)
 
-        console.print("\nTotal web files scraped: {}".format(len(web_files)))
-        console.print("Total images scraped: {}".format(len(img_files)))
+        console.print(f"\nTotal web files scraped: {len(web_files)}")
+        console.print(f"Total images scraped: {len(img_files)}")
 
 
 def archive_site(name: str):
pyproject.toml
@@ -1,10 +1,10 @@
 [tool.poetry]
 name = "archive-fandom-wiki"
-version = "0.1.0"
+version = "0.1.1"
 description = "Archive fandom wikis"
 authors = ["Jeffrey Serio <hyperreal@fedoraproject.org>"]
 license = "GPL-3.0"
-readme = "README.md"
+readme = "README.org"
 packages = [{include = "archive_fandom_wiki"}]
 
 [tool.poetry.dependencies]