This commit is contained in:
Jeffrey Serio 2024-12-11 01:46:56 -06:00
parent f30b52ebba
commit 14f5c60dac
3 changed files with 8 additions and 13 deletions

View File

@@ -1,28 +1,21 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import re
import sys import sys
import requests import requests
from bs4 import BeautifulSoup from courlan import extract_links, is_navigation_page, is_not_crawlable
def main(): def main():
if len(sys.argv) != 2: if len(sys.argv) != 2:
exit("Usage: print-links URL") exit("Usage: print-links URL")
url = sys.argv[1] url = sys.argv[1]
text = requests.get(url).text text = requests.get(url).text
soup = BeautifulSoup(text, "html.parser") links = set(extract_links(text, url))
links = set()
for link in soup.find_all("a"):
links.add(link.get("href"))
for link in links: for link in links:
if link.endswith("/") or link.endswith(".html"): if not is_navigation_page(link) and not is_not_crawlable(link):
match_str = re.search(r"\d{4}\/\d{2}\/\d{2}\/\w", link)
if match_str:
print(link) print(link)

View File

@@ -1,6 +1,6 @@
[project] [project]
name = "print-links" name = "print-links"
version = "0.1" version = "0.2"
authors = [ authors = [
{ name="Jeffrey Serio", email="hyperreal@fedoraproject.org" }, { name="Jeffrey Serio", email="hyperreal@fedoraproject.org" },
] ]

2
requirements.txt Normal file
View File

@@ -0,0 +1,2 @@
courlan==1.3.2
Requests==2.32.3