This commit is contained in:
Jeffrey Serio 2024-12-11 01:46:56 -06:00
parent f30b52ebba
commit 14f5c60dac
3 changed files with 8 additions and 13 deletions

View File

@ -1,28 +1,21 @@
#!/usr/bin/env python3
import re
import sys
import requests
from bs4 import BeautifulSoup
from courlan import extract_links, is_navigation_page, is_not_crawlable
# Command-line entry point: takes one URL argument, downloads that page,
# and prints a filtered subset of the links associated with it.
# NOTE(review): the statements below appear without indentation and look
# like removed and added diff lines shown together (a BeautifulSoup/regex
# variant and a courlan variant) -- confirm which lines belong to the
# revision you care about before relying on the exact control flow.
def main():
# Exactly one argument (the URL) is required; otherwise exit with the
# usage string.
if len(sys.argv) != 2:
exit("Usage: print-links URL")
url = sys.argv[1]
# Download the page and keep its body as text.
text = requests.get(url).text
# BeautifulSoup variant: gather the href attribute of every <a> element.
# link.get("href") can yield None for anchors that have no href.
soup = BeautifulSoup(text, "html.parser")
links = set()
for link in soup.find_all("a"):
links.add(link.get("href"))
# courlan variant: rebinds `links`, so the set built above is not used
# past this point. extract_links receives the page text and the page URL;
# presumably it returns the links found in the page -- verify against the
# courlan documentation.
links = set(extract_links(text, url))
for link in links:
# Keep only links that end in "/" or ".html".
if link.endswith("/") or link.endswith(".html"):
# Require a 4-digit/2-digit/2-digit/ path segment followed by a word
# character somewhere in the link.
match_str = re.search(r"\d{4}\/\d{2}\/\d{2}\/\w", link)
if match_str:
# Print the link only when both courlan predicates return a falsy value
# (presumably: not a navigation page and not marked uncrawlable -- confirm).
if not is_navigation_page(link) and not is_not_crawlable(link):
print(link)

View File

@ -1,6 +1,6 @@
[project]
name = "print-links"
version = "0.1"
version = "0.2"
authors = [
{ name="Jeffrey Serio", email="hyperreal@fedoraproject.org" },
]

2
requirements.txt Normal file
View File

@ -0,0 +1,2 @@
courlan==1.3.2
Requests==2.32.3