diff --git a/print_links.py b/print_links.py index 1a2e8a3..9a00d73 100644 --- a/print_links.py +++ b/print_links.py @@ -1,29 +1,22 @@ #!/usr/bin/env python3 -import re import sys import requests -from bs4 import BeautifulSoup +from courlan import extract_links, is_navigation_page, is_not_crawlable def main(): if len(sys.argv) != 2: exit("Usage: print-links URL") + url = sys.argv[1] - text = requests.get(url).text - soup = BeautifulSoup(text, "html.parser") - - links = set() - for link in soup.find_all("a"): - links.add(link.get("href")) + links = set(extract_links(text, url)) for link in links: - if link.endswith("/") or link.endswith(".html"): - match_str = re.search(r"\d{4}\/\d{2}\/\d{2}\/\w", link) - if match_str: - print(link) + if not is_navigation_page(link) and not is_not_crawlable(link): + print(link) if __name__ == "__main__": diff --git a/pyproject.toml b/pyproject.toml index bda16d1..ed27e89 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "print-links" -version = "0.1" +version = "0.2" authors = [ { name="Jeffrey Serio", email="hyperreal@fedoraproject.org" }, ] diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..804a430 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +courlan==1.3.2 +Requests==2.32.3