mirror of
https://codeberg.org/hyperreal/print-links
synced 2025-01-18 08:53:44 +01:00
Refactor
This commit is contained in:
parent
f30b52ebba
commit
14f5c60dac
@ -1,28 +1,21 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import re
|
||||
import sys
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from courlan import extract_links, is_navigation_page, is_not_crawlable
|
||||
|
||||
|
||||
# Upper bound, in seconds, on how long the page fetch may block.
_REQUEST_TIMEOUT_SECONDS = 30


def main():
    """Print the crawlable, non-navigation links found on a web page.

    Usage: ``print-links URL``

    The page at URL is downloaded, its links are collected with courlan's
    ``extract_links``, and every link that neither ``is_navigation_page``
    nor ``is_not_crawlable`` flags is printed on its own line, in sorted
    order.

    Raises:
        SystemExit: with the usage string when the argument count is wrong,
            or with an error message when the page cannot be fetched.
    """
    if len(sys.argv) != 2:
        # sys.exit rather than the bare exit() helper: the latter is injected
        # by the site module and is not guaranteed to exist.
        sys.exit("Usage: print-links URL")

    url = sys.argv[1]

    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(url, timeout=_REQUEST_TIMEOUT_SECONDS)
        # Do not scrape links out of a 4xx/5xx error page.
        response.raise_for_status()
    except requests.RequestException as err:
        sys.exit(f"print-links: cannot fetch {url}: {err}")

    # extract_links is the only link source; a set removes duplicates.
    links = set(extract_links(response.text, url))

    # Sorted so the output is stable between runs (set order is not).
    for link in sorted(links):
        if is_navigation_page(link) or is_not_crawlable(link):
            continue
        print(link)
|
||||
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
[project]
|
||||
name = "print-links"
|
||||
version = "0.1"
|
||||
version = "0.2"
|
||||
authors = [
|
||||
{ name="Jeffrey Serio", email="hyperreal@fedoraproject.org" },
|
||||
]
|
||||
|
2
requirements.txt
Normal file
2
requirements.txt
Normal file
@ -0,0 +1,2 @@
|
||||
courlan==1.3.2
|
||||
Requests==2.32.3
|
Loading…
Reference in New Issue
Block a user