mirror of
https://codeberg.org/hyperreal/print-links
synced 2025-01-18 08:53:44 +01:00
Refactor
This commit is contained in:
parent
f30b52ebba
commit
14f5c60dac
@ -1,29 +1,22 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
import re
|
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
from bs4 import BeautifulSoup
|
from courlan import extract_links, is_navigation_page, is_not_crawlable
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
if len(sys.argv) != 2:
|
if len(sys.argv) != 2:
|
||||||
exit("Usage: print-links URL")
|
exit("Usage: print-links URL")
|
||||||
|
|
||||||
url = sys.argv[1]
|
url = sys.argv[1]
|
||||||
|
|
||||||
text = requests.get(url).text
|
text = requests.get(url).text
|
||||||
soup = BeautifulSoup(text, "html.parser")
|
links = set(extract_links(text, url))
|
||||||
|
|
||||||
links = set()
|
|
||||||
for link in soup.find_all("a"):
|
|
||||||
links.add(link.get("href"))
|
|
||||||
|
|
||||||
for link in links:
|
for link in links:
|
||||||
if link.endswith("/") or link.endswith(".html"):
|
if not is_navigation_page(link) and not is_not_crawlable(link):
|
||||||
match_str = re.search(r"\d{4}\/\d{2}\/\d{2}\/\w", link)
|
print(link)
|
||||||
if match_str:
|
|
||||||
print(link)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
[project]
|
[project]
|
||||||
name = "print-links"
|
name = "print-links"
|
||||||
version = "0.1"
|
version = "0.2"
|
||||||
authors = [
|
authors = [
|
||||||
{ name="Jeffrey Serio", email="hyperreal@fedoraproject.org" },
|
{ name="Jeffrey Serio", email="hyperreal@fedoraproject.org" },
|
||||||
]
|
]
|
||||||
|
2
requirements.txt
Normal file
2
requirements.txt
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
courlan==1.3.2
|
||||||
|
Requests==2.32.3
|
Loading…
Reference in New Issue
Block a user