print-links/print_links.py

24 lines
463 B
Python
Raw Normal View History

2024-12-06 03:13:51 +01:00
#!/usr/bin/env python3
import sys
import requests
2024-12-11 08:46:56 +01:00
from courlan import extract_links, is_navigation_page, is_not_crawlable
2024-12-06 03:13:51 +01:00
def main():
    """Fetch the page at the URL given on the command line and print its links.

    Expects exactly one command-line argument (the URL). Links are extracted
    with ``extract_links``, de-duplicated, and printed one per line, skipping
    any link for which ``is_navigation_page`` or ``is_not_crawlable`` is true.

    Raises:
        SystemExit: With a usage message if the argument count is wrong, or
            with an error message if the page cannot be fetched.
    """
    if len(sys.argv) != 2:
        # sys.exit rather than the bare exit() helper: the latter is injected
        # by the site module and is missing under `python -S` / frozen builds.
        sys.exit("Usage: print-links URL")

    url = sys.argv[1]

    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(url, timeout=30)
        # Do not print links scraped from a 4xx/5xx error page.
        response.raise_for_status()
    except requests.RequestException as err:
        sys.exit(f"print-links: cannot fetch {url}: {err}")

    # The set removes duplicates; sorting makes the output order reproducible.
    for link in sorted(set(extract_links(response.text, url))):
        if not is_navigation_page(link) and not is_not_crawlable(link):
            print(link)
2024-12-06 03:13:51 +01:00
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()