2024-12-06 03:13:51 +01:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
|
|
|
import sys
|
|
|
|
|
|
|
|
import requests
|
2024-12-11 08:46:56 +01:00
|
|
|
from courlan import extract_links, is_navigation_page, is_not_crawlable
|
2024-12-06 03:13:51 +01:00
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Print the links found on the web page given on the command line.

    Expects exactly one command-line argument, the URL to fetch. The page
    is downloaded with ``requests``, its links are extracted with
    ``extract_links`` (using the URL as reference), and every link for
    which both ``is_navigation_page`` and ``is_not_crawlable`` return a
    false value is printed on its own line.

    Raises:
        SystemExit: with a usage message when the argument count is wrong,
            or with an error message when the page cannot be downloaded.
    """
    if len(sys.argv) != 2:
        # Use sys.exit rather than the bare exit() helper: the latter is
        # injected by the site module and is not guaranteed to exist
        # (e.g. under ``python -S`` or in frozen executables).
        sys.exit("Usage: print-links URL")

    url = sys.argv[1]

    try:
        # requests applies no timeout by default, so an unresponsive
        # server would otherwise block the script forever.
        response = requests.get(url, timeout=30)
        # Do not extract links from an HTTP error page (4xx/5xx).
        response.raise_for_status()
    except requests.RequestException as err:
        sys.exit(f"print-links: cannot fetch {url}: {err}")

    links = set(extract_links(response.text, url))

    # Sorted so the output is reproducible; plain set iteration order
    # changes between runs because of string hash randomisation.
    for link in sorted(links):
        if not is_navigation_page(link) and not is_not_crawlable(link):
            print(link)
|
2024-12-06 03:13:51 +01:00
|
|
|
|
|
|
|
|
|
|
|
# Script entry point: run the CLI only when this file is executed directly,
# so that importing the module has no side effects.
if __name__ == "__main__":
    main()
|