print-wp-sources/print_wp_sources.py

23 lines
555 B
Python
Raw Permalink Normal View History

2024-12-04 19:34:45 +01:00
import sys
2024-12-06 23:54:43 +01:00
import requests
2024-12-04 19:34:45 +01:00
from bs4 import BeautifulSoup
def main():
    """Print the external links found on an article page.

    Reads the article URL from the single command-line argument, downloads
    the page, and prints the ``href`` of every ``<a>`` element whose class
    attribute is ``"external text"``, one per line. Links whose URL contains
    ``wikimediafoundation.org`` or ``foundation.wikimedia.org`` are skipped.

    Raises:
        SystemExit: with a usage message when the argument count is wrong,
            or with an error message when the page cannot be fetched.
    """
    if len(sys.argv) != 2:
        # Use sys.exit rather than the bare exit() builtin: the latter is
        # injected by the site module and is missing under ``python -S``.
        sys.exit("Usage: print-wp-sources ARTICLE_URL")
    article_url = sys.argv[1]

    try:
        # A timeout keeps the script from hanging forever on a stalled server.
        response = requests.get(article_url, timeout=30)
        # Fail loudly on 4xx/5xx instead of silently scraping an error page.
        response.raise_for_status()
    except requests.RequestException as err:
        sys.exit(f"Could not fetch {article_url}: {err}")

    # Substring match against the URL, exactly as the original filter did.
    excluded_hosts = ("wikimediafoundation.org", "foundation.wikimedia.org")

    soup = BeautifulSoup(response.text, "html.parser")
    for link in soup.find_all("a", attrs={"class": "external text"}):
        href = link.get("href")
        if not href:
            # An anchor without a usable href has nothing to print, and
            # testing ``in`` against None would raise TypeError.
            continue
        if not any(host in href for host in excluded_hosts):
            print(href)
2024-12-04 19:34:45 +01:00
# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()