import sys

import requests
from bs4 import BeautifulSoup
def main():
    """Print the source (external) links of a Wikipedia article.

    Usage: print-wp-sources ARTICLE_URL

    Fetches ARTICLE_URL, parses the HTML, and prints the href of every
    anchor marked class="external text" (Wikipedia's citation/source
    links), skipping Wikimedia Foundation housekeeping links.
    """
    if len(sys.argv) != 2:
        # sys.exit with a string prints it to stderr and exits with status 1.
        sys.exit("Usage: print-wp-sources ARTICLE_URL")

    article_url = sys.argv[1]

    # timeout: requests.get without one can block forever on a dead host.
    response = requests.get(article_url, timeout=30)
    # Fail loudly on HTTP errors (404, 500, ...) instead of silently
    # parsing an error page as if it were the article.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    for link in soup.find_all("a", attrs={"class": "external text"}):
        # get() returns None when the anchor has no href; fall back to ""
        # so the substring checks below cannot raise TypeError.
        href = link.get("href") or ""
        if (
            "wikimediafoundation.org" not in href
            and "foundation.wikimedia.org" not in href
        ):
            print(href)
|
# Script entry point: only run when executed directly, not when imported.
if __name__ == "__main__":
    main()