From 835f4eb037f8235991b61fc9742fe0c6e3772e78 Mon Sep 17 00:00:00 2001 From: Jeffrey Serio Date: Fri, 6 Dec 2024 16:54:43 -0600 Subject: [PATCH] Use article URL and include Wikinews --- README.md | 28 +++++++++++++++++++++------- print_wp_sources.py | 15 ++++++++------- pyproject.toml | 6 +++--- 3 files changed, 32 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 3bdcacd..b2c15e6 100644 --- a/README.md +++ b/README.md @@ -1,19 +1,33 @@ # print-wp-sources -This program just prints the sources of the given Wikipedia article to standard output. +This program just prints the sources of the given Wikipedia or Wikinews article to standard output. ## Installation ``` shell -pipx install print-wp-sources --include-deps +pipx install print-wp-sources ``` ## Usage -Use the Wikipedia article's name as the argument to `print-wp-sources`. For example, if the article's URL is `https://en.wikipedia.org/wiki/Automatic_negative_thoughts`, then the argument for `print-wp-sources` would be `"Automatic_negative_thoughts"`. +Use the Wikipedia or Wikinews article's URL as the argument. 
``` shell -print-wp-sources "Automatic_negative_thoughts" +print-wp-sources "https://en.wikinews.org/wiki/Israel-Lebanon_ceasefire_faces_several_violations" + +Output: +https://en.wikinews.org/w/index.php?title=Special:Log&type=review&page=Israel-Lebanon_ceasefire_faces_several_violations +https://en.wikinews.org/w/index.php?title=Israel-Lebanon_ceasefire_faces_several_violations&action=info#mw-flaggedrevs-action-info-pages-waiting-for-review +https://thedefensepost.com/2024/12/06/israel-strikes-hezbollah-smuggling-routes/ +https://www.cbsnews.com/news/israel-war-palestinians-ceasefire-hezbollah-lebanon-strained-by-strikes/ +https://www.cbsnews.com/news/israel-hezbollah-ceasefire-claims-of-violations-on-day-2-war-hamas-gaza/ +https://theconversation.com/why-israel-and-hezbollah-reached-a-ceasefire-now-and-what-it-means-for-israel-lebanon-biden-and-trump-244700 +https://www.abc.net.au/news/2024-11-25/lebanon-ceasefire-pending-israeli-response/104642856 +https://www.aljazeera.com/news/liveblog/2024/11/25/live-destruction-in-tel-aviv-beirut-amid-hezbollah-israel-missile-fire +``` + +```shell +print-wp-sources "https://en.wikipedia.org/wiki/Automatic_negative_thoughts" Output: https://pubmed.ncbi.nlm.nih.gov/26431418 @@ -27,11 +41,11 @@ https://pubmed.ncbi.nlm.nih.gov/6630686 https://wikimediafoundation.org/ ``` -> Note: make sure to use quotes around the article name in the argument to `print-wp-sources`. +> Note: make sure to use quotes around the article URL in the argument to `print-wp-sources`. One can also easily pipe the output to a file. 
``` shell -print-wp-sources "Automatic_negative_thoughts" > sources.txt -print-wp-sources "Python_(programming_language)" | tee sources.txt +print-wp-sources "https://en.wikipedia.org/wiki/Automatic_negative_thoughts" > sources.txt +print-wp-sources "https://en.wikipedia.org/wiki/Python_(programming_language)" | tee sources.txt ``` diff --git a/print_wp_sources.py b/print_wp_sources.py index cfe6d3e..efb9e71 100644 --- a/print_wp_sources.py +++ b/print_wp_sources.py @@ -1,20 +1,21 @@ import sys -from urllib.parse import unquote -from urllib.request import urlopen +import requests from bs4 import BeautifulSoup def main(): if len(sys.argv) != 2: - exit("Usage: print-wp-sources ARTICLE_NAME") - article = sys.argv[1] + exit("Usage: print-wp-sources ARTICLE_URL") + article_url = sys.argv[1] - url = f"https://en.wikipedia.org/wiki/{article}" - text = urlopen(url).read() + text = requests.get(article_url).text soup = BeautifulSoup(text, "html.parser") for link in soup.find_all("a", attrs={"class": "external text"}): - print(unquote(link.get("href"))) + if "wikimediafoundation.org" not in link.get( + "href" + ) and "foundation.wikimedia.org" not in link.get("href"): + print(link.get("href")) if __name__ == "__main__": diff --git a/pyproject.toml b/pyproject.toml index 26c5818..055b258 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,10 +1,10 @@ [project] name = "print-wp-sources" -version = "0.4" +version = "0.5" authors = [ { name="Jeffrey Serio", email="hyperreal@fedoraproject.org" }, ] -description = "Print sources from Wikipedia articles to stdout." +description = "Print sources from Wikipedia or Wikinews articles to stdout." 
readme = "README.md" requires-python = ">=3.10" classifiers = [ @@ -12,7 +12,7 @@ classifiers = [ "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)", "Operating System :: OS Independent", ] -dependencies = ["beautifulsoup4>=4.12.3"] +dependencies = ["beautifulsoup4>=4.12.3", "requests>=2.32.3"] [project.scripts] print-wp-sources = "print_wp_sources:main"