Created
March 5, 2021 13:28
-
-
Save simoncozens/e003aa6f4059096d23914db46ddd399c to your computer and use it in GitHub Desktop.
A quick and dirty bit of Python to dump out headlines that contain only certain characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import requests | |
| from urllib.parse import urlencode | |
# Language codes; get_a_page_of_headlines joins them with commas and sends
# them as the "languages" query parameter.
| languages = ["en"] | |
# The set of permitted characters: filter_nicely only keeps a headline when
# every one of its characters appears in this string.
# NOTE(review): uppercase "W" does not appear in this string (and "R"/"Q" are
# swapped, which is harmless) - confirm the missing "W" is intentional.
| doneglyphs = " ABCDEFGHIJKLMNOPRQSTUVXYZabcdefghijklmnopqrstuvwxyz0123456789.,-" | |
# Sent as the "api_token" query parameter on every request. The value here is
# a placeholder and must be replaced with a real token.
| API_TOKEN = "Use your own, not mine." | |
def get_a_page_of_headlines(page):
    """Fetch one page of news headlines from the marketaux API.

    Args:
        page: Page number to request (the script starts at 1).

    Returns:
        A ``(titles, page_count)`` tuple. ``titles`` is the list of
        ``"title"`` values of the returned articles (``None`` for an article
        without a title). ``page_count`` is the total number of pages as an
        integer, rounded up so that a final partial page is counted; it is 0
        when the response reports no page size at all.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    params = urlencode(
        {"api_token": API_TOKEN, "languages": ",".join(languages), "page": page}
    )
    url = "https://api.marketaux.com/v1/news/all?{}".format(params)
    # Without a timeout, requests waits forever on a stalled connection.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    # Not called `json`, so the stdlib module name is not shadowed.
    payload = response.json()
    meta = payload["meta"]

    # Prefer the page-size limit when the response provides one: "returned"
    # shrinks on the last page (and is 0 past the end), which would inflate
    # the page count or divide by zero.
    per_page = meta.get("limit") or meta["returned"]
    if per_page:
        # Ceiling division: true division yields a fractional page count,
        # which makes a `page > page_count` check skip the last partial page.
        page_count = -(-meta["found"] // per_page)
    else:
        page_count = 0

    return (
        [article.get("title") for article in payload["data"]],
        page_count,
    )
def filter_nicely(headline, glyphs=None):
    """Return *headline* if it uses only permitted characters, else ``None``.

    Args:
        headline: The headline text, or ``None`` when an article has no title.
        glyphs: Container of permitted characters (a string or a set of
            single characters). Defaults to the module-level ``doneglyphs``.

    Returns:
        ``headline`` unchanged when every character is in ``glyphs``;
        ``None`` when ``headline`` is ``None`` or contains any other character.
    """
    # Articles without a title arrive as None; iterating None would raise.
    if headline is None:
        return None
    if glyphs is None:
        glyphs = doneglyphs
    # We could make exceptions for special cases here - check if we have
    # punctuation in our supported list and drop punctuation if not, etc.
    # But for now, the simple "do we have all the characters in the headline?"
    if all(char in glyphs for char in headline):
        return headline
    return None
# Walk the result pages in order, printing every headline accepted by
# filter_nicely, and stop once the page number passes the reported maximum.
page = 1
maxpage = page  # ensures the first page is always requested
while page <= maxpage:
    headlines, maxpage = get_a_page_of_headlines(page)
    for headline in headlines:
        if not filter_nicely(headline):
            continue
        print(headline)
    page += 1
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment