Skip to content

Instantly share code, notes, and snippets.

@simoncozens
Created March 5, 2021 13:28
Show Gist options
  • Save simoncozens/e003aa6f4059096d23914db46ddd399c to your computer and use it in GitHub Desktop.
Save simoncozens/e003aa6f4059096d23914db46ddd399c to your computer and use it in GitHub Desktop.
A quick and dirty bit of Python to dump out headlines containing only certain characters
import requests
from urllib.parse import urlencode
# Language codes sent to the news API; they are comma-joined into the
# "languages" query parameter by get_a_page_of_headlines().
languages = ["en"]

# Every character a headline may contain for filter_nicely() to accept it.
# Grouped for readability; adjacent literals concatenate to a single string.
# NOTE(review): uppercase "W" is absent from the capitals group, so any
# headline containing "W" is rejected - confirm this is intentional.
doneglyphs = (
    " "
    "ABCDEFGHIJKLMNOPRQSTUVXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789"
    ".,-"
)

# Placeholder credential sent as the "api_token" query parameter; replace it
# with your own token before running.
API_TOKEN = "Use your own, not mine."
def get_a_page_of_headlines(page, timeout=30):
    """Fetch one page of news items and return their titles.

    Args:
        page: Page number to request (sent as the "page" query parameter).
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot hang the script forever.

    Returns:
        A ``(titles, max_page)`` tuple. ``titles`` holds the "title" of each
        item in the response's "data" list (``None`` where an item has no
        title). ``max_page`` is the "found" count divided by the "returned"
        count, rounded up so a final partial page is still counted. It is 0
        when the response reports no returned items, which makes a caller's
        ``page > max_page`` check stop paging.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    params = urlencode(
        {"api_token": API_TOKEN, "languages": ",".join(languages), "page": page}
    )
    url = "https://api.marketaux.com/v1/news/all?{}".format(params)
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # Named "payload" rather than "json" to avoid shadowing the stdlib module.
    payload = response.json()
    titles = [item.get("title") for item in payload["data"]]

    meta = payload["meta"]
    found = meta["found"]
    returned = meta["returned"]
    if not returned:
        # An empty page would otherwise divide by zero; report "no more pages".
        return titles, 0
    # Ceiling division without importing math: -(-a // b) == ceil(a / b).
    max_page = -(-found // returned)
    return titles, max_page
def filter_nicely(headline):
    """Return *headline* if it consists solely of supported characters.

    Args:
        headline: The headline text to check. May be ``None`` when a news
            item carried no title.

    Returns:
        The headline unchanged when every character in it appears in the
        module-level ``doneglyphs`` string; otherwise ``None``. Non-string
        input also yields ``None``. An empty string is returned as-is, which
        is falsy, so truthiness-based callers skip it.
    """
    # Titles are collected with dict.get(), so a missing one arrives as None;
    # iterating it would raise TypeError.
    if not isinstance(headline, str):
        return None
    # We could make exceptions for special cases here - check if we have
    # punctuation in our supported list and drop punctuation if not, etc.
    # But for now, the simple "do we have all the characters in the headline?"
    if all(glyph in doneglyphs for glyph in headline):
        return headline
    return None
# Walk the result pages in order, printing every headline that passes the
# glyph filter. The first page is always fetched; paging stops once the next
# page number exceeds the maximum reported by the most recent response.
page = 1
finished = False
while not finished:
    headlines, maxpage = get_a_page_of_headlines(page)
    for headline in headlines:
        if not filter_nicely(headline):
            continue
        print(headline)
    page += 1
    finished = page > maxpage
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment