Last active
January 13, 2023 15:44
-
-
Save brienna/bbb381e84649a55ce1c8647665943e3b to your computer and use it in GitHub Desktop.
Requests article data from The New York Times Archive API over a period of time
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def send_request(date):
    '''Sends a request to the NYT Archive API for the given date.

    Args:
        date: Indexable pair of strings. date[0] and date[1] become the two
            path segments of the request URL (presumably year and month --
            confirm against the caller).

    Returns:
        The decoded JSON body of the response.
    '''
    base_url = 'https://api.nytimes.com/svc/archive/v1/'
    # base_url already ends with '/', so the first segment is appended
    # directly; adding another '/' here produced '.../v1//<a>/<b>.json'.
    url = base_url + date[0] + '/' + date[1] + '.json?api-key=' + YOUR_API_KEY
    response = requests.get(url).json()
    # Pause after every call; presumably to stay under the API rate limit.
    time.sleep(6)
    return response
def is_valid(article, date):
    '''An article is only worth checking if it is in range, and has a headline.

    Args:
        article: Mapping describing one article from the API response.
        date: The article's date, compared against the module-level
            `start` and `end` values.

    Returns:
        True when `start < date < end` (both bounds exclusive) and the
        article's 'headline' entry is a dict containing a 'main' key;
        False otherwise, including when 'headline' is absent.
    '''
    is_in_range = start < date < end
    # .get() so an article without a 'headline' entry is rejected
    # instead of raising KeyError.
    headline = article.get('headline')
    has_headline = isinstance(headline, dict) and 'main' in headline
    return is_in_range and has_headline
def parse_response(response):
    '''Parses and returns response as pandas data frame.

    Args:
        response: Decoded API response; the articles are read from
            response['response']['docs'].

    Returns:
        A pandas DataFrame with one row per article accepted by `is_valid`
        and the columns 'headline', 'date', 'doc_type', 'material_type',
        'section' and 'keywords'. 'section' and 'material_type' are None
        when the article lacks 'section_name' / 'type_of_material'.
    '''
    data = {'headline': [],
            'date': [],
            'doc_type': [],
            'material_type': [],
            'section': [],
            'keywords': []}
    for article in response['response']['docs']:
        # Only keep articles that fall within our date range and have a headline.
        date = dateutil.parser.parse(article['pub_date']).date()
        if not is_valid(article, date):
            continue
        data['date'].append(date)
        data['headline'].append(article['headline']['main'])
        # Look up the same key that is read: the earlier check tested for
        # 'section' while reading 'section_name'.
        data['section'].append(article.get('section_name'))
        data['doc_type'].append(article['document_type'])
        data['material_type'].append(article.get('type_of_material'))
        # Keep only the keyword values whose 'name' is 'subject'.
        keywords = [keyword['value'] for keyword in article['keywords']
                    if keyword['name'] == 'subject']
        data['keywords'].append(keywords)
    return pd.DataFrame(data)
def get_data(dates):
    '''Sends and parses request/response to/from NYT Archive API for given dates.

    For every entry in `dates` the response is fetched with `send_request`,
    converted with `parse_response`, and written to
    'headlines/<date[0]>-<date[1]>.csv'. The 'headlines' directory is created
    if it does not exist yet.

    Args:
        dates: Sequence of indexable pairs of strings, as accepted by
            `send_request`. May be empty, in which case nothing is fetched.

    Returns:
        None. Progress and the total number of collected articles are printed.
    '''
    total = 0
    # Guard the range message: indexing an empty sequence would raise IndexError.
    if dates:
        print('Date range: ' + str(dates[0]) + ' to ' + str(dates[-1]))
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs('headlines', exist_ok=True)
    for date in dates:
        response = send_request(date)
        df = parse_response(response)
        total += len(df)
        path = 'headlines/' + date[0] + '-' + date[1] + '.csv'
        print('Saving ' + path + '...')
        df.to_csv(path, index=False)
    print('Number of articles collected: ' + str(total))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment