Last active
July 11, 2022 18:12
-
-
Save cbuntain/27109e525ca867d966350fbc63f8bcfc to your computer and use it in GitHub Desktop.
CrisisFACTS Direct Download
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Download every document for one CrisisFACTS event/day pair from the Terrier
# demo server and summarise the result by source type.
#
# Flow: register (POST /register) to obtain an access key, then repeatedly
# poll GET /stream with that key until the server returns an empty body.
import json
import requests
import pandas as pd

### Necessary Credentials
credentials = {
    "institution": "University of Maryland, College Park",  # University, Company or Public Agency Name
    "contactname": "Cody Buntain",  # Your Name
    # A contact email address.
    # NOTE(review): the value below looks like a redaction placeholder -- replace it
    # with a real address before running.
    "email": "[email protected]",
    "institutiontype": "Research",  # Either 'Research', 'Industry', or 'Public Sector'
}

### What event and date do you want?
data = {
    "eventID": "007",
    "requestDate": "2020-08-27",
}

# Code below populates all_content with this event+day pair
all_content = []
url_base = "http://demos.terrier.org/crisisfacts/"

# The stream endpoint needs a high timeout (>30 seconds seems to work); the
# same limit is applied to registration so that call cannot hang forever.
request_timeout = 60

## Register with the Terrier server
data.update(credentials)
resp = requests.post(url_base + "register", json=data, timeout=request_timeout)
# Fail here, with the HTTP error, rather than later with a confusing JSON or
# KeyError when the registration was rejected.
resp.raise_for_status()
resp_dict = resp.json()
access_key = resp_dict["accessKey"]

## Using the access key, consume data from the stream
### Continue until the server returns an empty body or a non-200 status
run_count = 0
while True:
    print("Run:", run_count)

    # Stream data down
    resp = requests.get(
        url_base + "stream",
        params={"accessKey": access_key},
        timeout=request_timeout,
    )

    # Check the status BEFORE touching the body: an error response must not be
    # parsed and merged into all_content.
    if resp.status_code != 200:
        print("Stream request failed with status:", resp.status_code)
        break

    # Go from bytes to a string, since we're expecting JSON data.
    # Kept in its own name so the request dict in `data` is not clobbered.
    payload = resp.content.decode("utf8")

    # If no data, we've finished
    if not payload:
        print("Datastream exhausted")
        break

    # Convert from string to JSON
    parsed_content = json.loads(payload)

    # And extend the all_content array with this new data
    all_content.extend(parsed_content)

    run_count += 1

# Convert documents to Pandas DataFrame
df = pd.DataFrame(all_content)

# An empty download yields a frame with no columns; guard the lookup and print
# the counts so they are visible when run as a script, not only in a notebook.
if "sourceType" in df.columns:
    print(df["sourceType"].value_counts())
else:
    print("No documents with a 'sourceType' field were downloaded")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment