MIT data profiling to TDS
import io
import json
import pandas
import requests

##########################
# generate dataset metadata
# I assume this will come from the HMI during user upload
##########################
dataset = {
    "username": "Adam Smith",
    "name": "COVID-19 Forecast Hub Ground Truth Data",
    "description": "COVID-19 case incidents, hospitalization incidents and cumulative deaths provided by COVID-19 Forecast Hub.",
    "file_names": [
        "forecast_hub_demo_data.csv"
    ],
    "source": "https://github.com/reichlab/covid19-forecast-hub/blob/master/data-truth/README.md",
    "dataset_url": "https://github.com/reichlab/covid19-forecast-hub/"
}
# set the OpenAI key
with open('openai_key', 'r') as f:
    openai_key = f.read().strip()

# Load forecast_hub_demo_data.csv (my example file) as a pandas dataframe.
# The MIT extraction service works best with just the top few lines of a dataset;
# 3 or 4 rows seems to be the sweet spot, so we sample only the head below.
# Not sure how Uncharted will provide the CSV, but if it's large we don't want
# to read in the whole thing; you might have to tweak what's here for efficiency.
df = pandas.read_csv('forecast_hub_demo_data.csv')

buffer = io.StringIO()
df.head(4).to_csv(buffer, index=False)  # header plus the first few rows only
file_sample = buffer.getvalue()
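# (Sketch, not from the original gist.) If the CSV arrives as a large file on
# disk, you can build the same sample without loading anything into pandas by
# reading just the first few lines directly:
#
# with open('forecast_hub_demo_data.csv', 'r') as f:
#     file_sample = ''.join(next(f) for _ in range(5))  # header + 4 data rows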
# assume you've got some text document (e.g. about the data)
doc = 'my description about my data goes here'
######################################################
# Now we do the actual profiling!
######################################################

# First call to the MIT service: link each dataset column to the DKG
mit_url = 'http://100.26.10.46'

resp = requests.post(
    url=f"{mit_url}/annotation/link_dataset_col_to_dkg",
    params={"csv_str": file_sample, "doc": doc, "gpt_key": openai_key},
)
mit_groundings = resp.json()
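# The response schema isn't documented here; inferred from the processing
# below, it maps each column name to its DKG groundings as
# [identifier, label] pairs, roughly:
# {"date": {"dkg_groundings": [["<curie>", "<matched concept>"], ...]}, ...}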
# Second call to the MIT service: extract free-text annotations per column
resp = requests.post(
    url=f"{mit_url}/annotation/upload_file_extract/?gpt_key={openai_key}",
    files={"file": file_sample},
)
mit_annotations = {a['name']: a for a in resp.json()}
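# Again inferred from the usage below: each element of resp.json() carries the
# column 'name' plus a list of 'text_annotations' (free-text descriptions), e.g.:
# [{"name": "date", "text_annotations": ["the date of the observation"]}, ...]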
#######################################
# processing the results from MIT into the format
# expected by TDS
#######################################
columns = []
for c in df.columns:
    annotations = mit_annotations.get(c, {}).get("text_annotations", [])

    # Skip the single empty strings that are sometimes returned and drop the
    # extra non-list items that are sometimes included (usually the string 'class')
    groundings = {g[0]: g[1] for g in mit_groundings.get(c, {}).get('dkg_groundings', [])
                  if g and isinstance(g, list)}

    col = {
        "name": c,
        "data_type": "float",
        "description": annotations[0].strip() if annotations else "",
        "annotations": [],
        "metadata": {},
        "grounding": {
            "identifiers": groundings,
        },
    }
    columns.append(col)
dataset['columns'] = columns

dataset['metadata'] = {
    "documents": [
        {
            "url": "https://github.com/reichlab/covid19-forecast-hub/blob/master/data-truth/README.md",
            "title": "README: Ground truth data for the COVID-19 Forecast Hub"
        }
    ]
}
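# Optional sanity check before hitting TDS: pretty-print the assembled payload
# to confirm the columns and groundings look right (this is where the json
# import earns its keep)
print(json.dumps(dataset, indent=2))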
#######################################
# adding dataset to TDS after profiling
#######################################
# NOTE: the original snippet never defined tds_url; set this to your TDS instance
tds_url = 'http://localhost:8000'  # placeholder

resp = requests.post(f"{tds_url}/datasets", json=dataset)
dataset_id = resp.json()['id']

# Let's get the pre-signed upload URL for the raw file
query = {'filename': dataset['file_names'][0]}
resp = requests.get(f"{tds_url}/datasets/{dataset_id}/upload-url", params=query)
upload_url = resp.json()['url']

# now let's upload it
with open('forecast_hub_demo_data.csv', 'rb') as file:
    resp = requests.put(upload_url, data=file)
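# A 200 from the pre-signed PUT means the bytes landed in object storage
# (assuming S3-style pre-signed URL semantics); anything else should be retried
assert resp.status_code == 200, f"Upload failed with status {resp.status_code}"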