MIT data profiling to TDS
import io

import pandas
import requests
##########################
# generate dataset metadata
# I assume this will come from the HMI during user upload
##########################
dataset = {
    "username": "Adam Smith",
    "name": "COVID-19 Forecast Hub Ground Truth Data",
    "description": "COVID-19 case incidents, hospitalization incidents and cumulative deaths provided by COVID-19 Forecast Hub.",
    "file_names": [
        "forecast_hub_demo_data.csv"
    ],
    "source": "https://github.com/reichlab/covid19-forecast-hub/blob/master/data-truth/README.md",
    "dataset_url": "https://github.com/reichlab/covid19-forecast-hub/"
}
# set openai key
with open('openai_key', 'r') as f:
    openai_key = f.read().strip()
# Load the example file (forecast_hub_demo_data.csv) as a pandas dataframe.
# The MIT extraction service works best with just the top few lines of a
# dataset; 3 or 4 rows seems to be the sweet spot. It isn't clear yet how
# Uncharted will provide the CSV, but if it's large we don't want to read
# the whole thing into memory, so we only parse the first few rows here.
df = pandas.read_csv('forecast_hub_demo_data.csv', nrows=4)

buffer = io.StringIO()
df.to_csv(buffer, index=False)
file_sample = buffer.getvalue()
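# file_sample is now the header plus the sampled rows as CSV text. For the
# forecast hub ground truth data that looks roughly like (hypothetical
# excerpt, not actual output):
#   date,location,location_name,value
#   2020-01-22,US,United States,0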
# assume you've got some text document (e.g. about the data)
doc = 'my description about my data goes here'
######################################################
# Now we do the actual profiling!
######################################################
# Here we perform our first call to the MIT service
mit_url = 'http://100.26.10.46'
resp = requests.post(
    url=f"{mit_url}/annotation/link_dataset_col_to_dkg",
    params={"csv_str": file_sample, "doc": doc, "gpt_key": openai_key},
)
mit_groundings = resp.json()
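# The response shape isn't documented here; based on how it's consumed below,
# it's assumed to map each column name to its DKG grounding candidates, e.g.
# something like:
#   {"date": {"dkg_groundings": [["ncit:C25164", "Date"], ...]}, ...}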
# here we perform our 2nd call to the MIT service
resp = requests.post(
    url=f"{mit_url}/annotation/upload_file_extract/?gpt_key={openai_key}",
    files={"file": file_sample},
)
mit_annotations = {a['name']: a for a in resp.json()}
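# Each annotation entry is assumed (from the lookup below) to look roughly like:
#   {"name": "date", "text_annotations": ["Date of the reported observation"], ...}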
#######################################
# processing the results from MIT into the format
# expected by TDS
#######################################
columns = []
for c in df.columns:
    annotations = mit_annotations.get(c, {}).get("text_annotations", [])

    # Skip any single empty strings that are sometimes returned and drop
    # extra items that are sometimes included (usually the string 'class')
    groundings = {
        g[0]: g[1]
        for g in mit_groundings.get(c, {}).get('dkg_groundings', [])
        if g and isinstance(g, list)
    }

    col = {
        "name": c,
        "data_type": "float",
        # guard against columns the service returned no annotation for
        "description": annotations[0].strip() if annotations else "",
        "annotations": [],
        "metadata": {},
        "grounding": {
            "identifiers": groundings,
        },
    }
    columns.append(col)
dataset['columns'] = columns

dataset['metadata'] = {
    "documents": [
        {
            "url": "https://github.com/reichlab/covid19-forecast-hub/blob/master/data-truth/README.md",
            "title": "README: Ground truth data for the COVID-19 Forecast Hub"
        }
    ]
}
#######################################
# adding dataset to TDS after profiling
#######################################
# tds_url is assumed to point at a running TDS instance; adjust as needed
tds_url = 'http://localhost:8001'

resp = requests.post(f"{tds_url}/datasets", json=dataset)
dataset_id = resp.json()['id']
# Let's get the pre-signed upload URL
query = {'filename': dataset['file_names'][0]}
resp = requests.get(f"{tds_url}/datasets/{dataset_id}/upload-url", params=query)
upload_url = resp.json()['url']
# now let's upload it
with open('forecast_hub_demo_data.csv', 'rb') as file:
    resp = requests.put(upload_url, data=file)
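# Optional sanity check (my addition, not part of the original flow):
# a pre-signed PUT should come back 2xx, so fail loudly otherwise.
resp.raise_for_status()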