Skip to content

Instantly share code, notes, and snippets.

@lmiller1990
Last active August 12, 2024 11:24
Show Gist options
  • Save lmiller1990/f04f03af7d4ee0189a677a0163e912f3 to your computer and use it in GitHub Desktop.
Save lmiller1990/f04f03af7d4ee0189a677a0163e912f3 to your computer and use it in GitHub Desktop.
Pandas / Python / R
import pandas as pd
import json
from camel_converter import to_snake
def read(jsonfile):
    """Load and return the parsed contents of a JSON file.

    Args:
        jsonfile: Path of the JSON file to read.

    Returns:
        The decoded JSON document (dict, list, or scalar).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON text is UTF-8 by specification; do not rely on the platform's
    # default encoding, which differs between operating systems.
    with open(jsonfile, "r", encoding="utf-8") as f:
        return json.load(f)
def pp(dic):
    """Pretty-print ``dic`` to stdout as JSON indented by four spaces."""
    rendered = json.dumps(dic, indent=4)
    print(rendered)
def flatten_json_1(nested_json, parent_key="", sep="_"):
    """Flatten a nested JSON object into a single-level dict.

    Keys of nested objects are joined to their parent's key with ``sep`` and
    the joined key is passed through ``to_snake``.

    Lists are not indexed: every dict element of a list is flattened under
    the list's own key, so when several elements produce the same flattened
    key the last element wins. Non-dict elements of a list are collected, in
    order, into a list stored under the list's key.

    Args:
        nested_json: Mapping to flatten.
        parent_key: Key prefix accumulated from the enclosing levels.
        sep: Separator placed between key components.

    Returns:
        A new flat dict mapping the generated keys to leaf values.
    """
    items = {}
    for key, value in nested_json.items():
        new_key = to_snake(f"{parent_key}{sep}{key}" if parent_key else key)
        if isinstance(value, dict):
            # Nested object: recurse with the extended key as the prefix.
            items.update(flatten_json_1(value, new_key, sep=sep))
        elif isinstance(value, list):
            # Only dict elements can be recursed into; calling .items() on a
            # scalar element would raise AttributeError.
            scalars = []
            for item in value:
                if isinstance(item, dict):
                    items.update(flatten_json_1(item, new_key, sep=sep))
                else:
                    scalars.append(item)
            if scalars:
                items[new_key] = scalars
        else:
            # Leaf value: store it at the computed key.
            items[new_key] = value
    return items
def flatten_json2(nested_json, parent_key="", sep="_"):
    """Flatten a nested JSON object, hoisting the children of ``"items"`` lists.

    Each key is passed through ``to_snake`` and appended to ``parent_key``
    with ``sep``. Dict elements of a list are flattened under the list's key,
    except for a list stored under the literal key ``"items"``: its elements
    are flattened directly under ``parent_key``, so ``items`` does not appear
    in the generated keys. Lists are not indexed, so when several elements
    produce the same flattened key the last element wins. Non-dict elements
    of a list are collected, in order, into a list stored under the list's
    own key.

    Args:
        nested_json: Mapping to flatten.
        parent_key: Key prefix accumulated from the enclosing levels.
        sep: Separator placed between key components.

    Returns:
        A new flat dict mapping the generated keys to leaf values.
    """
    items = {}
    for key, value in nested_json.items():
        snake_key = to_snake(key)
        new_key = f"{parent_key}{sep}{snake_key}" if parent_key else snake_key
        if isinstance(value, dict):
            items.update(flatten_json2(value, new_key, sep=sep))
        elif isinstance(value, list):
            # "items" adds no key component of its own; its children inherit
            # the prefix of the enclosing object.
            child_prefix = parent_key if key == "items" else new_key
            # Only dict elements can be recursed into; calling .items() on a
            # scalar element would raise AttributeError.
            scalars = []
            for item in value:
                if isinstance(item, dict):
                    items.update(flatten_json2(item, child_prefix, sep=sep))
                else:
                    scalars.append(item)
            if scalars:
                items[new_key] = scalars
        else:
            items[new_key] = value
    return items
# data.json is only consumed by the flatten_json_1 variant, which is
# currently disabled:
#   flat1 = [flatten_json_1(group) for group in data["data"]["groups"]]
#   print(json.dumps(flat1, indent=4))
data = read("data.json")

# Flatten every group of data_2.json and dump the result as indented JSON.
data2 = read("data_2.json")
flat2 = list(map(flatten_json2, data2["data"]["groups"]))
print(json.dumps(flat2, indent=4))
# print(flat)
#
import pandas as pd
# def extract_result_matrix(online_section_data):
# # Convert the dictionary to DataFrame
# df = pd.json_normalize(online_section_data)
# # Filter and process data
# result_matrix = (
# df[df["type"] == "RESULT_MATRIX"] # Filter rows
# .explode("components") # Similar to unnest() in R but for one level
# .reset_index(drop=True)["components"]
# .apply(
# pd.Series
# ) # Expanding the 'components' dictionaries into separate columns
# .query("componentCode == 'RESULT_MATRIX'") # Additional filtering
# .explode("data") # 'Explode' or 'unnest' the data list
# .reset_index(drop=True)["data"]
# .apply(pd.Series) # Expanding the 'data' dictionaries
# .explode("groups")
# .reset_index(drop=True)["groups"]
# .apply(pd.Series) # Expanding the 'groups' dictionaries
# .explode("items")
# .reset_index(drop=True)
# .pipe(
# lambda x: pd.json_normalize(x["items"])
# ) # Normalizing the items field into a flat table
# .pipe(
# lambda x: x.dropna(axis=1, how="all")
# ) # Optional: Drop columns that are entirely NA
# .pipe(
# lambda x: x.rename(columns=lambda c: c.replace(".", "_"))
# ) # Clean column names
# )
# return result_matrix
def extract_result_matrix(online_section_data):
    """Extract the data entries of RESULT_MATRIX components as a DataFrame.

    The input is normalized with ``pd.json_normalize``; rows whose ``type``
    is ``"RESULT_MATRIX"`` are kept, their ``components`` lists are exploded
    to one component per row, components whose ``componentCode`` is
    ``"RESULT_MATRIX"`` are kept, and their ``data`` lists are exploded to
    one entry per row.

    Args:
        online_section_data: A section dict (or list of section dicts) with
            ``type`` and ``components`` keys.

    Returns:
        A DataFrame with one row per ``data`` entry of each matching
        component. When nothing matches, or the expected keys are absent,
        an empty DataFrame with columns ``componentCode`` and ``data`` is
        returned instead of raising.
    """
    no_match = pd.DataFrame(columns=["componentCode", "data"])

    # Convert the dictionary to DataFrame
    df = pd.json_normalize(online_section_data)
    if "type" not in df.columns or "components" not in df.columns:
        return no_match

    components = (
        df[df["type"] == "RESULT_MATRIX"]  # Filter rows
        .explode("components")  # Similar to unnest() in R but for one level
        .reset_index(drop=True)["components"]
    )
    # Series.apply on an empty Series returns a Series, which has no
    # .query(); bail out before expanding.
    if components.empty:
        return no_match

    # Expanding the 'components' dictionaries into separate columns
    expanded = components.apply(pd.Series)
    if "componentCode" not in expanded.columns or "data" not in expanded.columns:
        return no_match

    result_matrix = (
        expanded.query("componentCode == 'RESULT_MATRIX'")  # Additional filtering
        .explode("data")  # 'Explode' or 'unnest' the data list
    )
    return result_matrix
# Example usage: sample input for extract_result_matrix (the call itself is
# left commented out below).
online_section_data = {
    "type": "RESULT_MATRIX",
    "components": [
        {
            "componentCode": "RESULT_MATRIX",
            "data": [
                {
                    "groups": [
                        {
                            "items": [
                                {"key": name, "value": number}
                                for name, number in (("value1", 123), ("value2", 456))
                            ]
                        }
                    ]
                }
            ],
        }
    ],
}
# online_section_data = read("data_2.json")
# df_result = extract_result_matrix(online_section_data)
# print(df_result)
{
"data": {
"groups": [
{
"label": "Diversity",
"seq": 1,
"items": [
{
"healthAssociation": null,
"markerId": "8ef98b48-4684-4ee8-af47-70331d9f08db",
"marker": "Microbial Diversity",
"markerReference": "Microbial Diversity",
"markerDisplayName": "Microbial Diversity",
"type": "DIVERSITY",
"value": "3.87",
"rawValue": 3.874,
"unit": null,
"prevalence": "100.00%",
"prevalenceLevel": {
"label": "Very common",
"definition": "{x | x >= 90%}"
},
"detoxRetoxRisk": null,
"markerRangeChart": {
"type": "MARKER_RANGE_CHART",
"markerChartType": "REFERENCE_RANGE_TYPE_2",
"markerChartName": "One way descending (Bad -> Good -> Good)",
"dataPoints": {
"average": {
"value": 0.0,
"widthMultiple": 0.3
}
}
}
}
]
}
]
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment