Last active
August 12, 2024 11:24
-
-
Save lmiller1990/f04f03af7d4ee0189a677a0163e912f3 to your computer and use it in GitHub Desktop.
Pandas / Python / R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import json | |
from camel_converter import to_snake | |
def read(jsonfile):
    """Load and parse a JSON file from disk.

    Args:
        jsonfile: Path to a JSON file.

    Returns:
        The parsed JSON value (typically a dict).

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    # json.load parses straight from the file object (no intermediate
    # string); explicit UTF-8 avoids depending on the platform default.
    with open(jsonfile, "r", encoding="utf-8") as f:
        return json.load(f)
def pp(dic):
    """Pretty-print *dic* to stdout as 4-space-indented JSON."""
    formatted = json.dumps(dic, indent=4)
    print(formatted)
def flatten_json_1(nested_json, parent_key="", sep="_"):
    """Flatten a nested JSON object into a single-level dict.

    Nested dict keys are joined with *sep* and converted to snake_case
    via ``to_snake``. List items are flattened under the SAME key path
    (no index component), so sibling list items that share keys
    overwrite each other — last one wins.

    Args:
        nested_json: dict parsed from JSON.
        parent_key: accumulated key prefix during recursion.
        sep: separator between key-path components.

    Returns:
        Flat dict mapping snake_case path keys to scalar values.
    """
    items = {}
    for key, value in nested_json.items():
        new_key = to_snake(f"{parent_key}{sep}{key}" if parent_key else key)
        if isinstance(value, dict):
            # Recurse into nested objects, extending the key path.
            items.update(flatten_json_1(value, new_key, sep=sep))
        elif isinstance(value, list):
            for item in value:
                if isinstance(item, dict):
                    # Dict items merge under the shared path (no indexing).
                    items.update(flatten_json_1(item, new_key, sep=sep))
                else:
                    # Fix: a list of scalars previously crashed with
                    # AttributeError (.items() on a non-dict); store the
                    # scalar directly at the current path instead.
                    items[new_key] = item
        else:
            # Leaf value: set it at the computed key.
            items[new_key] = value
    return items
def flatten_json2(nested_json, parent_key="", sep="_"):
    """Flatten a nested JSON object into a single-level dict.

    Keys are converted to snake_case with ``to_snake`` and joined with
    *sep*. List values do not add an index component; the special key
    ``"items"`` is dropped from the path entirely — its children are
    flattened under the parent's key instead.
    """
    flat = {}
    for key, value in nested_json.items():
        snake = to_snake(key)
        # Only prefix with the parent path when one exists.
        if parent_key and not isinstance(nested_json, list):
            path = f"{parent_key}{sep}{snake}"
        else:
            path = snake
        if isinstance(value, dict):
            flat.update(flatten_json2(value, path, sep=sep))
        elif isinstance(value, list):
            # "items" contributes nothing to the path; other list keys do.
            child_path = parent_key if key == "items" else path
            for element in value:
                flat.update(flatten_json2(element, child_path, sep=sep))
        else:
            flat[path] = value
    return flat
# Load the sample payloads and flatten each result group.
data = read("data.json")
# flat1 = [flatten_json_1(group) for group in data["data"]["groups"]]
# print(json.dumps(flat1, indent=4))

data2 = read("data_2.json")
flat2 = [flatten_json2(group) for group in data2["data"]["groups"]]
output = json.dumps(flat2, indent=4)
print(output)
# print(flat)
#
import pandas as pd | |
# def extract_result_matrix(online_section_data): | |
# # Convert the dictionary to DataFrame | |
# df = pd.json_normalize(online_section_data) | |
# # Filter and process data | |
# result_matrix = ( | |
# df[df["type"] == "RESULT_MATRIX"] # Filter rows | |
# .explode("components") # Similar to unnest() in R but for one level | |
# .reset_index(drop=True)["components"] | |
# .apply( | |
# pd.Series | |
# ) # Expanding the 'components' dictionaries into separate columns | |
# .query("componentCode == 'RESULT_MATRIX'") # Additional filtering | |
# .explode("data") # 'Explode' or 'unnest' the data list | |
# .reset_index(drop=True)["data"] | |
# .apply(pd.Series) # Expanding the 'data' dictionaries | |
# .explode("groups") | |
# .reset_index(drop=True)["groups"] | |
# .apply(pd.Series) # Expanding the 'groups' dictionaries | |
# .explode("items") | |
# .reset_index(drop=True) | |
# .pipe( | |
# lambda x: pd.json_normalize(x["items"]) | |
# ) # Normalizing the items field into a flat table | |
# .pipe( | |
# lambda x: x.dropna(axis=1, how="all") | |
# ) # Optional: Drop columns that are entirely NA | |
# .pipe( | |
# lambda x: x.rename(columns=lambda c: c.replace(".", "_")) | |
# ) # Clean column names | |
# ) | |
# return result_matrix | |
def extract_result_matrix(online_section_data):
    """Extract RESULT_MATRIX component rows from a section payload.

    Args:
        online_section_data: dict (or list of dicts) with ``type`` and
            ``components`` fields, as in the example payload below.

    Returns:
        A pandas DataFrame of components whose ``componentCode`` is
        ``'RESULT_MATRIX'``, with the ``data`` list exploded to one row
        per entry.
    """
    frame = pd.json_normalize(online_section_data)

    # Keep only RESULT_MATRIX sections, then expand their components
    # into one dict per row and spread each dict into columns.
    components = (
        frame[frame["type"] == "RESULT_MATRIX"]
        .explode("components")
        .reset_index(drop=True)["components"]
        .apply(pd.Series)
    )

    # Filter to the matrix component and unnest its data list.
    result_matrix = (
        components
        .query("componentCode == 'RESULT_MATRIX'")
        .explode("data")
    )
    return result_matrix
# Example usage: minimal payload mirroring the real API response shape
# consumed by extract_result_matrix above.
online_section_data = {
    "type": "RESULT_MATRIX",
    "components": [
        {
            "componentCode": "RESULT_MATRIX",
            "data": [
                {
                    "groups": [
                        {
                            "items": [
                                {"key": "value1", "value": 123},
                                {"key": "value2", "value": 456},
                            ],
                        },
                    ],
                },
            ],
        },
    ],
}
# online_section_data = read("data_2.json") | |
# df_result = extract_result_matrix(online_section_data) | |
# print(df_result) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"data": { | |
"groups": [ | |
{ | |
"label": "Diversity", | |
"seq": 1, | |
"items": [ | |
{ | |
"healthAssociation": null, | |
"markerId": "8ef98b48-4684-4ee8-af47-70331d9f08db", | |
"marker": "Microbial Diversity", | |
"markerReference": "Microbial Diversity", | |
"markerDisplayName": "Microbial Diversity", | |
"type": "DIVERSITY", | |
"value": "3.87", | |
"rawValue": 3.874, | |
"unit": null, | |
"prevalence": "100.00%", | |
"prevalenceLevel": { | |
"label": "Very common", | |
"definition": "{x | x >= 90%}" | |
}, | |
"detoxRetoxRisk": null, | |
"markerRangeChart": { | |
"type": "MARKER_RANGE_CHART", | |
"markerChartType": "REFERENCE_RANGE_TYPE_2", | |
"markerChartName": "One way descending (Bad -> Good -> Good)", | |
"dataPoints": { | |
"average": { | |
"value": 0.0, | |
"widthMultiple": 0.3 | |
} | |
} | |
} | |
} | |
] | |
} | |
] | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment