# Gist danelliottster/12b5c3b27c61442f0b3d444fb6d6cb64, created June 18, 2021 20:48.
#
# Collect all values for each attribute found in a set of JSON values.
#
# NOTE: GitHub flagged this file as containing bidirectional Unicode text that
# may be interpreted or compiled differently than it appears. Review it in an
# editor that reveals hidden Unicode characters.
import json
import os
import sys
from collections import Counter

# uses the flatten library (available via GitHub)
from flatten_json import flatten
# Key paths that are skipped while values are collected from the JSON files.
IGNORE_LIST = ['SeasonInfo|boundaries']

# Condensed key paths whose values are collected but left out of the
# per-value counts.  The strings must match the data keys exactly, so their
# spelling is preserved as-is.
IGNORE_LIST_ADVANCED = [
    'LastModifedDate',
    'GrowerId',
    'FarmId',
    'FieldId',
    'FieldName',
    'FarmName',
    'seasonid',
    'seasongroupid',
    'metricid',
    'metricgroupid',
    'FieldCenterLon',
    'FieldCenterLat',
    'fertilizerApplication|values|dateofApplication',
    'fertilizerApplication|values|N',
    'fertilizerApplication|values|amountOfProductApplied',
    'fertilizerApplication|values|nApplied',
    'fertilizerApplication|values|pApplied',
    'cropprotectionplan|cropprotectionplandetails|dateofapplication',
    'planting|plantingDate',
    'harvest|fieldArea',
    'harvest|plantedArea',
    'harvest|harvest_yield|Cutting|yield',
    'SeasonInfo|fieldName',
    'SeasonInfo|plantingDate',
    'SeasonInfo|acres',
    'SeasonInfo|PreviousSeasonGroupID',
    'SeasonInfo|soil',
    'SeasonInfo|slopeLength',
    'SeasonInfo|nearbysurfacewater',
    'SeasonInfo|huc12WatershedCode',
    'SeasonInfo|huc12WatershedName',
    'TownShip',
    'Range',
    'fpc_manid',
    'datecreated',
]
#
# create a list of keynames
# condense arrays into a single element
#
# Seed base_values with one empty list per condensed key path found in the
# first record of sample.json.  A flattened key is split on '|' and its purely
# numeric components (presumably the list indices inserted by flatten) are
# dropped, so every element of an array maps onto the same path.
with open('sample.json', 'r') as fh:
    test_json = json.load(fh)
sample_json = flatten(test_json[0], '|')
base_values = {}
for key_name in sample_json:
    key_path_final = '|'.join(
        part for part in key_name.split('|') if not part.isnumeric()
    )
    base_values.setdefault(key_path_final, [])
# Walk the data directory and append every value of every record in every
# JSON file to the list stored under that value's condensed key path.
for root, dirs, files in os.walk('directory_with_json_files'):
    path = root.split(os.sep)
    for file in files:
        # Any file name containing 'json' is read, except the two names below
        # (presumably this script's own output files -- the code writing them
        # is not visible here).
        if 'json' not in file or file in ('unique_values.json', 'value_counts.json'):
            continue
        print(path, file)
        with open(os.path.join(root, file), 'r') as fh:
            test_json = json.load(fh)
        for field_json in test_json:
            tmp_json = flatten(field_json, '|')
            for key_name, value in tmp_json.items():
                # Drop the purely numeric path components so that all
                # elements of an array share one condensed key path.
                key_path_final = '|'.join(
                    part for part in key_name.split('|') if not part.isnumeric()
                )
                # Check the condensed path as well as the raw key: an ignored
                # attribute nested inside an array carries index components
                # and would never equal its IGNORE_LIST entry otherwise.
                if key_name in IGNORE_LIST or key_path_final in IGNORE_LIST:
                    continue
                collected = base_values.setdefault(key_path_final, [])
                if isinstance(value, list):
                    # A list value is merged element-wise into the collection.
                    collected.extend(value)
                else:
                    collected.append(value)
#
# what values are seen in the data
#
# Distinct values observed for each condensed key path.  De-duplication goes
# through a set, so the order of each resulting list is arbitrary.
# NOTE(review): set() needs hashable values; an unhashable collected value
# (e.g. a dict) would raise TypeError here -- confirm the data never has one.
all_values = {
    key_name: list(set(observed))
    for key_name, observed in base_values.items()
}
#
# provide potential values and stats for each field
#
# For every key path not listed in IGNORE_LIST_ADVANCED, build a list of
# {'value': ..., 'count': ...} entries, one per distinct value, in the same
# order as all_values[key_name].
selected_value_counts = {}
for key_name, observed in base_values.items():
    if key_name in IGNORE_LIST_ADVANCED:
        continue
    # Tally once per key instead of calling list.count() for each distinct
    # value, which rescans the whole list every time.
    tally = Counter(observed)
    selected_value_counts[key_name] = [
        {'value': value, 'count': tally[value]}
        for value in all_values[key_name]
    ]
# (GitHub page footer: sign up or sign in to comment on this gist.)