Skip to content

Instantly share code, notes, and snippets.

@danelliottster
Created June 18, 2021 20:48
Show Gist options
  • Save danelliottster/12b5c3b27c61442f0b3d444fb6d6cb64 to your computer and use it in GitHub Desktop.
Save danelliottster/12b5c3b27c61442f0b3d444fb6d6cb64 to your computer and use it in GitHub Desktop.
Collect all values observed for each attribute across a set of JSON files
import json
import os
import sys
from collections import Counter

# uses the flatten library (available via GitHub)
from flatten_json import flatten
# Keys whose values are skipped while values are collected from the data files.
IGNORE_LIST = [
    'SeasonInfo|boundaries',
]

# Condensed key paths that are left out of the per-value counts.
IGNORE_LIST_ADVANCED = [
    'LastModifedDate',
    'GrowerId',
    'FarmId',
    'FieldId',
    'FieldName',
    'FarmName',
    'seasonid',
    'seasongroupid',
    'metricid',
    'metricgroupid',
    'FieldCenterLon',
    'FieldCenterLat',
    'fertilizerApplication|values|dateofApplication',
    'fertilizerApplication|values|N',
    'fertilizerApplication|values|amountOfProductApplied',
    'fertilizerApplication|values|nApplied',
    'fertilizerApplication|values|pApplied',
    'cropprotectionplan|cropprotectionplandetails|dateofapplication',
    'planting|plantingDate',
    'harvest|fieldArea',
    'harvest|plantedArea',
    'harvest|harvest_yield|Cutting|yield',
    'SeasonInfo|fieldName',
    'SeasonInfo|plantingDate',
    'SeasonInfo|acres',
    'SeasonInfo|PreviousSeasonGroupID',
    'SeasonInfo|soil',
    'SeasonInfo|slopeLength',
    'SeasonInfo|nearbysurfacewater',
    'SeasonInfo|huc12WatershedCode',
    'SeasonInfo|huc12WatershedName',
    'TownShip',
    'Range',
    'fpc_manid',
    'datecreated',
]
#
# Seed the attribute table from the first record of sample.json.
# Each flattened key is condensed by dropping its purely numeric path
# components, so all elements of an array share a single key path.
# Every condensed key path starts out with an empty list of values.
#
with open('sample.json', 'r') as fh:
    # The context manager closes the file even when json.load() raises.
    test_json = json.load(fh)
sample_json = flatten(test_json[0], '|')
base_values = {}
for key_name in sample_json:
    key_path_final = '|'.join(
        part for part in key_name.split('|') if not part.isnumeric()
    )
    base_values.setdefault(key_path_final, [])
#
# Walk the data directory and record, under its condensed key path, every
# value found in every record of every JSON file.
#
for root, dirs, files in os.walk('directory_with_json_files'):
    path = root.split(os.sep)
    for file in files:
        # NOTE(review): the substring match is kept from the original; it also
        # accepts names such as 'x.geojson' or 'json.txt'. The two excluded
        # names are presumably result files of this analysis -- confirm.
        if 'json' not in file or file in ('unique_values.json', 'value_counts.json'):
            continue
        print(path, file)
        # The context manager closes the file even when json.load() raises.
        with open(os.path.join(root, file), 'r') as fh:
            test_json = json.load(fh)
        for field_json in test_json:
            tmp_json = flatten(field_json, '|')
            for key_name, value in tmp_json.items():
                key_path_final = '|'.join(
                    part for part in key_name.split('|') if not part.isnumeric()
                )
                # IGNORE_LIST entries carry no array positions, so the condensed
                # path has to be checked too; comparing only the raw flattened
                # key never matches an attribute whose value is an array.
                if key_name in IGNORE_LIST or key_path_final in IGNORE_LIST:
                    continue
                collected = base_values.setdefault(key_path_final, [])
                if isinstance(value, list):
                    # A value that is itself a list contributes its elements,
                    # not the list object.
                    collected.extend(value)
                else:
                    collected.append(value)
#
# Distinct values observed for each attribute.
# The order of each list is whatever order the intermediate set yields.
#
all_values = {
    attribute: list(set(observed))
    for attribute, observed in base_values.items()
}
#
# For every attribute not listed in IGNORE_LIST_ADVANCED, pair each distinct
# value with the number of times it was collected.
#
selected_value_counts = {}
for key_name in base_values:
    if key_name in IGNORE_LIST_ADVANCED:
        continue
    # Tally once per attribute instead of calling list.count() for every
    # distinct value, which rescans the whole list each time.
    occurrences = Counter(base_values[key_name])
    selected_value_counts[key_name] = [
        {'value': value, 'count': occurrences[value]}
        for value in all_values[key_name]
    ]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment