Last active
August 29, 2015 13:56
-
-
Save danstowell/9040956 to your computer and use it in GitHub Desktop.
Cross-tabulation analysis of the data comparing HOT OSM building damage annotations against damage annotations from the American Red Cross, at https://github.com/AmericanRedCross/OSM-Assessment/tree/gh-pages/data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import os, json | |
import numpy as np | |
# crosstabulate the geojson records for OSM damage indicators vs "observed" (by American Red Cross analysts) damage | |
datadir = '.' | |
fpaths = [ | |
'osm_damage_no', | |
'osm_damage_damaged', | |
'osm_damage_destroyed', | |
'observed_damage_no', | |
'observed_damage_partial', | |
'observed_damage_major', | |
'observed_damage_total', | |
] | |
cross = {} | |
keys = {'osm':[], 'observed':[]} | |
# Here we load the data into a form suitable for crosstabulating | |
for fpath in fpaths: | |
print("Loading %s" % fpath) | |
sourcetype = fpath.split('_')[0] | |
damagetype = fpath.split('_')[2] | |
keys[sourcetype].append(damagetype) | |
with open("%s/%s.geojson" % (datadir, fpath), 'rb') as fp: | |
data = json.load(fp) | |
print(" contains %i items" % len(data["features"])) | |
for item in data["features"]: | |
#print item["properties"] | |
osmid = int(item["properties"]["osm_id"]) | |
if osmid not in cross: | |
cross[osmid] = {} | |
cross[osmid][sourcetype] = damagetype | |
# Now we calc the crosstab counts | |
crosstab = {osmkey:{obskey:0 for obskey in keys['observed']} for osmkey in keys['osm']} | |
unmatched = {sourcetype:{dkey:0 for dkey in dkeys} for sourcetype, dkeys in keys.items()} | |
marginals = {sourcetype:{dkey:0 for dkey in dkeys} for sourcetype, dkeys in keys.items()} | |
for item in cross.values(): | |
if ('observed' in item) and ('osm' in item): | |
crosstab[item['osm']][item['observed']] += 1 | |
elif ('observed' in item): | |
unmatched['observed'][item['observed']] += 1 | |
else: | |
unmatched['osm'][item['osm']] += 1 | |
if ('observed' in item): | |
marginals['observed'][item['observed']] += 1 | |
if ('osm' in item): | |
marginals['osm'][item['osm']] += 1 | |
print('-------------------------------------------') | |
print('MARGINALS:') | |
print marginals | |
print('') | |
print('-------------------------------------------') | |
print('CROSSTABULATION:') | |
print('\t' + '\t'.join(keys['osm'])) | |
for obskey in keys['observed']: | |
print(obskey + '\t' + '\t'.join([str(crosstab[osmkey][obskey]) for osmkey in keys['osm']])) | |
print('') | |
print('-------------------------------------------') | |
print('INFORMATION ANALYSIS:') | |
def entropy(counts): | |
tot = np.sum(counts, dtype=np.float32) | |
val = 0. | |
for count in counts: | |
prob = float(count)/tot | |
val -= prob * np.log2(prob) | |
return val | |
jointbincounts = [] # this is needed to simplify calcing the joint entropy | |
for subset in crosstab.values(): | |
jointbincounts.extend(subset.values()) | |
h_obs = entropy(marginals['observed'].values()) | |
h_osm = entropy(marginals['osm'].values()) | |
h_joint = entropy(jointbincounts) | |
mutualinfo = h_obs + h_osm - h_joint | |
print('Entropy of observed: %g bits' % (h_obs)) | |
print('Entropy of osm: %g bits' % (h_osm)) | |
print('Entropy of joint: %g bits' % (h_joint)) | |
print('Mutual information: %g bits' % (mutualinfo)) | |
print('Mutual information as a proportion of the observed entropy: %g %%' % (100 * mutualinfo/h_obs)) | |
print('') | |
print('-------------------------------------------') | |
print('UNMATCHED:') | |
print unmatched |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment