Created
April 4, 2017 14:31
-
-
Save mbrenig/61b53bd7d022b2536fff8ed6b8e2acbb to your computer and use it in GitHub Desktop.
csv_entropy.py -- Find the entropy in CSVs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
csv_entropy.py
Author: MBJ (2017)
Purpose: Command line utility to assess the most "informative" columns in a CSV file.
Detail:
* Reads the contents of a CSV file
  - Assumes the header row is at the top
  - Data is in rows 2 onwards
* For each column, calculates the Shannon entropy (in bits) of the column's values
* Writes the input CSV to the output CSV file, followed by a blank row and a row
  of per-column entropies (so for an N-row input the entropies are on row N+2)
"""
import csv | |
import math | |
import sys | |
from collections import defaultdict | |
def usage():
    """Report the expected command line and abort the program.

    The usage text is handed to ``sys.exit`` so that it is written to stderr
    and the process ends with a non-zero status; a bare ``sys.exit()`` would
    report success (status 0) even though the invocation was wrong.

    Raises:
        SystemExit: always.
    """
    sys.exit("Usage: csv_entropy.py yourfile.csv results.csv")
def entropy(value_counts):
    """Return the Shannon entropy, in bits, of a set of value counts.

    Args:
        value_counts: mapping of value -> number of occurrences,
            e.g. {True: 10, False: 100, NULL: 5}.

    Returns:
        The entropy as a float. An empty mapping, or one whose counts sum to
        zero, yields 0.0.
    """
    size = sum(value_counts.values())
    if not size:
        # Nothing observed: avoid dividing by zero below.
        return 0.0

    h_entropy = 0.0
    for count in value_counts.values():
        if not count:
            # A value that never occurs contributes nothing (p*log(p) -> 0 as
            # p -> 0); calling log on 0 would raise ValueError instead.
            continue
        proportion = count / size
        # log2 is more accurate than log(x, 2) for base-2 logarithms.
        h_entropy -= proportion * math.log2(proportion)
    return h_entropy
def analyse(infpath, outfpath):
    """Copy a CSV file to a new file and append each column's entropy.

    The first non-empty row of the input is treated as the header and is not
    counted. For every later row, each cell is tallied under its column
    index; a row that is shorter than others adds nothing to the columns it
    lacks. The output file receives every input row unchanged, then an empty
    row, then one row holding the entropy of each column in column order.

    Args:
        infpath: path of the CSV file to read.
        outfpath: path of the CSV file to write (overwritten if it exists).
    """
    # column index -> (cell value -> occurrences)
    track = defaultdict(lambda: defaultdict(int))
    output = []
    header = None

    # Read input. newline='' leaves newline handling to the csv module, so
    # line breaks embedded in quoted fields are not translated by the file
    # object.
    with open(infpath, 'r', newline='') as inf:
        for row in csv.reader(inf):
            if header:
                for colix, value in enumerate(row):
                    track[colix][value] += 1
            else:
                header = row
            output.append(row)

    entropies = {ix: entropy(db) for (ix, db) in track.items()}

    # Write output. With newline='' the writer's lineterminator is emitted
    # as-is, so rows always end in '\n' and embedded newlines stay intact.
    with open(outfpath, 'w', newline='') as ouf:
        csvwriter = csv.writer(ouf, lineterminator='\n')
        csvwriter.writerows(output)
        csvwriter.writerow([])
        csvwriter.writerow([entropies[colix] for colix in sorted(entropies)])
if __name__ == '__main__':
    # Script entry point: exactly two arguments (input and output path) are
    # required after the script name; anything else shows the usage text.
    if len(sys.argv) != 3:
        usage()
    input_file, output_file = sys.argv[1:3]
    print("Analysing {} and writing {}".format(input_file, output_file))
    analyse(input_file, output_file)
    print("That's all folks!")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment