Skip to content

Instantly share code, notes, and snippets.

@mbrenig
Created April 4, 2017 14:31
Show Gist options
  • Save mbrenig/61b53bd7d022b2536fff8ed6b8e2acbb to your computer and use it in GitHub Desktop.
Save mbrenig/61b53bd7d022b2536fff8ed6b8e2acbb to your computer and use it in GitHub Desktop.
csv_entropy.py -- Find the entropy in CSVs
"""
csv_entropy.py
Author: MBJ (2017)
Purpose: Command line utility to assess the most "Informative" columns in a CSV file.
Detail:
* Reads the contents of a CSV file
- Assumes header row is at the top
- Data is in rows 2 onwards
* For each column calculates the Shannon entropy of the values in that column
* Writes input CSV to the output CSV file; a blank row follows the data and Row N+2 indicates the entropy of each column
"""
import csv
import math
import sys
from collections import defaultdict
def usage():
    """Print the command-line usage message and exit with a failure status.

    The message goes to stderr and the process exits with status 2, so that
    shells and calling scripts can tell an incorrect invocation apart from a
    successful run.

    Raises:
        SystemExit: always.
    """
    print("Usage: csv_entropy.py yourfile.csv results.csv", file=sys.stderr)
    sys.exit(2)
def entropy(value_counts):
    """Return the Shannon entropy, in bits, of a distribution of value counts.

    Args:
        value_counts: Mapping of value -> number of occurrences,
            e.g. {True: 10, False: 100, NULL: 5}.

    Returns:
        The entropy as a float. An empty mapping, or one whose counts are
        all zero, yields 0.0.
    """
    size = sum(value_counts.values())
    h_entropy = 0.0
    for count in value_counts.values():
        if not count:
            # A value that never occurs contributes nothing to the entropy
            # (p * log p -> 0 as p -> 0), and log(0) would raise ValueError.
            continue
        proportion = count / size
        h_entropy -= proportion * math.log2(proportion)
    return h_entropy
def analyse(infpath, outfpath):
    """Copy a CSV file and append the Shannon entropy of each column.

    Every row read from ``infpath`` is written unchanged to ``outfpath``,
    followed by an empty row and then one row holding the entropy of each
    column, ordered by column index. The first non-empty row is treated as
    the header and is left out of the entropy calculation.

    The whole input is read into memory before the output file is opened.

    Args:
        infpath: Path of the CSV file to read.
        outfpath: Path of the CSV file to write (truncated if it exists).
    """
    header = None
    # track[column index][cell value] -> number of occurrences of that value
    track = defaultdict(lambda: defaultdict(int))
    output = []

    # Read input. newline='' hands newline handling to the csv module, so
    # line breaks embedded in quoted fields are preserved as written.
    with open(infpath, 'r', newline='') as inf:
        for row in csv.reader(inf):
            if not header:
                header = row
            else:
                for colix, value in enumerate(row):
                    track[colix][value] += 1
            output.append(row)

    entropies = {ix: entropy(db) for (ix, db) in track.items()}

    # Write output. newline='' stops the text layer from translating the
    # writer's own line terminator or newlines inside field values.
    with open(outfpath, 'w', newline='') as ouf:
        csvwriter = csv.writer(ouf, lineterminator='\n')
        csvwriter.writerows(output)
        csvwriter.writerow([])
        csvwriter.writerow([entropies[colix] for colix in sorted(entropies)])
if __name__ == '__main__':
    # Script entry point: exactly two arguments are expected after the
    # script name -- the CSV to read and the CSV to write.
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        usage()
    input_file = cli_args[0]
    output_file = cli_args[1]
    print("Analysing {} and writing {}".format(input_file, output_file))
    analyse(input_file, output_file)
    print("That's all folks!")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment