Skip to content

Instantly share code, notes, and snippets.

@loretoparisi
Created June 18, 2020 14:04
Show Gist options
  • Save loretoparisi/a84f6c00ba64c8072f25592f993d6028 to your computer and use it in GitHub Desktop.
Save loretoparisi/a84f6c00ba64c8072f25592f993d6028 to your computer and use it in GitHub Desktop.
Dataset Statistics with Python Pandas
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# @author loretoparisi at gmail dot com
# Copyright (c) 2020 Loreto Parisi
#
### built-in
import argparse
import json
### dependencies
# parallel pandas - https://github.com/modin-project/modin
#import pandas as pd
import modin.pandas as pd
import numpy as np
### local
from util import NumpyEncoder
# a list of keys from which to generate a breakdown
BREAKDOWN_KEYS = [ 'language' ]
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--path', '-p',
                        action='store',
                        type=str,
                        help='Dataset file path',
                        default='',
                        required=True)
    parser.add_argument('--output', '-o',
                        action='store',
                        type=str,
                        help='JSON file output path',
                        default='',
                        required=False)
    parser.add_argument('--breakdown', '-bd',
                        action='store',
                        type=str,
                        help='Comma-separated list of keys to break down by frequency',
                        default=','.join(BREAKDOWN_KEYS),
                        required=False)
    args = parser.parse_args()

    stats = {}  # holds df stats keyed by column name under 'keys'
    stats['keys'] = {}

    # dataset is expected to be a tab-separated file
    df = pd.read_csv(args.path, sep='\t')

    # prepare one stats holder per column
    for col in df.columns:
        if col not in stats['keys']:
            stats['keys'][col] = {}
    stats['count'] = len(df)

    # calculate per-column stats
    breakdown_keys = args.breakdown.split(',')
    for col in df.columns:
        # non-null count; the original used len(df[col].groupby(df[col])),
        # which counts distinct values and duplicated the 'unique' stat
        stats['keys'][col]['count'] = int(df[col].count())
        stats['keys'][col]['unique'] = int(df[col].nunique())
        stats['keys'][col]['null'] = int(df[col].isnull().sum())
        # value-frequency breakdown for the requested keys
        if col in breakdown_keys:
            stats['keys'][col]['freq'] = {
                key: count for key, count in df[col].value_counts().items()
            }

    # score statistics grouped by language, when both columns are present
    # NOTE(review): breakdown default is 'language' but this section expects
    # a 'lang' column — confirm the dataset schema
    if 'lang' in df.columns and 'score' in df.columns:
        sf = df.dropna(subset=['score'])
        grouped = sf.groupby('lang')['score']
        mean = grouped.mean().sort_values(ascending=False)
        median = grouped.median()
        # deciles 0.1 .. 0.9 (endpoint=False excludes 1.0)
        perc = grouped.quantile(np.linspace(.1, 1, 9, endpoint=False))
        stats['keys']['lang']['mean'] = {}
        stats['keys']['lang']['median'] = {}
        stats['keys']['lang']['perc'] = {}
        # perc is indexed by (lang, quantile); iterate label pairs instead of
        # iterrows(), which is unavailable on the Series quantile() returns
        for (lang, quant), value in perc.items():
            lang_perc = stats['keys']['lang']['perc'].setdefault(lang, {})
            lang_perc[round(quant, 1)] = round(value, 3)
        for key in mean.index:
            stats['keys']['lang']['mean'][key] = round(mean[key], 3)
            # index median by label: the original indexed it positionally in
            # mean's sort order, pairing languages with the wrong medians
            stats['keys']['lang']['median'][key] = round(median[key], 3)

    json_out = json.dumps(stats, indent=4, cls=NumpyEncoder)
    if args.output:
        with open(args.output, "w") as outfile:
            outfile.write(json_out)
    else:
        print(json_out)
@loretoparisi
Copy link
Author

Please look here for NumpyEncoder class.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment