Dataset Statistics with Python Pandas
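The script below loads a tab-separated dataset into a (Modin) DataFrame and computes, for every column, the non-null, unique, and null counts, plus a value-frequency breakdown for selected columns and per-language mean, median, and decile statistics of a score column, writing the result as JSON.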
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# @author loretoparisi at gmail dot com
# Copyright (c) 2020 Loreto Parisi
#
### built-in
import argparse
import json
### dependencies
# parallel pandas - https://github.com/modin-project/modin
#import pandas as pd
import modin.pandas as pd
import numpy as np
### local
from util import NumpyEncoder
# a list of keys from which to generate a breakdown
BREAKDOWN_KEYS = [ 'language' ]
if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument('--path', '-p',
                        action='store',
                        type=str,
                        help='Dataset file path',
                        default='',
                        required=True)
    parser.add_argument('--output', '-o',
                        action='store',
                        type=str,
                        help='JSON file output path',
                        default='',
                        required=False)
    parser.add_argument('--breakdown', '-bd',
                        action='store',
                        type=str,
                        help='Comma-separated list of columns to break down by value frequency',
                        default=','.join(BREAKDOWN_KEYS),
                        required=False)
    args = parser.parse_args()
    stats = {}  # holds df stats
    stats['keys'] = {}

    df = pd.read_csv(args.path, sep='\t')

    # prepare data holder
    for col in df.columns:
        if col not in stats['keys']:
            stats['keys'][col] = {}
    stats['count'] = len(df)
    # calculate per-column stats
    for col in df.columns:
        stats['keys'][col]['count'] = df[col].count()    # non-null values
        stats['keys'][col]['unique'] = df[col].nunique() # distinct values
        stats['keys'][col]['null'] = df[col].isnull().sum()
        # value-frequency breakdown for the selected columns
        if col in args.breakdown.split(','):
            stats['keys'][col]['freq'] = {}
            freq = df[col].value_counts()
            for key, count in freq.items():
                stats['keys'][col]['freq'][key] = count
    # per-language score stats, sorted by value
    if 'lang' in df.columns and 'score' in df.columns:
        sf = df.dropna(subset=['score'])
        mean = sf.groupby('lang')['score'].mean().sort_values(ascending=False)
        median = sf.groupby('lang')['score'].median().sort_values(ascending=False)
        # deciles 0.1 .. 0.9 per language; the result is a Series
        # indexed by (lang, quantile)
        perc = sf.groupby('lang')['score'].quantile(np.linspace(0.1, 0.9, 9))
        stats['keys']['lang']['mean'] = {}
        stats['keys']['lang']['median'] = {}
        stats['keys']['lang']['perc'] = {}
        # percentiles by lang
        for (lang, q), value in perc.items():
            pp = round(q, 1)
            stats['keys']['lang']['perc'].setdefault(lang, {})[pp] = round(value, 3)
        # look up median by label, since mean and median are sorted independently
        for key in mean.index:
            stats['keys']['lang']['mean'][key] = round(mean[key], 3)
            stats['keys']['lang']['median'][key] = round(median[key], 3)
    json_out = json.dumps(stats, indent=4, cls=NumpyEncoder)
    if args.output:
        with open(args.output, "w") as outfile:
            outfile.write(json_out)
    else:
        print(json_out)
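Assuming the script is saved as dataset_stats.py and run against a tab-separated file dataset.tsv (both names are hypothetical, for illustration), invocation looks like:

# print stats to stdout, breaking down the 'language' column
python dataset_stats.py --path dataset.tsv --breakdown language

# write stats to a JSON file instead of stdout
python dataset_stats.py --path dataset.tsv --output stats.json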
Please look here for the NumpyEncoder class imported from util above.
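For reference, a minimal sketch of such an encoder, assuming it only needs to convert NumPy scalars and arrays to native Python types (the actual util module may differ):

import json
import numpy as np

class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy types to native Python types."""
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)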