Last active
September 14, 2021 14:01
-
-
Save shubhamagarwal92/13e2c41d09156c3810740d7697a883d1 to your computer and use it in GitHub Desktop.
Pandas helper functions for analysis
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
def read_json_to_df(file_path):
    """Load a JSON document of records into a DataFrame.

    Args:
        file_path: Path or buffer handed to ``pd.read_json``.

    Returns:
        The DataFrame parsed with ``orient='records'``.
    """
    # For line-delimited JSON (one record per line), ``lines=True`` would
    # also have to be passed to ``pd.read_json``.
    return pd.read_json(file_path, orient="records")
def read_json_list_to_df(json_list):
    """Build a DataFrame from a list of records.

    Args:
        json_list: Records passed to ``pd.DataFrame.from_records``.

    Returns:
        The resulting DataFrame.
    """
    return pd.DataFrame.from_records(json_list)
def count_unique(df, col_name):
    """Return the number of distinct values in ``df[col_name]``.

    The count comes from ``Series.nunique()`` with its default arguments.
    """
    return df[col_name].nunique()
def get_unique_column_values(df, col_name):
    """Return the distinct values of ``df[col_name]`` via ``Series.unique()``."""
    distinct_values = df[col_name].unique()
    return distinct_values
def get_column_stats(df, column_name, to_dict=False):
    """Summarise how often each value occurs in ``df[column_name]``.

    NOTE: this module defines ``get_column_stats`` a second time further
    down; at import time that later definition replaces this one.

    Args:
        df: DataFrame holding the column.
        column_name: Name of the column to summarise.
        to_dict: When true, return a plain ``{value: count}`` dict computed
            with the default ``value_counts()`` call.

    Returns:
        A dict when ``to_dict`` is true; otherwise a DataFrame with a
        ``counts`` column and a ``%`` column, both computed with
        ``dropna=False``.
    """
    column = df[column_name]
    if to_dict:
        return column.value_counts().to_dict()

    occurrences = column.value_counts(dropna=False)
    share = column.value_counts(dropna=False, normalize=True) * 100
    return pd.concat([occurrences, share], axis=1, keys=['counts', '%'])
def get_pandas_percentile(df, col_name='words', percentiles=None):
    """Return descriptive statistics, including deciles, for one column.

    The previous version computed the statistics and threw them away,
    always returning ``None``. The summary is now returned to the caller.

    Args:
        df: DataFrame holding the column.
        col_name: Column to describe. Defaults to ``'words'``, the column
            that used to be hard-coded.
        percentiles: Iterable of quantiles in ``[0, 1]``. Defaults to every
            decile from 0.0 to 1.0 inclusive.

    Returns:
        The Series produced by ``Series.describe`` for the column.

    Raises:
        KeyError: If ``col_name`` is not a column of ``df``.
    """
    if percentiles is None:
        percentiles = (0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1)
    return df[col_name].describe(percentiles=list(percentiles))
def flatten_json_column(df, col_name='utterance'):
    """Expand a column of JSON-like dicts into flat columns.

    Each entry of ``df[col_name]`` is normalised with pandas'
    ``json_normalize``. The resulting columns are joined onto the frame
    and the original column is dropped.

    Args:
        df: Source DataFrame. It is not modified; a copy with a fresh
            0..n-1 index is used so rows line up with the normalised frame.
        col_name: Name of the column holding the dict records.

    Returns:
        A new DataFrame with ``col_name`` replaced by its flattened fields.

    Raises:
        KeyError: If ``col_name`` is not a column of ``df``.
        ValueError: If a flattened field name collides with an existing
            column (raised by ``DataFrame.join``).
    """
    # ``json_normalize`` was referenced as a bare name that this module never
    # imported. Resolve it from pandas, covering both the modern top-level
    # location and the pre-1.0 ``pandas.io.json`` one.
    try:
        normalize = pd.json_normalize
    except AttributeError:
        from pandas.io.json import json_normalize as normalize

    # Work on a re-indexed copy instead of resetting the caller's index in place.
    frame = df.reset_index(drop=True)
    expanded = normalize(frame[col_name].tolist())
    return frame.join(expanded).drop(col_name, axis=1)
def get_column_stats(df, column_name, to_dict=False):
    """Count how often each value occurs in ``df[column_name]``.

    Args:
        df: DataFrame holding the column.
        column_name: Name of the column to count.
        to_dict: When true, convert the counts to a ``{value: count}`` dict.

    Returns:
        The ``value_counts()`` Series, or its dict form when ``to_dict``
        is true.
    """
    frequencies = df[column_name].value_counts()
    return frequencies.to_dict() if to_dict else frequencies
def findFiles(path):
    """Return the paths matching a glob pattern.

    Args:
        path: Pattern understood by ``glob.glob`` (e.g. ``'data/*.json'``).

    Returns:
        A list of matching path strings; empty when nothing matches.
    """
    # ``glob`` was used without ever being imported in this module, so it is
    # imported locally to keep the function self-contained.
    import glob

    return glob.glob(path)
def get_column_names(df):
    """Return the column labels of ``df`` as given by ``df.columns.values``."""
    column_index = df.columns
    return column_index.values
def get_value_row_column(df, index, column_name):
    """Return the single value stored at a row label / column label pair.

    Args:
        df: DataFrame to read from.
        index: Row label.
        column_name: Column label.

    Returns:
        The scalar at ``(index, column_name)``.

    Raises:
        KeyError: If either label is missing.
    """
    # ``DataFrame.get_value`` was deprecated and later removed from pandas;
    # ``.at`` is the label-based scalar accessor that replaces it.
    return df.at[index, column_name]
def flatten_dic_column(df, col_name):
    """Replace a column with the columns obtained by expanding its entries.

    Every entry of ``df[col_name]`` is turned into a ``pd.Series`` and the
    resulting frame is concatenated column-wise after the remaining columns.

    Args:
        df: Source DataFrame.
        col_name: Column whose entries are expanded.

    Returns:
        A new DataFrame without ``col_name`` and with the expanded columns
        appended.
    """
    remaining = df.drop([col_name], axis=1)
    expanded = df[col_name].apply(pd.Series)
    return pd.concat([remaining, expanded], axis=1)
def append_df(df, df_to_append, ignore_index=True):
    """Return a new DataFrame with the given rows appended to ``df``.

    ``DataFrame.append`` no longer exists in current pandas, so the rows are
    combined with ``pd.concat``. The input kinds that ``append`` accepted
    are still supported.

    Args:
        df: Base DataFrame; it is not modified.
        df_to_append: A DataFrame, a Series or dict (treated as one row), or
            a list of DataFrames / row records.
        ignore_index: When true, the result gets a fresh 0..n-1 index.

    Returns:
        The concatenated DataFrame.
    """
    if isinstance(df_to_append, pd.Series):
        # A Series is a single row: turn it sideways so its labels line up
        # with the columns, then restore sensible dtypes.
        extra = [df_to_append.to_frame().T.infer_objects()]
    elif isinstance(df_to_append, dict):
        extra = [pd.DataFrame([df_to_append])]
    elif isinstance(df_to_append, (list, tuple)):
        items = list(df_to_append)
        if all(isinstance(item, pd.DataFrame) for item in items):
            extra = items
        else:
            # A list of row records (dicts / Series) becomes one frame.
            extra = [pd.DataFrame(items)]
    else:
        extra = [df_to_append]
    return pd.concat([df, *extra], ignore_index=ignore_index)
def write_df_to_csv(df, outputFilePath):
    """Write ``df`` as tab-separated text with a header row and no index.

    Args:
        df: DataFrame to serialise.
        outputFilePath: Destination passed straight to ``DataFrame.to_csv``.
    """
    csv_options = {
        "sep": "\t",
        "quotechar": '"',
        "index": False,
        "header": True,
    }
    df.to_csv(outputFilePath, **csv_options)
def write_df_to_json(df, outputFilePath):
    """Write ``df`` as line-delimited JSON, one record per line.

    Args:
        df: DataFrame to serialise.
        outputFilePath: Destination passed straight to ``DataFrame.to_json``.
    """
    df.to_json(outputFilePath, orient="records", lines=True)
def save_df_pickle(df, output_file):
    """Pickle ``df`` to ``output_file`` using ``DataFrame.to_pickle``.

    Args:
        df: DataFrame to serialise.
        output_file: Destination handed to ``to_pickle``.
    """
    df.to_pickle(path=output_file)
def get_unique_column_values(df, col_name):
    """Return the distinct values found in column ``col_name`` of ``df``.

    The result is whatever ``Series.unique()`` yields for that column.
    """
    column = df[col_name]
    return column.unique()
def count_unique(df, col_name):
    """Count the distinct values in column ``col_name`` of ``df``.

    Uses ``Series.nunique()`` with its default arguments.
    """
    column = df[col_name]
    return column.nunique()
def print_analysis(_list, key="relevance", _type="relevance"):
    """Print a three-part summary of a list of responses.

    The list is loaded into a single-column DataFrame whose column is named
    ``key``. Three sections are then printed: the number of unique values,
    the per-value statistics from ``get_column_stats``, and the output of
    ``Series.describe()``.

    Args:
        _list: Values to analyse; must form exactly one column.
        key: Column name given to the values.
        _type: Label inserted into the printed section headings.
    """
    separator = "\n-----------------------------------"

    frame = pd.DataFrame(_list)
    frame.columns = [key]

    print(separator)
    print("Total unique {} responses".format(_type))
    unique_total = count_unique(frame, key)
    print(unique_total)

    print(separator)
    print("Stats for {} responses".format(_type))
    value_stats = get_column_stats(frame, key)
    print(value_stats)

    print(separator)
    print("Number of {} responses".format(_type))
    summary = frame[key].describe()
    print(summary)
# Binarize: split a list of values into columns of a DataFrame
# https://datascience.stackexchange.com/questions/11797/split-a-list-of-values-into-columns-of-a-dataframe
# Fill NaN values in a DataFrame
# https://datascience.stackexchange.com/questions/15924/how-can-i-fill-nan-values-in-a-pandas-data-frame?rq=1
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment