Skip to content

Instantly share code, notes, and snippets.

@Steboss89
Created April 13, 2022 10:40
Show Gist options
  • Save Steboss89/0ea6ad962e0c53d54a7d5d0427112149 to your computer and use it in GitHub Desktop.
Save Steboss89/0ea6ad962e0c53d54a7d5d0427112149 to your computer and use it in GitHub Desktop.
Relate each word's occurrence to the harmonic mean (hmean) of the normal CDF (normcdf) of its total rate and its class rate
def normcdf(x):
    """Return the normal CDF of each value in *x*, fitted to *x* itself.

    The normal distribution is parameterised with ``x.mean()`` as its
    location and ``x.std()`` as its scale, so each result is the position
    of that value within the distribution of *x*.

    *x* must expose ``.mean()`` and ``.std()`` (the rest of this script
    passes DataFrame columns).
    """
    # NOTE(review): a constant input has zero standard deviation, which is
    # not a valid scale for the normal distribution -- confirm callers never
    # pass one.
    return norm.cdf(x, x.mean(), x.std())
def _positive_hmean(row, rate_col, pct_col):
    """Return the harmonic mean of two cells of *row*, or 0 unless both are > 0."""
    rate, pct = row[rate_col], row[pct_col]
    if rate > 0 and pct > 0:
        return hmean([rate, pct])
    return 0


# The same six score columns are derived for each class: count column 0 gets
# the 'old' prefix (old-testament counts) and count column 1 the 'new' prefix.
for _count_col, _label in ((0, 'old'), (1, 'new')):
    _rate_col = _label + '_rate'
    _pct_col = _label + '_freq_pct'
    _rate_cdf_col = _rate_col + '_normcdf'
    _pct_cdf_col = _pct_col + '_normcdf'

    _counts = term_freq_df[_count_col]

    # Rate of the word: its count in this class divided by its total count.
    term_freq_df.loc[:, _rate_col] = _counts * 1. / term_freq_df['total']
    # Frequency within the class: its count divided by the class's summed counts.
    term_freq_df.loc[:, _pct_col] = _counts * 1. / _counts.sum()

    # Combine both ratios row by row with the harmonic mean, which favours
    # words that are both frequent in and specific to the class.
    term_freq_df.loc[:, _label + '_hmean'] = term_freq_df.apply(
        _positive_hmean, axis=1, args=(_rate_col, _pct_col)
    )

    # Where each ratio lies, cumulatively, within its own column's distribution.
    term_freq_df.loc[:, _rate_cdf_col] = normcdf(term_freq_df[_rate_col])
    term_freq_df.loc[:, _pct_cdf_col] = normcdf(term_freq_df[_pct_col])

    # Harmonic mean of the two cumulative scores, taken element-wise across
    # the pair of columns, to single out the truly distinctive words.
    term_freq_df.loc[:, _label + '_normcdf_hmean'] = hmean(
        [term_freq_df[_rate_cdf_col], term_freq_df[_pct_cdf_col]]
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment