Created
April 13, 2022 10:40
-
-
Save Steboss89/0ea6ad962e0c53d54a7d5d0427112149 to your computer and use it in GitHub Desktop.
Score each word's occurrence by the harmonic mean (hmean) of the normal CDF (normcdf) of its total rate and its class rate
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def normcdf(x):
    """Return the normal CDF of each value in ``x`` under a normal fitted to ``x``.

    The distribution is parameterised by the sample's own mean and standard
    deviation, so each result says where that value lies, cumulatively,
    within the distribution of ``x``.

    Args:
        x: array-like exposing ``.mean()`` and ``.std()``; this file calls it
            with pandas Series columns.

    Returns:
        Whatever ``norm.cdf`` returns for ``x``: one CDF value per element.
    """
    return norm.cdf(x, x.mean(), x.std())
def _add_class_scores(df, label, prefix):
    """Add the six scoring columns for one class to ``df`` in place.

    Columns written (in this order), each named ``prefix + suffix``:
    ``_rate``, ``_freq_pct``, ``_hmean``, ``_rate_normcdf``,
    ``_freq_pct_normcdf`` and ``_normcdf_hmean``.

    Args:
        df: term-frequency frame with a ``'total'`` column and a per-class
            count column named ``label``.
        label: name of the column holding this class's word counts.
        prefix: string prepended to every column this helper creates.
    """
    counts = df[label]

    # Rate of the word: occurrences in this class / occurrences overall.
    rate = counts * 1. / df['total']
    # Rate at which the word appears within the class:
    # occurrences in this class / all word occurrences in this class.
    freq_pct = counts * 1. / counts.sum()
    df[prefix + '_rate'] = rate
    df[prefix + '_freq_pct'] = freq_pct

    # Combine the total rate and the class rate with the harmonic mean, to
    # favour words that are both frequent in and specific to the class.
    # The mean is only kept where both rates are strictly positive; every
    # other row (including the inf/NaN produced by dividing by a zero rate)
    # is set to 0.
    both_positive = (rate > 0) & (freq_pct > 0)
    harmonic = 2. / (1. / rate + 1. / freq_pct)
    df[prefix + '_hmean'] = harmonic.where(both_positive, 0.)

    # Where each rate lies, cumulatively, in its own distribution.
    rate_cdf = normcdf(rate)
    freq_pct_cdf = normcdf(freq_pct)
    df[prefix + '_rate_normcdf'] = rate_cdf
    df[prefix + '_freq_pct_normcdf'] = freq_pct_cdf

    # Harmonic mean of the two cumulative positions, element-wise per word,
    # to surface the words that really differ between classes.
    df[prefix + '_normcdf_hmean'] = hmean([rate_cdf, freq_pct_cdf])


# Column 0 holds the old-testament counts; column 1 is presumably the
# new-testament counts (verify against where term_freq_df is built).
_add_class_scores(term_freq_df, 0, 'old')
_add_class_scores(term_freq_df, 1, 'new')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment