Last active
May 25, 2020 23:22
-
-
Save Olshansk/3f2acd5715725029679d0d9d0430900e to your computer and use it in GitHub Desktop.
Joint Probability Matrices - Create Transition Matrix Function
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| def create_joint_probability_matrix(data_GT, data_P, bins): | |
| # https://stackoverflow.com/questions/38931566 | |
| def background_gradient(s, m=None, M=None, cmap='Reds', low=0, high=0): | |
| if m is None: | |
| m = s.min().min() | |
| if M is None: | |
| M = s.max().max() | |
| rng = M - m | |
| norm = colors.Normalize(m - (rng * low), M + (rng * high)) | |
| normed = s.apply(lambda x: norm(x.values)) | |
| cm = plt.cm.get_cmap(cmap) | |
| c = normed.applymap(lambda x: colors.rgb2hex(cm(x))) | |
| ret = c.applymap(lambda x: 'background-color: %s' % x) | |
| return ret | |
| assert(len(data_GT) == len(data_P)) | |
| bucket_GT = pd.cut(data_GT, bins=bins, include_lowest=True, right=True) | |
| bucket_P = pd.cut(data_P, bins=bins, include_lowest=True, right=True) | |
| # Generate a pandas dataframe where the index represents the student number | |
| df_GT = pd.DataFrame({'bucket': bucket_GT}).reset_index() | |
| df_P = pd.DataFrame({'bucket': bucket_P}).reset_index() | |
| # Merged the actual predicted grades | |
| merged_df = pd.merge(df_GT, df_P, on=['index'], suffixes=('_grouth_truth', '_predicted')) | |
| # Create a multi-leveled | |
| merged_df = merged_df.groupby(['bucket_grouth_truth', 'bucket_predicted']).count() | |
| # https://stackoverflow.com/a/43921476/768439 | |
| # Convert multi-leveled pandas index into a 2d numpy array | |
| m, n = len(merged_df.index.levels[0]), len(merged_df.index.levels[1]) | |
| jp_matrix = merged_df.values.reshape(m, n) | |
| # Convert counts to percentages | |
| bin_size = (bins[1] - bins[0]) | |
| total = len(data_GT) # equal to len(data_P) | |
| axis = np.linspace(bin_size/2, bins[-1] + bin_size/2, len(bins))[:-1] | |
| jp_df = pd.DataFrame(jp_matrix, columns=axis, index=axis) | |
| jp_df = jp_df.applymap(lambda val: 0 if math.isnan(val) else (val / len(data_GT))) | |
| # Format the output table | |
| jp_df.columns.name = 'Predicted' | |
| jp_df.index.name = 'Ground Truth' | |
| formatted_df = jp_df.style.set_caption("Probability Distribution Matrix").apply(background_gradient, high=1, axis=None).format("{:.2f}") | |
| return formatted_df |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment