AruniRC · June 19, 2020 17:19
diff --git a/histogram_spec_map.py b/histogram_spec_map.py
 import os
 import sys
 import pickle
 import json
 import numpy as np
 import sys
 import matplotlib 
 matplotlib.use('Agg')
 from matplotlib import pyplot as plt
 import os.path as osp

 DEBUG = True

 def bin_scores(scores, score_bins):
    # bins[i-1] <= x < bins[i]
    scores_quantized = np.digitize(scores, score_bins) # bin indices
    return score_bins[scores_quantized] # bin values


 def match_histograms(source, template):
    """
    Histogram of source data matches that of template data

    Arguments:
    -----------
        source: np.ndarray (1-D) after bin quantization
        template: np.ndarray (1-D) after bin quantization

    Returns:
    -----------
        matched: np.ndarray
            The transformed source data
    """

    # get unique values, indices and counts
    s_val, bin_idx, s_counts = np.unique(source, return_inverse=True, 
                                         return_counts=True)
    t_val, t_counts = np.unique(template, return_counts=True)

    # calculate empirical CDFs
    s_cdf = np.cumsum(s_counts).astype(np.float64)
    s_cdf /= s_cdf[-1]
    t_cdf = np.cumsum(t_counts).astype(np.float64)
    t_cdf /= t_cdf[-1]

    # mapping: values in template's CDF closest to source's CDF
    interp_t_val = np.interp(s_cdf, t_cdf, t_val)

    # modify source values to match closest template CDF
    source_matched = interp_t_val[bin_idx]

    if DEBUG:
        # plt.ylim([0,0.25])
        sm_val, sm_counts = np.unique(source_matched, return_counts=True)
        sm_cdf = np.cumsum(sm_counts).astype(np.float64)
        sm_cdf /= sm_cdf[-1]

        fig_path =  osp.join('hist_src_target-DEBUG.png')
        plt.plot(s_val, s_cdf, label='CDF source', alpha=0.8)
        plt.plot(t_val, t_cdf, label='CDF target', alpha=0.8)
        plt.plot(sm_val, sm_cdf, label='CDF source-matched', alpha=0.8)
        plt.title('Score histograms')
        plt.grid()
        plt.legend()
        out_dir = os.path.dirname(fig_path)
        if not osp.exists(out_dir):
            os.makedirs(out_dir, exist_ok=True)
        plt.savefig(fig_path, bbox_inches='tight')
        print('Score histogram saved at: %s' % fig_path)
        plt.close()

        fig_path =  osp.join('hist_map-DEBUG.png')
        plt.plot(source, source_matched, 'bo', label='Map source')
        plt.title('Score mapping')
        plt.xlabel('Source scores')
        plt.ylabel('Mapped scores')
        plt.grid()
        plt.legend()
        out_dir = os.path.dirname(fig_path)
        if not osp.exists(out_dir):
            os.makedirs(out_dir, exist_ok=True)
        plt.savefig(fig_path, bbox_inches='tight')
        print('Score mapping saved at: %s' % fig_path)
        plt.close()
    return source_matched
  
  


 if __name__ == '__main__': 
    # TESTING
    # create some dummy data
    data_unif = np.random.randint(0, 10, 1000)
    data_normal = np.random.normal(5, 5, 5000)
    data_normal = np.abs(data_normal.astype(np.int64))
    unif_to_normal = match_histograms(data_unif, data_normal)
	import os
	import sys
	import pickle
	import json
	import numpy as np
	import sys
	import matplotlib
	matplotlib.use('Agg')
	from matplotlib import pyplot as plt
	import os.path as osp

	DEBUG = True

	def bin_scores(scores, score_bins):
	# bins[i-1] <= x < bins[i]
	scores_quantized = np.digitize(scores, score_bins) # bin indices
	return score_bins[scores_quantized] # bin values


	def match_histograms(source, template):
	"""
	Histogram of source data matches that of template data

	Arguments:
	-----------
	source: np.ndarray (1-D) after bin quantization
	template: np.ndarray (1-D) after bin quantization

	Returns:
	-----------
	matched: np.ndarray
	The transformed source data
	"""

	# get unique values, indices and counts
	s_val, bin_idx, s_counts = np.unique(source, return_inverse=True,
	return_counts=True)
	t_val, t_counts = np.unique(template, return_counts=True)

	# calculate empirical CDFs
	s_cdf = np.cumsum(s_counts).astype(np.float64)
	s_cdf /= s_cdf[-1]
	t_cdf = np.cumsum(t_counts).astype(np.float64)
	t_cdf /= t_cdf[-1]

	# mapping: values in template's CDF closest to source's CDF
	interp_t_val = np.interp(s_cdf, t_cdf, t_val)

	# modify source values to match closest template CDF
	source_matched = interp_t_val[bin_idx]

	if DEBUG:
	# plt.ylim([0,0.25])
	sm_val, sm_counts = np.unique(source_matched, return_counts=True)
	sm_cdf = np.cumsum(sm_counts).astype(np.float64)
	sm_cdf /= sm_cdf[-1]

	fig_path = osp.join('hist_src_target-DEBUG.png')
	plt.plot(s_val, s_cdf, label='CDF source', alpha=0.8)
	plt.plot(t_val, t_cdf, label='CDF target', alpha=0.8)
	plt.plot(sm_val, sm_cdf, label='CDF source-matched', alpha=0.8)
	plt.title('Score histograms')
	plt.grid()
	plt.legend()
	out_dir = os.path.dirname(fig_path)
	if not osp.exists(out_dir):
	os.makedirs(out_dir, exist_ok=True)
	plt.savefig(fig_path, bbox_inches='tight')
	print('Score histogram saved at: %s' % fig_path)
	plt.close()

	fig_path = osp.join('hist_map-DEBUG.png')
	plt.plot(source, source_matched, 'bo', label='Map source')
	plt.title('Score mapping')
	plt.xlabel('Source scores')
	plt.ylabel('Mapped scores')
	plt.grid()
	plt.legend()
	out_dir = os.path.dirname(fig_path)
	if not osp.exists(out_dir):
	os.makedirs(out_dir, exist_ok=True)
	plt.savefig(fig_path, bbox_inches='tight')
	print('Score mapping saved at: %s' % fig_path)
	plt.close()
	return source_matched




	if __name__ == '__main__':
	# TESTING
	# create some dummy data
	data_unif = np.random.randint(0, 10, 1000)
	data_normal = np.random.normal(5, 5, 5000)
	data_normal = np.abs(data_normal.astype(np.int64))
	unif_to_normal = match_histograms(data_unif, data_normal)