Created
November 3, 2017 21:13
-
-
Save ljwolf/d6c624aa5694635522780470f8530db6 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import scipy.stats as st | |
import copy | |
import numpy as np | |
import pysal as ps | |
def _estimate_kdes(y,z): | |
""" | |
estimate a kernel desnity for each group in z (classes) from y (data) | |
""" | |
z_unique = np.unique(z).tolist() | |
distributions = [] | |
for i,zval in enumerate(z_unique): | |
distributions.append(st.gaussian_kde(y[z==zval])) | |
return distributions | |
def forward_predict(y, z, P, local=True, interp=True, diff=True):
    """
    Predict a Markov chain forward one step using sub-distribution resampling.

    y : np.ndarray
        a flat (n,) array containing the observed data in the time period to predict
        (when diff=True, an (n,2) array of (previous, current) observations).
    z : np.ndarray
        a flat (n,) array containing the classification of the data in the final time period
    P : np.ndarray
        a transition matrix estimated from the (y,z) chain.
    local : bool
        whether or not to divide the innovation distributions by z.
    interp : bool
        whether to sample from a kernel density estimate (True) or the empirical
        distribution of innovations (False).
    diff : bool
        whether to predict forward using the distribution of the difference chain
        or the direct chain.

    Returns (y_new, z_new, resample): predicted values, predicted classes, and
    the RejectionSampler used to draw them.

    NOTE: Innovations refer to the stochastic component added to y to move forward
    in time. Innovations can either be from the differences between the current
    time period and the previous time period, or a resampling of the current
    time period.
    """
    P = np.asarray(P)
    z_unique = np.unique(z).tolist()
    z_new = []
    y_new = []
    # BUGFIX: the flags were previously hard-coded as
    # local=True, interp=True, diff=False here, silently ignoring the
    # caller's local/interp/diff arguments.
    resample = _dispatch_experiment(y, z, local=local, interp=interp, diff=diff)
    for yi, zi in zip(y, z):
        # Draw the next class from the transition row of the current class.
        zidx = z_unique.index(zi)
        draw = st.multinomial.rvs(1, P[zidx])
        zidx_new = draw.argmax()
        z_new.append(z_unique[zidx_new])
        if diff:
            # BUGFIX: difference-sampling needs the current observation so the
            # drawn difference can be added to it; this was never passed before.
            y_new.append(resample(zidx_new, this_obs=yi))
        else:
            y_new.append(resample(zidx_new))
    return np.vstack(y_new).squeeze(), np.vstack(z_new).squeeze(), resample
def _dispatch_experiment(data, classes, local=True, interp=True, diff=True):
    """
    Build the RejectionSampler that draws innovations for forward prediction.

    data : np.ndarray
        observed data; when diff=True, an (n,2) array of (previous, current).
    classes : np.ndarray
        class labels aligned with data.
    local, interp, diff : bool
        forwarded to RejectionSampler (see its docstring).
    """
    if diff:
        assert data.shape[1] == 2  # should be 2 columns with N rows
        # BUGFIX: `difference` was an undefined name (NameError when diff=True);
        # the difference chain is the per-row change between the two periods.
        to_resample = data[:, 1] - data[:, 0]
    else:
        to_resample = data
    # BUGFIX: `local` was hard-coded to True, ignoring the caller's argument.
    return RejectionSampler(to_resample, classes,
                            local=local, interp=interp, diff=diff)
class RejectionSampler(object):
    """
    Draw innovations by rejection sampling: candidates are drawn from a
    class-conditional (or pooled) distribution until one falls into the
    requested target class, as judged by a quantile classifier fit on `data`.

    data : np.ndarray
        the values (or differences, when diff=True) to resample from.
    classes : np.ndarray
        class labels aligned with data; assumed to be the 0..k-1 bin indices
        produced by the quantile classification (find_bin results are compared
        to these labels directly).
    local : bool
        if True, keep one distribution per class; otherwise pool all data
        into a single distribution.
    interp : bool
        if True, sample from a Gaussian KDE; otherwise sample from the
        empirical values without replacement.
    diff : bool
        if True, candidates are differences and get added to `this_obs`
        before classification.
    """

    def __init__(self, data, classes, local=True, interp=True, diff=False):
        self.data = data
        # Quantile bins used to decide which class a candidate falls into.
        # NOTE(review): when diff=True, `data` holds differences but candidates
        # are levels (difference + this_obs), so the classifier may be fit on a
        # different scale than what find_bin receives -- confirm this is intended.
        self.classifier = ps.Quantiles(data, k=len(np.unique(classes)))
        self.is_local = local
        self.is_interp = interp
        self.is_diff = diff
        if self.is_local:
            self.unique_classes = np.unique(classes)
            self.classes = classes
        else:
            # Global mode: a single pooled pseudo-class covering everything.
            self.unique_classes = [1]
            self.classes = np.ones(data.shape)
        if self.is_interp:
            create_dist = st.gaussian_kde
        else:
            create_dist = lambda x: x
        # BUGFIX: this previously indexed with the raw `classes` argument, so in
        # global mode (where self.classes is all ones) only the observations
        # originally labelled 1 were pooled. Using self.classes pools all data
        # when is_local is False, and is identical to before when is_local is True.
        self.distributions = [create_dist(self.data[self.classes == cls])
                              for cls in self.unique_classes]

    def __call__(self, next_class, this_obs=None):
        """
        Draw one innovation that classifies into `next_class`.

        next_class : the target class of the draw.
        this_obs : the current observation; required when diff=True so the
                   drawn difference can be added to it.
        """
        while True:
            if self.is_local:
                next_ix = self.unique_classes.tolist().index(next_class)
            else:
                next_ix = 0
            if self.is_interp:
                # KDE mode: draws are cheap and non-destructive.
                candidate = self.distributions[next_ix].resample(1)
            else:
                # Empirical mode: draw without replacement. The draw is removed
                # up front; it is restored below only if rejected, so accepted
                # draws permanently leave the pool.
                candidate_set = copy.deepcopy(self.distributions[next_ix])
                starting_length = len(candidate_set)
                candidate_ix = np.random.randint(len(candidate_set))
                candidate = candidate_set[candidate_ix]
                self.distributions[next_ix] = np.delete(candidate_set, candidate_ix)
            if self.is_diff:
                assert (this_obs is not None), "If doing difference-sampling, the current observation must be provided."
                candidate = candidate + this_obs
            if self.classifier.find_bin(candidate) == next_class:
                return candidate
            if not self.is_interp:
                # Rejected: restore the pool to its pre-draw state.
                self.distributions[next_ix] = candidate_set
                assert starting_length == len(self.distributions[next_ix]), 'Your reset did not succeed!'
if __name__ == '__main__':
    import pandas as pd
    import pysal as ps
    import matplotlib.pyplot as plt
    import seaborn.apionly as sns

    # US state per-capita incomes over time (rows: states, columns: years).
    usjoin = pd.read_csv(ps.examples.get_path('usjoin.csv'))
    y = usjoin.iloc[:, 2:].values
    pci = y.T
    # Income relative to the cross-sectional mean for each period.
    rpci = (pci.T / pci.mean(axis=1))
    # Quantile class labels for each row, then a Markov transition matrix.
    qpci = np.vstack([ps.Quantiles(year).yb for year in rpci])
    mkv = ps.Markov(qpci)
    # NOTE(review): -2 takes the second-to-last period, not the last -- confirm
    # this is intended (predicting the held-out final period?).
    ylast = rpci[:, -2]
    qlast = qpci[:, -2]

    # One forward prediction per configuration: {local, global} x {kde, empirical}.
    y_local_kde, z_local_kde, rs_local_kde = forward_predict(ylast, qlast, mkv.p)
    y_global_kde, z_global_kde, rs_global_kde = forward_predict(ylast, qlast, mkv.p,
                                                                local=False)
    y_local_emp, z_local_emp, rs_local_emp = forward_predict(ylast, qlast, mkv.p,
                                                             interp=False)
    y_global_emp, z_global_emp, rs_global_emp = forward_predict(ylast, qlast, mkv.p,
                                                                local=False, interp=False)
    y_local_kde_diff, z_local_kde_diff, rs_local_kde_diff = forward_predict(ylast, qlast, mkv.p,
                                                                            diff=True)
    # BUGFIX: this call previously rebound z_local_kde_diff / rs_local_kde_diff
    # (clobbering the local-diff results above) and passed interp=False despite
    # the "_kde_" naming; the target names and flags now match the other variants.
    y_global_kde_diff, z_global_kde_diff, rs_global_kde_diff = forward_predict(ylast, qlast, mkv.p,
                                                                               local=False,
                                                                               diff=True)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment