@rnowling
Created December 16, 2015 01:45
Imputing Missing Data and Random Forest Variable Importance Scores
from collections import defaultdict
import random
import sys
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import mstats

N_SAMPLES = 100   # individuals per simulated data set
N_TREES = 100     # trees per random forest
N_SIMS = 100      # simulations per missing-data scenario
# heterozygote probability for each of the label-correlated SNPs
HETERO_PROB = [0.0, 0.0, 0.0, 0.25, 0.25, 0.25, 0.50, 0.50, 0.50]
N_UNCORRELATED = 3   # extra SNPs with no association to the class label
# number of individuals with a missing genotype; cycled across SNPs
MISSING_DATA = [0, 10, 20]

def generate_data(n_samples, hetero_probability, n_uncorrelated):
    # assume SNPs are biallelic, features are counts of each allele
    # for each of the 3 genotypes (A/A, A/T, T/T), hetero prob is given
    # and the homo are (1 - hetero) / 2
    n_snps = len(hetero_probability) + n_uncorrelated
    n_features = 2 * n_snps
    features = np.zeros((n_samples, n_features))
    labels = np.zeros(n_samples)
    for i in xrange(n_samples):
        labels[i] = random.randint(0, 1)
        feature_idx = 0
        # uncorrelated SNPs: genotype drawn uniformly, independent of the label
        for j in xrange(n_uncorrelated):
            allele = random.randint(0, 2)
            if allele == 0:
                features[i, feature_idx] = 2
            elif allele == 1:
                features[i, feature_idx] = 1
                features[i, feature_idx + 1] = 1
            else:
                features[i, feature_idx + 1] = 2
            feature_idx += 2
        # correlated SNPs: heterozygous with probability p, otherwise
        # homozygous for the allele matching the individual's label
        for p in hetero_probability:
            r = random.random()
            if r < p:
                features[i, feature_idx] = 1
                features[i, feature_idx + 1] = 1
            else:
                features[i, feature_idx + int(labels[i])] = 2
            feature_idx += 2
    return features, labels
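
# For illustration, with the defaults above: generate_data(N_SAMPLES,
# HETERO_PROB, N_UNCORRELATED) returns a 100 x 24 feature matrix -- two
# allele-count columns per SNP, the uncorrelated SNPs in columns 0-5 and the
# label-correlated SNPs in columns 6-23 -- and a length-100 vector of 0/1
# class labels.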

def zero_out(X, missing_data, labels, impute_mode):
    # map each class label to the indices of the individuals in that class
    reverse_labels = defaultdict(list)
    for i, label in enumerate(labels):
        reverse_labels[label].append(i)
    for snp_idx in xrange(X.shape[1] // 2):
        # pick the individuals whose genotype for this SNP is "missing"
        miss_individuals = random.sample(range(X.shape[0]), missing_data[snp_idx % len(missing_data)])
        if impute_mode == "zero":
            # zero out both allele-count columns
            for individual_idx in miss_individuals:
                X[individual_idx, snp_idx * 2] = 0
                X[individual_idx, snp_idx * 2 + 1] = 0
        elif impute_mode == "mode":
            print "Imputing via mode"
            # most common genotype among the observed individuals of each class
            modes = dict()
            for label, individuals in reverse_labels.iteritems():
                alleles = []
                for individual_idx in individuals:
                    if individual_idx in miss_individuals:
                        continue
                    if X[individual_idx, snp_idx * 2] == 2:
                        alleles.append(2)
                    elif X[individual_idx, snp_idx * 2] == 1:
                        alleles.append(1)
                    elif X[individual_idx, snp_idx * 2 + 1] == 2:
                        alleles.append(0)
                modes[label] = int(mstats.mode(alleles)[0])
            print modes
            for individual_idx in miss_individuals:
                label = labels[individual_idx]
                mode = modes[label]
                if mode == 2:
                    print X[individual_idx, snp_idx * 2], X[individual_idx, snp_idx * 2 + 1]
                    print 2, 0
                    X[individual_idx, snp_idx * 2] = 2.0
                    X[individual_idx, snp_idx * 2 + 1] = 0.0
                elif mode == 1:
                    print X[individual_idx, snp_idx * 2], X[individual_idx, snp_idx * 2 + 1]
                    print 1, 1
                    X[individual_idx, snp_idx * 2] = 1.0
                    X[individual_idx, snp_idx * 2 + 1] = 1.0
                else:
                    print X[individual_idx, snp_idx * 2], X[individual_idx, snp_idx * 2 + 1]
                    print 0, 2
                    X[individual_idx, snp_idx * 2] = 0.0
                    X[individual_idx, snp_idx * 2 + 1] = 2.0
        elif impute_mode == "sample":
            print "Imputing via sampling"
            # genotypes observed in each class, used as an empirical distribution
            alleles = defaultdict(list)
            for label, individuals in reverse_labels.iteritems():
                for individual_idx in individuals:
                    if individual_idx in miss_individuals:
                        continue
                    if X[individual_idx, snp_idx * 2] == 2:
                        alleles[label].append(2)
                    elif X[individual_idx, snp_idx * 2] == 1:
                        alleles[label].append(1)
                    elif X[individual_idx, snp_idx * 2 + 1] == 2:
                        alleles[label].append(0)
            for individual_idx in miss_individuals:
                label = labels[individual_idx]
                allele = random.choice(alleles[label])
                if allele == 2:
                    print X[individual_idx, snp_idx * 2], X[individual_idx, snp_idx * 2 + 1]
                    print 2, 0
                    X[individual_idx, snp_idx * 2] = 2.0
                    X[individual_idx, snp_idx * 2 + 1] = 0.0
                elif allele == 1:
                    print X[individual_idx, snp_idx * 2], X[individual_idx, snp_idx * 2 + 1]
                    print 1, 1
                    X[individual_idx, snp_idx * 2] = 1.0
                    X[individual_idx, snp_idx * 2 + 1] = 1.0
                else:
                    print X[individual_idx, snp_idx * 2], X[individual_idx, snp_idx * 2 + 1]
                    print 0, 2
                    X[individual_idx, snp_idx * 2] = 0.0
                    X[individual_idx, snp_idx * 2 + 1] = 2.0
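
# Genotype encoding shared by both functions above: the two columns for a SNP
# hold the counts of its two alleles, so the three possible genotypes are
# written as (2, 0), (1, 1), and (0, 2).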

def plot_variable_importances(flname, variable_importances, title):
    plt.clf()
    plt.boxplot(x=variable_importances)
    plt.xlabel("Variable", fontsize=16)
    plt.ylabel("Gini Importance", fontsize=16)
    plt.title(title, fontsize=18)
    # sklearn's Gini importances sum to 1 across features, so [0, 1] bounds the plot
    plt.ylim([0.0, 1.0])
    plt.savefig(flname, dpi=200)

if __name__ == "__main__":
    # burn in
    for i in xrange(100):
        random.random()

    n_snps = len(HETERO_PROB) + N_UNCORRELATED
    n_features = 2 * n_snps

    # scenario 1: no missing data
    variable_importances = [[] for i in xrange(n_features)]
    for i in xrange(N_SIMS):
        print "Round", i
        X, y = generate_data(N_SAMPLES, HETERO_PROB, N_UNCORRELATED)
        rf = RandomForestClassifier(n_estimators=N_TREES)
        rf.fit(X, y)
        feature_importances = rf.feature_importances_
        for j in xrange(len(feature_importances)):
            variable_importances[j].append(feature_importances[j])
    plot_variable_importances("figures/rf_missing_data_none.png", variable_importances, "No Missing Data")

    # scenario 2: missing genotypes zeroed out
    variable_importances = [[] for i in xrange(n_features)]
    for i in xrange(N_SIMS):
        print "Round", i
        X, y = generate_data(N_SAMPLES, HETERO_PROB, N_UNCORRELATED)
        zero_out(X, MISSING_DATA, y, "zero")
        rf = RandomForestClassifier(n_estimators=N_TREES)
        rf.fit(X, y)
        feature_importances = rf.feature_importances_
        for j in xrange(len(feature_importances)):
            variable_importances[j].append(feature_importances[j])
    plot_variable_importances("figures/rf_missing_data.png", variable_importances, "Missing Data")

    # scenario 3: missing genotypes imputed with the per-class mode
    variable_importances = [[] for i in xrange(n_features)]
    for i in xrange(N_SIMS):
        print "Round", i
        X, y = generate_data(N_SAMPLES, HETERO_PROB, N_UNCORRELATED)
        zero_out(X, MISSING_DATA, y, "mode")
        rf = RandomForestClassifier(n_estimators=N_TREES)
        rf.fit(X, y)
        feature_importances = rf.feature_importances_
        for j in xrange(len(feature_importances)):
            variable_importances[j].append(feature_importances[j])
    plot_variable_importances("figures/rf_missing_data_imputed_mode.png", variable_importances, "Missing Data Imputed with Mode")

    # scenario 4: missing genotypes imputed by sampling from the per-class distribution
    variable_importances = [[] for i in xrange(n_features)]
    for i in xrange(N_SIMS):
        print "Round", i
        X, y = generate_data(N_SAMPLES, HETERO_PROB, N_UNCORRELATED)
        zero_out(X, MISSING_DATA, y, "sample")
        rf = RandomForestClassifier(n_estimators=N_TREES)
        rf.fit(X, y)
        feature_importances = rf.feature_importances_
        for j in xrange(len(feature_importances)):
            variable_importances[j].append(feature_importances[j])
    plot_variable_importances("figures/rf_missing_data_imputed_sampling.png", variable_importances, "Missing Data Imputed with Sampling")