Created
December 19, 2018 09:27
-
-
Save catalystfrank/58ca4fc7e123cb146560f9c738b1fb4b to your computer and use it in GitHub Desktop.
protein fold
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np
import pandas as pd
import numpy.random as nr

# Config Fold Num
Nfold = 5


def add_label_columns(frame, n_classes=28, target_col='Target'):
    """Add one 0/1 indicator column per class label to ``frame`` in place.

    ``frame[target_col]`` must hold space-separated class ids (for example
    ``"0 5 27"``).  For every class id ``k`` in ``range(n_classes)`` a column
    named ``str(k)`` is created, holding 1 where ``k`` is one of the row's
    labels and 0 otherwise.

    Returns the list of created column names, in class order.
    """
    label_cols = [str(label) for label in range(n_classes)]
    # Split each target once; whole-token matching keeps "1" from
    # matching inside "10".
    tokens = frame[target_col].map(lambda cell: set(cell.split(' ')))
    for name in label_cols:
        # ``name=name`` binds the current column name into the lambda.
        frame[name] = tokens.map(lambda present, name=name: int(name in present))
    return label_cols


def fold_quota(total, n_fold):
    """Split ``total`` samples into ``n_fold`` integer quotas.

    Every fold gets ``total // n_fold``; the ``total % n_fold`` leftover
    samples go to that many distinct, randomly chosen folds.  The returned
    list has length ``n_fold`` and sums to ``total``.
    """
    total = int(total)
    quota = [total // n_fold] * n_fold
    # replace=False yields distinct fold indices directly, so no retry loop
    # is needed to reject duplicates.
    for fold in nr.choice(n_fold, total % n_fold, replace=False):
        quota[fold] += 1
    return quota


def assign_folds(frame, class_order, n_fold=Nfold, pool_col='inpool'):
    """Assign every labelled row of ``frame`` to a fold ``1..n_fold`` in place.

    Classes are processed in ``class_order`` (intended: rarest first).  For
    each class the target per-fold quota is drawn with ``fold_quota``; rows
    of that class that were already placed while handling an earlier class
    are subtracted from the quota, and the still-unassigned rows receive a
    random permutation of the remaining fold ids.

    ``frame[pool_col]`` is (re)initialised to 0.  Rows that carry none of
    the classes in ``class_order`` keep the value 0.  If earlier assignments
    already exceed a fold's quota for some class, the message
    ``"Error Allocating Label: Could Not Balance"`` is printed and that fold
    receives no further rows of the class.

    Returns ``frame`` for convenience.
    """
    frame[pool_col] = 0
    for item in class_order:
        has_label = frame[item] == 1
        quota = fold_quota(has_label.sum(), n_fold)
        # Rows of this class already placed in each fold (key 0 = unplaced).
        existing = frame.loc[has_label, pool_col].value_counts().to_dict()

        labels = []
        for fold in range(1, n_fold + 1):
            remaining = quota[fold - 1] - existing.get(fold, 0)
            if remaining < 0:
                print("Error Allocating Label: Could Not Balance")
                remaining = 0
            labels.extend([fold] * remaining)

        unassigned = has_label & (frame[pool_col] == 0)
        n_unassigned = int(unassigned.sum())
        if n_unassigned == 0:
            continue
        # ``labels`` is never shorter than the number of unassigned rows; it
        # is longer only when some fold was over quota, in which case the
        # surplus shuffled labels are dropped.
        frame.loc[unassigned, pool_col] = nr.permutation(labels)[:n_unassigned]
    return frame


if __name__ == "__main__":
    # Read In
    DF = pd.read_csv('train.csv', sep=',', header=0)
    label_columns = add_label_columns(DF)
    value_counts = DF[label_columns].sum(axis=0)

    # Arrange Fold From Least Labelled Class
    class_order = list(value_counts.sort_values().index)
    assign_folds(DF, class_order, n_fold=Nfold)

    # Check Balancing (label columns only; the text columns are not summable)
    print(DF.groupby('inpool')[label_columns].sum())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment