Created
January 9, 2019 09:59
-
-
Save dmitrysarov/1d492519ec03a482813928b204adf67f to your computer and use it in GitHub Desktop.
This can help when your data set is not balanced over one of its parameters (e.g. subjects with different numbers of samples), but you have to split it into folds of roughly equal size.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Iteratively fill the folds, one chunk of subjects at a time. For every chunk
# the order in which folds are handed out is drawn at random, with each fold's
# probability inversely proportional to the number of slices it already holds.
# df - dataframe of subjects' CT slices; different subjects can have a
#      different number of slices.
# A subject is assigned to exactly one fold, so folds never share a subject.
#
# Results left at module level:
#   folds       - {subject: fold number}
#   folds_count - {fold number: number of slices assigned to that fold}
#   weights     - {fold number: normalized 1/folds_count}
value_count = df['subject'].value_counts()  # number of slices per subject, sorted by count
number_of_folds = 5
np.random.seed(42)

# Pre-initialise so the names exist even when df has no subjects at all.
folds = {}
folds_count = {}
weights = {}

# Walk value_count in consecutive chunks of `number_of_folds` subjects (the
# last chunk may be shorter). Plain positional slicing is used instead of
# np.array_split, which is array-oriented and not meant for a pandas Series.
for start in range(0, len(value_count), number_of_folds):
    chunk = value_count.iloc[start:start + number_of_folds]  # still sorted by count

    if start == 0:
        # Initialise the folds: a uniformly shuffled order for the first chunk.
        rand_int = list(range(number_of_folds))
        np.random.shuffle(rand_int)
    else:
        # Draw distinct fold numbers with respect to the current fold weights
        # (folds holding fewer slices are more likely to be drawn early).
        rand_int = np.random.choice(
            list(weights.keys()),
            size=len(chunk),
            p=list(weights.values()),
            replace=False,
        ).tolist()

    # The drawn fold numbers are consumed from the end of the list, i.e. the
    # first subject of the chunk receives the last drawn fold.
    local_folds = dict(zip(chunk.index, reversed(rand_int)))
    folds.update(local_folds)  # folds for the current chunk

    # Add this chunk's slice counts to the running per-fold totals. The dict is
    # rebuilt in chunk order so that the key order fed to np.random.choice (and
    # therefore the seeded result) stays stable.
    updated_count = {
        local_folds[sbj]: folds_count.get(local_folds[sbj], 0) + count
        for sbj, count in chunk.items()
    }
    # Carry over folds that received no subject from a short final chunk;
    # without this their totals would silently disappear from folds_count.
    for fold, count in folds_count.items():
        updated_count.setdefault(fold, count)
    folds_count = updated_count

    # Inverse-size weights, normalized to sum to 1 (the total is computed once).
    total = sum(1 / c for c in folds_count.values())
    weights = {f: (1 / c) / total for f, c in folds_count.items()}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment