Last active
September 16, 2020 14:51
-
-
Save tanveer-sayyed/e49fc188e25d76f86df8a19874439b91 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Per-column value frequencies of the global ``df``; (re)populated by CountAll().
valueCounts = {}


def CountAll():
    """Refresh the module-level summary statistics of the global ``df``.

    Sets three module globals:

    * ``all_columns`` -- list of the column labels of ``df``.
    * ``nanCounts``   -- result of ``df.isnull().sum()`` (NaN count per column).
    * ``valueCounts`` -- dict mapping each column label to that column's
      ``value_counts()`` Series (NaNs are excluded by ``value_counts``).

    Call this again whenever ``df`` changes; the cached numbers are not
    updated automatically.
    """
    global all_columns, nanCounts
    all_columns = list(df)
    nanCounts = df.isnull().sum()
    # Rebuild in place: clearing first drops entries for columns that no
    # longer exist in ``df``, while keeping the dict's identity intact for
    # any code that already holds a reference to ``valueCounts``.
    fresh_counts = {name: df[name].value_counts() for name in all_columns}
    valueCounts.clear()
    valueCounts.update(fresh_counts)
"""Random but proportional replacement(RBPR) of numeric""" | |
def Fill_NaNs_Numeric(col, bin_size=45):
    """Fill the NaNs of numeric column ``col`` of the global ``df`` in place.

    Random-but-proportional replacement (RBPR): the observed (non-NaN)
    values are binned, and the NaNs are replaced by bin means so that the
    number of NaNs receiving each bin's mean is proportional to the number
    of observed values in that bin. Which NaN row gets which bin mean is
    chosen at random using the ``random`` module.

    Args:
        col: Label of the column in ``df`` to impute.
        bin_size: Number of bin edges placed evenly between the column's
            minimum and maximum (default 45, the previously hard-coded
            value). Bin 0 holds values ``<= edges[0]``; bin ``i`` holds
            values in ``(edges[i-1], edges[i]]``.

    Raises:
        ValueError: If ``bin_size`` is smaller than 2.

    Returns:
        None. ``df`` is modified in place. The function is a no-op when the
        column has no NaNs, or has no observed values to impute from.
    """
    if bin_size < 2:
        raise ValueError("bin_size must be at least 2")

    # Count directly from the column instead of relying on cached globals,
    # which are stale after any earlier fill.
    missing_mask = df[col].isnull().to_numpy()
    n_missing = int(missing_mask.sum())
    observed = df[col].to_numpy()[~missing_mask].astype(float)
    if n_missing == 0 or observed.size == 0:
        # Nothing to fill, or nothing to learn the distribution from
        # (the latter would otherwise divide 0 by 0 below).
        return

    # Assign every observed value to its bin in a single pass.
    edges = np.linspace(observed.min(), observed.max(), bin_size)
    bin_ids = np.digitize(observed, edges, right=True)
    # The last edge equals the maximum, so no id should exceed bin_size - 1;
    # the clamp is purely defensive.
    bin_ids = np.minimum(bin_ids, bin_size - 1)

    counts = np.bincount(bin_ids, minlength=bin_size)
    totals = np.bincount(bin_ids, weights=observed, minlength=bin_size)

    # Empty bins fall back to the overall mean: the random adjustment below
    # may hand a NaN to an empty bin, and it must still get a sane value.
    bin_means = np.full(bin_size, observed.mean())
    occupied = counts > 0
    bin_means[occupied] = totals[occupied] / counts[occupied]

    # Expected number of NaNs per bin, rounded to whole rows.
    expected = counts / observed.size * n_missing
    allocation = np.around(expected).astype(int)

    # Rounding can leave the total off by a few rows; nudge random bins
    # until exactly ``n_missing`` values are allocated.
    shortfall = n_missing - int(allocation.sum())
    for _ in range(max(shortfall, 0)):
        allocation[random.randrange(bin_size)] += 1
    while shortfall < 0:
        idx = random.randrange(bin_size)
        if allocation[idx] > 0:
            allocation[idx] -= 1
            shortfall += 1

    # One replacement value per NaN row, in random order. Assigning through
    # the boolean mask is positional, so duplicate index labels are safe.
    fill_values = np.repeat(bin_means, allocation).tolist()
    random.shuffle(fill_values)
    df.loc[missing_mask, col] = np.asarray(fill_values, dtype=float)
"""-------------------------------------------------------------------------""" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hello @abcorep, thank you for taking the time to read the article. The mathematics is actually simple. Assume there are 2 boys and 3 girls, and you have to distribute 10 chocolates among them. One answer is to give 2 chocolates to each child. But from a different perspective there is another solution, in which we distribute the chocolates in proportion to gender:
(no. of girls)/(total children) = 3/5 --> this is the ratio of girls to total children.
So how many chocolates go to only girls? --> (3/5)*10 = 6
And how many chocolates go to only boys? --> (2/5)*10 = 4
Thus the 10 chocolates have been split 6:4, which reduces to 3:2, i.e. the ratio of girls to boys. Hope this resolves your doubt.