Skip to content

Instantly share code, notes, and snippets.

@analyticsindiamagazine
Created November 26, 2019 06:31
Show Gist options
  • Save analyticsindiamagazine/1009e748698e1099ab44022966f64c97 to your computer and use it in GitHub Desktop.
Save analyticsindiamagazine/1009e748698e1099ab44022966f64c97 to your computer and use it in GitHub Desktop.
# A function to find the maximum number of features in a single cell
def max_features_in_single_row(train, test, delimiter):
    """Return the largest number of delimiter-separated parts in any cell.

    Both inputs are scanned together, so the result is valid for splitting
    either of them into a fixed number of columns.

    Args:
        train: Iterable of cell values (for example a pandas Series or a list).
        test: Second iterable of cell values, scanned after ``train``.
        delimiter: Separator between features; converted to ``str`` before
            splitting.

    Returns:
        The maximum part count over all string cells, or 0 when there are
        none. The value is also printed.

    Raises:
        ValueError: If ``delimiter`` converts to an empty string.
    """
    # Local import: chaining the iterables avoids ``Series.append`` (removed
    # in pandas 2.0) and also works for plain lists and tuples.
    from itertools import chain

    separator = str(delimiter)
    # Non-string cells (e.g. NaN for missing data) carry no features, so they
    # are skipped instead of crashing on ``.split``.
    max_info = max(
        (
            len(cell.split(separator))
            for cell in chain(train, test)
            if isinstance(cell, str)
        ),
        default=0,
    )
    print("\n","-"*35)
    print("Max_Features in One Observation = ", max_info)
    return max_info
# This function splits a column into n features, where n is the maximum number of features in a single cell
def feature_splitter(feat, name, delimiter, max_info):
    """Split every cell of a column into ``max_info`` feature columns.

    Each string cell is split on ``delimiter``; every part is lower-cased and
    stripped of surrounding whitespace. Cells with fewer than ``max_info``
    parts, and cells that are not strings, are padded with ``np.nan``. Parts
    beyond ``max_info`` are dropped.

    Relies on ``tqdm`` and ``np`` (NumPy) being available at module scope.

    Args:
        feat: Iterable of cell values (for example a pandas Series).
        name: Prefix for the generated column names.
        delimiter: Separator between features; converted to ``str`` before
            splitting.
        max_info: Number of feature columns to produce.

    Returns:
        A dict mapping ``'<name>_Feature_<k>'`` (k starting at 1) to a list
        with one entry per input cell.
    """
    separator = str(delimiter)
    column_names = ['{}_Feature_{}'.format(name, k + 1) for k in range(max_info)]
    extracted_features = {column: [] for column in column_names}
    print("-"*35)
    print("Features Dictionary : ", extracted_features)
    # Materialise the input so tqdm knows the total and can draw a progress bar.
    for cell in tqdm(list(feat)):
        # Split once per row; a non-string cell yields no parts and is
        # therefore filled entirely with NaN.
        parts = cell.split(separator) if isinstance(cell, str) else []
        for position, column in enumerate(column_names):
            if position < len(parts):
                extracted_features[column].append(parts[position].lower().strip())
            else:
                extracted_features[column].append(np.nan)
    return extracted_features
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment