Created
February 16, 2019 14:05
-
-
Save jjone36/a032cde755fbc967259eb8b2370a1503 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Create a function to vectorize all the ingredients and get t-SNE at once
def cosmetic_map(option_1, option_2):
    """Return the products matching both options with 2-D t-SNE coordinates.

    Rows of the module-level ``cosm`` frame are kept when their ``'Label'``
    equals ``option_1`` and the column named ``option_2`` equals 1. Each
    kept row's ``'Ingredients'`` string is lower-cased and split on ``', '``;
    the rows are then encoded as a binary item-by-ingredient matrix and
    reduced with t-SNE.

    Args:
        option_1: Value compared against ``cosm['Label']``.
        option_2: Name of a ``cosm`` column; rows where it equals 1 are kept.

    Returns:
        The filtered frame (index reset, previous index kept as a column)
        with two added columns ``'X'`` and ``'Y'`` holding the first two
        t-SNE components. If no row matches, the empty frame is returned
        with empty ``'X'`` / ``'Y'`` columns.
    """
    # One combined mask selects the same rows as chaining two boolean
    # indexers, without pandas having to reindex the second mask.
    selected = (cosm['Label'] == option_1) & (cosm[option_2] == 1)
    df = cosm[selected].reset_index()

    if df.empty:
        # t-SNE cannot be fitted on zero samples; hand back the empty
        # selection with the same output columns instead of failing.
        df['X'] = np.array([], dtype=float)
        df['Y'] = np.array([], dtype=float)
        return df

    # embedding each ingredient: tokenize every product's ingredient list
    corpus = [text.lower().split(', ') for text in df['Ingredients']]

    # Assign each distinct ingredient a column index in first-seen order.
    word_index_map = {}
    for tokens in corpus:
        for token in tokens:
            if token not in word_index_map:
                word_index_map[token] = len(word_index_map)

    # creating dtm matrix: A[d, n] == 1 when item d contains ingredient n
    A = np.zeros((len(corpus), len(word_index_map)))
    for row, tokens in enumerate(corpus):
        A[row, [word_index_map[token] for token in tokens]] = 1

    # decomposition using t-SNE
    tsne = TSNE(learning_rate=200)
    tsne_features = tsne.fit_transform(A)
    df['X'] = tsne_features[:, 0]
    df['Y'] = tsne_features[:, 1]
    return df
# Apply the function to all combinations of the two options.
# Each per-combination frame is collected and concatenated once at the end;
# the original concatenated onto `df` (not defined at this level) instead of
# the accumulator `df_all`, so results were never accumulated.
frames = []
for a in option_1:
    for b in option_2:
        temp = cosmetic_map(a, b)
        # Relabel the rows with the combination they belong to.
        temp['Label'] = a + '_' + b
        frames.append(temp)

# pd.concat raises on an empty list, so keep the empty-frame default
# when there are no combinations to process.
df_all = pd.concat(frames, axis=0) if frames else pd.DataFrame()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment