# Install the required dependency versions:
#   pip install pandas==0.19.2
#   pip install tensorflow==0.12.1
#   pip install numpy==1.12.0
#   pip install scikit-learn==0.18.1
#   pip install tqdm==4.11.2
#   pip install scipy==0.17.0
# You will also need to install tffm and run this code inside its main folder:
#   git clone https://github.com/geffy/tffm.git
#   cd tffm
# In order to make full use of the categories and the expanded historical data,
# we need to load and convert the data into the right format.
import pandas as pd
from collections import Counter
import tensorflow as tf
from tffm import TFFMRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np
# Load the datasets
buys = open('yoochoose-buys.dat', 'r')
clicks = open('yoochoose-clicks.dat', 'r')
initial_buys_df = pd.read_csv(buys, names=['Session ID', 'Timestamp', 'Item ID', 'Category', 'Quantity'],
                              dtype={'Session ID': 'float32', 'Timestamp': 'str', 'Item ID': 'float32',
                                     'Category': 'str'})
initial_buys_df.set_index('Session ID', inplace=True)
initial_clicks_df = pd.read_csv(clicks, names=['Session ID', 'Timestamp', 'Item ID', 'Category'],
                                dtype={'Category': 'str'})
initial_clicks_df.set_index('Session ID', inplace=True)
# We won't use timestamps in this example
initial_buys_df = initial_buys_df.drop('Timestamp', 1)
initial_clicks_df = initial_clicks_df.drop('Timestamp', 1)
# For illustrative purposes, we will only use a subset of the data: the top 10000 buying users
x = Counter(initial_buys_df.index).most_common(10000)
top_k = dict(x).keys()
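# most_common(10000) returns (session_id, count) pairs; we keep only the session IDs
# and use them below to restrict both datasets to those sessions.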
initial_buys_df = initial_buys_df[initial_buys_df.index.isin(top_k)]
initial_clicks_df = initial_clicks_df[initial_clicks_df.index.isin(top_k)]
# Create a copy of the index, since we will also apply one-hot encoding on the index
initial_buys_df['_Session ID'] = initial_buys_df.index
# One-hot encode all columns for clicks and buys
transformed_buys = pd.get_dummies(initial_buys_df)
transformed_clicks = pd.get_dummies(initial_clicks_df)
# Aggregate historical data for Items and Categories
filtered_buys = transformed_buys.filter(regex="Item.*|Category.*")
filtered_clicks = transformed_clicks.filter(regex="Item.*|Category.*")
historical_buy_data = filtered_buys.groupby(filtered_buys.index).sum()
historical_buy_data = historical_buy_data.rename(columns=lambda column_name: 'buy history:' + column_name)
historical_click_data = filtered_clicks.groupby(filtered_clicks.index).sum()
historical_click_data = historical_click_data.rename(columns=lambda column_name: 'click history:' + column_name)
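# Each session now has one row of 'buy history:*' and 'click history:*' features holding
# the per-item and per-category counts of that session's past buys and clicks.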
# Merge the historical data back into the rows of every session
merged1 = pd.merge(transformed_buys, historical_buy_data, left_index=True, right_index=True)
merged2 = pd.merge(merged1, historical_click_data, left_index=True, right_index=True)
# Create the factorization machine model; you can play around with the parameters
model = TFFMRegressor(
    order=2,
    rank=7,
    optimizer=tf.train.AdamOptimizer(learning_rate=0.1),
    n_epochs=100,
    batch_size=-1,
    init_std=0.001,
    input_type='dense'
)
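# With order=2 the model learns a global bias, per-feature linear weights, and pairwise
# feature interactions; each feature gets a rank-7 latent vector and the weight of an
# interaction is the dot product of the two vectors. This is what lets the model score
# item/category combinations that never appear together in the training data.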
merged2.drop(['Item ID', '_Session ID', 'click history:Item ID', 'buy history:Item ID'], 1, inplace=True)
X = np.array(merged2)
X = np.nan_to_num(X)
y = np.array(merged2['Quantity'].as_matrix())
# Split data into train, test
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2)
# Split the testing data in half: full information vs. cold-start
X_te, X_te_cs, y_te, y_te_cs = train_test_split(X_te, y_te, test_size=0.5)
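# This leaves roughly 80% of the rows for training, 10% for the full-information test set,
# and 10% for the cold-start test set.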
cold_start = pd.DataFrame(X_te_cs, columns=merged2.columns)
# What happens if we only have access to categories and no historical click/purchase data?
# Let's delete historical click and purchasing data for the cold_start test set
for column in cold_start.columns:
    if ('buy' in column or 'click' in column) and ('Category' not in column):
        cold_start[column] = 0
# Compute the mean squared error for both test sets
model.fit(X_tr, y_tr, show_progress=True)
predictions = model.predict(X_te)
cold_start_predictions = model.predict(np.array(cold_start))
print('Full Information MSE: {}'.format(mean_squared_error(y_te, predictions)))
print('Cold-start MSE: {}'.format(mean_squared_error(y_te_cs, cold_start_predictions)))
model.destroy()
# Fun fact: Dropping the category columns in the training dataset makes the MSE even smaller
# but doing so means that we cannot tackle the cold-start recommendation problem
# Here are the MSE results:
# Full Information MSE: 0.00644224764093
# Cold-start MSE: 3.09448346431