Last active
August 29, 2015 14:14
-
-
Save myui/79e06f47562bc8fd1928 to your computer and use it in GitHub Desktop.
HivemallでMatrix Factorization ref: http://qiita.com/myui/items/dccb4f58799f080e24ab
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
R \approx P^{T}Q |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
R^{'}_{ui} = \mu + B_{u} + B_{i} + \overrightarrow{P_{u}}^{T}\overrightarrow{Q_{i}} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
select | |
t2.actual, | |
mf_predict(t2.Pu, p2.Qi, t2.Bu, p2.Bi, ${mu}) as predicted | |
from ( | |
select | |
t1.userid, | |
t1.movieid, | |
t1.rating as actual, | |
p1.Pu, | |
p1.Bu | |
from | |
testing t1 LEFT OUTER JOIN sgd_model p1 | |
ON (t1.userid = p1.idx) | |
) t2 | |
LEFT OUTER JOIN sgd_model p2 | |
ON (t2.movieid = p2.idx); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
select | |
mae(predicted, actual) as mae, | |
rmse(predicted, actual) as rmse | |
from ( | |
select | |
t2.actual, | |
mf_predict(t2.Pu, p2.Qi, t2.Bu, p2.Bi, ${mu}) as predicted | |
from ( | |
select | |
t1.userid, | |
t1.movieid, | |
t1.rating as actual, | |
p1.Pu, | |
p1.Bu | |
from | |
testing t1 LEFT OUTER JOIN sgd_model p1 | |
ON (t1.userid = p1.idx) | |
) t2 | |
LEFT OUTER JOIN sgd_model p2 | |
ON (t2.movieid = p2.idx) | |
) t; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
set hivevar:userid=1; | |
set hivevar:topk=5; | |
select | |
t1.movieid, | |
mf_predict(t2.Pu, t1.Qi, t2.Bu, t1.Bi, ${mu}) as predicted | |
from ( | |
select | |
idx as movieid, | |
Qi, | |
Bi | |
from | |
sgd_model p | |
where | |
p.idx NOT IN | |
(select movieid from training where userid=${userid}) | |
) t1 CROSS JOIN ( | |
select | |
Pu, | |
Bu | |
from | |
sgd_model | |
where | |
idx = ${userid} | |
) t2 | |
order by | |
predicted DESC | |
limit ${topk}; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
select | |
mae(predicted, actual) as mae, | |
rmse(predicted, actual) as rmse | |
from ( | |
select | |
t2.actual, | |
mf_predict(t2.Pu, p2.Qi, t2.Bu, p2.Bi, ${mu}) as predicted | |
from ( | |
select | |
t1.userid, | |
t1.movieid, | |
t1.rating as actual, | |
p1.Pu, | |
p1.Bu | |
from | |
testing t1 LEFT OUTER JOIN sgd_model p1 | |
ON (t1.userid = p1.idx) | |
) t2 | |
LEFT OUTER JOIN sgd_model p2 | |
ON (t2.movieid = p2.idx) | |
) t; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
\mathrm{MAE} = \frac{1}{n}\sum_{i=1}^n \left| f_i-y_i\right| =\frac{1}{n}\sum_{i=1}^n \left| e_i \right| |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
set hivevar:kfold=10; | |
set hivevar:seed=31; | |
-- Adding group id (gid) to each training instance | |
drop table ratings_groupded; | |
create table ratings_groupded | |
as | |
select | |
rand_gid2(${kfold}, ${seed}) gid, -- generates group id ranging from 1 to 10 | |
userid, | |
movieid, | |
rating | |
from | |
ratings | |
cluster by gid, rand(${seed}); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
\mathrm{MAE} = \frac{1}{n}\sum_{i=1}^n \left| f_i-y_i\right| =\frac{1}{n}\sum_{i=1}^n \left| e_i \right| |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
set hivevar:kfold=10; | |
set hivevar:seed=31; | |
-- Adding group id (gid) to each training instance | |
drop table ratings_groupded; | |
create table ratings_groupded | |
as | |
select | |
rand_gid2(${kfold}, ${seed}) gid, -- generates group id ranging from 1 to 10 | |
userid, | |
movieid, | |
rating | |
from | |
ratings | |
cluster by gid, rand(${seed}); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- latent factors | |
set hivevar:factor=10; | |
-- maximum number of iterations | |
set hivevar:iters=50; | |
-- regularization parameter | |
set hivevar:lambda=0.05; | |
-- learning rate | |
set hivevar:eta=0.005; | |
-- conversion rate (if changes between iterations became less or equals to ${cv_rate}, the training will stop) | |
set hivevar:cv_rate=0.001; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
select avg(rating) from ratings; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- mean rating value (Optional but recommended to set ${mu}) | |
set hivevar:mu=3.581564453029317; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
min_{P,Q,B} \sum_{(u,i) \in R}{(R_{ui}-R^{'}_{ui})^{2} + \lambda ({||B_{u}||}^{2} + {||B_{i}||}^{2} + {||\overrightarrow{P_{u}}||}^{2} + {||\overrightarrow{Q_{i}}||}^{2}}) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
create database movielens; | |
use movielens; | |
CREATE EXTERNAL TABLE ratings ( | |
userid INT, | |
movieid INT, | |
rating INT, | |
tstamp STRING | |
) ROW FORMAT DELIMITED | |
FIELDS TERMINATED BY '#' | |
STORED AS TEXTFILE | |
LOCATION '/dataset/movielens/ratings'; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
SET hivevar:seed=31; | |
CREATE TABLE ratings2 | |
as | |
select | |
rand(${seed}) as rnd, | |
userid, | |
movieid, | |
rating | |
from | |
ratings; | |
CREATE TABLE training | |
as | |
select * from ratings2 | |
order by rnd DESC | |
limit 800000; | |
CREATE TABLE testing | |
as | |
select * from ratings2 | |
order by rnd ASC | |
limit 200209; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
sed 's/::/#/g' ratings.dat | hadoop fs -put - /dataset/movielens/ratings/ratings.t |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
SET hivevar:seed=31; | |
CREATE TABLE ratings2 | |
as | |
select | |
rand(${seed}) as rnd, | |
userid, | |
movieid, | |
rating | |
from | |
ratings; | |
CREATE TABLE training | |
as | |
select * from ratings2 | |
order by rnd DESC | |
limit 800000; | |
CREATE TABLE testing | |
as | |
select * from ratings2 | |
order by rnd ASC | |
limit 200209; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- 1) mean rating | |
set hivevar:mu=3.593565; | |
-- 2) number of factors | |
set hivevar:factor=10; | |
-- 3) maximum number of training iterations | |
set hivevar:iters=50; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- 1) mean rating | |
set hivevar:mu=3.593565; | |
-- 2) number of factors | |
set hivevar:factor=10; | |
-- 3) maximum number of training iterations | |
set hivevar:iters=50; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
create table sgd_model | |
as | |
select | |
idx, | |
array_avg(u_rank) as Pu, | |
array_avg(m_rank) as Qi, | |
avg(u_bias) as Bu, | |
avg(m_bias) as Bi | |
from ( | |
select | |
train_mf_sgd(userid, movieid, rating, "-factor ${factor} -mu ${mu} -iter ${iters}") as (idx, u_rank, m_rank, u_bias, m_bias) | |
from | |
training | |
) t | |
group by idx; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
select avg(rating) from training; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
create table sgd_model | |
as | |
select | |
idx, | |
array_avg(u_rank) as Pu, | |
array_avg(m_rank) as Qi, | |
avg(u_bias) as Bu, | |
avg(m_bias) as Bi | |
from ( | |
select | |
train_mf_sgd(userid, movieid, rating, "-factor ${factor} -mu ${mu} -iter ${iters}") as (idx, u_rank, m_rank, u_bias, m_bias) | |
from | |
training | |
) t | |
group by idx; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
create table sgd_model | |
as | |
select | |
idx, | |
array_avg(u_rank) as Pu, | |
array_avg(m_rank) as Qi, | |
avg(u_bias) as Bu, | |
avg(m_bias) as Bi | |
from ( | |
select | |
train_mf_sgd(userid, movieid, rating, "-factor ${factor} -mu ${mu} -iter ${iters}") as (idx, u_rank, m_rank, u_bias, m_bias) | |
from | |
training | |
) t | |
group by idx; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
hive> desc sgd_model; | |
idx int | |
pu array<float> | |
qi array<float> | |
bu double | |
bi double |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
hive> desc sgd_model; | |
idx int | |
pu array<float> | |
qi array<float> | |
bu double | |
bi double |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
select | |
mae(predicted, actual) as mae, | |
rmse(predicted, actual) as rmse | |
from ( | |
select | |
t2.actual, | |
mf_predict(t2.Pu, p2.Qi, t2.Bu, p2.Bi, ${mu}) as predicted | |
from ( | |
select | |
t1.userid, | |
t1.movieid, | |
t1.rating as actual, | |
p1.Pu, | |
p1.Bu | |
from | |
testing t1 LEFT OUTER JOIN sgd_model p1 | |
ON (t1.userid = p1.idx) | |
) t2 | |
LEFT OUTER JOIN sgd_model p2 | |
ON (t2.movieid = p2.idx) | |
) t; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment