Prepare following transaction table. We are generating feature_vector
for each item_id
based on cooccurrence of purchased items, a sort of bucket analysis.
userid | item_id | purchase_at timestamp |
---|---|---|
1 | 31231 | 2015-04-9 00:29:02 |
1 | 13212 | 2016-05-24 16:29:02 |
2 | 312 | 2016-06-03 23:29:02 |
3 | 2313 | 2016-06-04 19:29:02 |
.. | .. | .. |
What we want for creating a feature vector for each item is the following cooccurrence
relation.
item | other_item | cnt |
---|---|---|
583266 | 621056 | 9999 |
583266 | 583266 | 18 |
31231 | 13212 | 129 |
31231 | 31231 | 3 |
31231 | 9833 | 953 |
... | ... | ... |
Feature vectors of each item will be as follows:
item | feature_vector array<string> |
---|---|
583266 | 621056:9999, 583266:18 |
31231 | 13212:129, 31231:3, 9833:953 |
... | ... |
Note that value of feature vector should be scaled for k-NN similarity computation e.g., as follows:
item | feature_vector array<string> |
---|---|
583266 | 621056:ln(9999+1) , 583266:ln(18+1) |
31231 | 13212:ln(129+1) , 31231:ln(3+1) , 9833:ln(953+1) |
... | ... |
The following queries results in creating the above table.
INSERT OVERWRITE TABLE user_purchased
select
DISTINCT
userid,
itemid
from
transaction
where
purchased_at < xxx -- divide training/testing data by time
;
WITH cooccurrence as (
select
u1.itemid,
u2.itemid as other_itemid,
-- count(1) as cnt
ln(count(1)+1) as cnt -- scaling to avoid large value in the feature vector
from
training_purchase_log u1
JOIN training_purchase_log u2 ON (u1.userid = u2.userid)
group by
u1.itemid, u2.itemid
having
u1.itemid != u2.itemid
)
INSERT OVERWRITE TABLE item_features
SELECT
itemid,
collect_list(feature(other_itemid, cnt)) as feature_vector
FROM
cooccurrence
group by
itemid
;
WITH similarity as (
select
t1.itemid,
t2.itemid as other,
cosine_similarity(t1.feature_vector, t2.feature_vector) as similarity
from
item_features t1
CROSS JOIN item_features t2
WHERE
t1.itemid != t2.itemid
CLUSTER BY
itemid
),
topk as (
select
each_top_k( -- get top-10 items based on similarity score
10, itemid, similarity,
itemid, other -- output items
) as (rank, similarity, itemid, other)
from
similarity
)
INSERT OVERWRITE TABLE item_similarity
select
itemid, other, similarity
from
topk
;
item | other | similarity |
---|---|---|
583266 | 621056 | 0.33 |
583266 | 583266 | 0.18 |
31231 | 13212 | 1.29 |
31231 | 31231 | 0.3 |
31231 | 9833 | 0.953 |
... | ... | ... |
INSERT OVERWRITE TABLE recently_purchased_items
select
each_top_k( -- get top-5 recently purchased items for each user
5, userid, purchased_at,
userid, itemid
) as (rank, purchased_at, userid, itemid)
from (
select
purchased_at, userid, itemid
from
transactions
where
purchased_at >= xxx -- divide training/test data by time
CLUSTER BY
user_id -- Note CLUSTER BY is mandatory when using each_top_k
) t;
Item-based recommendation based on item similarity scores of user recently purchased items.
WITH topk as (
select
each_top_k(
5, userid, similarity,
userid, other_itemid
) as (rank, similarity, userid, rec_item)
from (
selet DISTINCT -- distinct to avoid same item to be listed
t1.userid,t2.other_itemid, t2.similarity
from
recently_purchased_items t1
JOIN item_similarity t2 ON (t1.itemid = t2.itemid)
where
t1.itemid != t2.other_item -- do not include items that user already purchased
CLUSTER BY
t1.userid -- top-k grouping by userid
) t1
)
INSERT OVERWRITE TABLE item_recommendation
select
userid,
map_values(to_ordered_map(rank, rec_item)) as rec_items
from
topk
group by
userid
;