Item-base collabolative filtering

1. Prepare transaction table

Prepare following transaction table. We are generating feature_vector for each item_id based on cooccurrence of purchased items, a sort of bucket analysis.

userid	item_id	purchase_at `timestamp`
1	31231	2015-04-9 00:29:02
1	13212	2016-05-24 16:29:02
2	312	2016-06-03 23:29:02
3	2313	2016-06-04 19:29:02
..	..	..

2. Create item_features table

What we want for creating a feature vector for each item is the following cooccurrence relation.

item	other_item	cnt
583266	621056	9999
583266	583266	18
31231	13212	129
31231	31231	3
31231	9833	953
...	...	...

Feature vectors of each item will be as follows:

item	feature_vector `array<string>`
583266	621056:9999, 583266:18
31231	13212:129, 31231:3, 9833:953
...	...

Note that value of feature vector should be scaled for k-NN similarity computation e.g., as follows:

item	feature_vector `array<string>`
583266	621056:`ln(9999+1)`, 583266:`ln(18+1)`
31231	13212:`ln(129+1)`, 31231:`ln(3+1)`, 9833:`ln(953+1)`
...	...

The following queries results in creating the above table.

INSERT OVERWRITE TABLE user_purchased
select 
  DISTINCT
    userid,
    itemid
from
  transaction
where 
  purchased_at < xxx -- divide training/testing data by time
;

WITH cooccurrence as (
  select
    u1.itemid,
    u2.itemid as other_itemid, 
    -- count(1) as cnt
    ln(count(1)+1) as cnt -- scaling to avoid large value in the feature vector
  from
    training_purchase_log u1
    JOIN training_purchase_log u2 ON (u1.userid = u2.userid)
  group by
    u1.itemid, u2.itemid
  having
    u1.itemid != u2.itemid
)
INSERT OVERWRITE TABLE item_features
SELECT
  itemid,
  collect_list(feature(other_itemid, cnt)) as feature_vector
FROM
  cooccurrence
group by
  itemid
;

Computing Item similarity scores

WITH similarity as (
  select
   t1.itemid,
   t2.itemid as other,
   cosine_similarity(t1.feature_vector, t2.feature_vector) as similarity
  from
   item_features t1
   CROSS JOIN item_features t2
  WHERE
    t1.itemid != t2.itemid
CLUSTER BY
   itemid
),
topk as (
  select
    each_top_k( -- get top-10 items based on similarity score
      10, itemid, similarity,
      itemid, other -- output items
    ) as (rank, similarity, itemid, other)
  from 
    similarity
)
INSERT OVERWRITE TABLE item_similarity
select 
  itemid, other, similarity
from 
  topk
;

item	other	similarity
583266	621056	0.33
583266	583266	0.18
31231	13212	1.29
31231	31231	0.3
31231	9833	0.953
...	...	...

Computes top-k recently purchaed items for each user

INSERT OVERWRITE TABLE recently_purchased_items
select
  each_top_k( -- get top-5 recently purchased items for each user
     5, userid, purchased_at,
     userid, itemid
  ) as (rank, purchased_at, userid, itemid)
from (
  select
    purchased_at, userid, itemid
  from 
    transactions
  where
    purchased_at >= xxx -- divide training/test data by time
  CLUSTER BY
    user_id -- Note CLUSTER BY is mandatory when using each_top_k
) t;

Recommend top-k Items based on similarity scores for each user

Item-based recommendation based on item similarity scores of user recently purchased items.

WITH topk as (
  select
    each_top_k(
       5, userid, similarity,
       userid, other_itemid
    ) as (rank, similarity, userid, rec_item)
  from (
    selet DISTINCT -- distinct to avoid same item to be listed
      t1.userid，t2.other_itemid, t2.similarity
    from
      recently_purchased_items t1
      JOIN item_similarity t2 ON (t1.itemid = t2.itemid)
    where
      t1.itemid != t2.other_item -- do not include items that user already purchased
    CLUSTER BY
      t1.userid -- top-k grouping by userid
  ) t1
)
INSERT OVERWRITE TABLE item_recommendation
select
  userid,
  map_values(to_ordered_map(rank, rec_item)) as rec_items
from
  topk
group by
  userid
;