Created
May 17, 2012 17:28
-
-
Save davidsnyder/2720382 to your computer and use it in GitHub Desktop.
Jaccard Similarity Score for the Netflix graph
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Jaccard similarity between users of a bipartite (user, movie) graph.
--
-- For every user A, finds the other user B that maximises
--     J(A,B) = |A intersect B| / |A union B|
--            = |A intersect B| / (|A| + |B| - |A intersect B|)
-- where a user's set is the set of movie_ids they are linked to.
--
-- Parameters:
--   $GRAPH  input path; records are (user_id, movie_id)
--   $OUT    output path; records are (user_a_id, user_b_id, jaccard_sim),
--           one record per user_a_id that shares at least one movie with
--           some other user
--
-- NOTE(review): assumes each (user_id, movie_id) edge appears at most once
-- in $GRAPH; duplicate edges would inflate both the set sizes and the
-- intersection counts -- TODO confirm, or DISTINCT the input.

edges     = LOAD '$GRAPH' AS (user_id:chararray, movie_id:chararray);
-- Second alias over the same data: hack for the self-join below.
edges_dup = LOAD '$GRAPH' AS (user_id:chararray, movie_id:chararray);

-- (user_id, |A|): number of movies per user.
grouped_edges  = GROUP edges BY user_id;                        -- reduce
outgoing_links = FOREACH grouped_edges GENERATE                 -- map
                     group        AS user_id,
                     COUNT(edges) AS user_out;

-- (user_a_id, movie_id, user_b_id, movie_id): every pair of users that
-- share a movie, one row per shared movie.
movie_groups = COGROUP edges BY movie_id INNER,
                       edges_dup BY movie_id INNER;             -- reduce
flat_groups  = FOREACH movie_groups GENERATE                    -- map
                   FLATTEN(edges),
                   FLATTEN(edges_dup);

-- Projection to (user_a_id, user_b_id, movie_id).
snipped_groups = FOREACH flat_groups GENERATE                   -- map
                     edges::user_id     AS user_a_id,
                     edges_dup::user_id AS user_b_id,
                     edges::movie_id    AS movie_id;

-- Prune self matches by identity rather than by score: filtering on
-- jaccard_sim < 1.0 would also discard distinct users whose movie sets are
-- identical, i.e. the best possible matches. Doing it here also keeps the
-- (a, a) rows out of the GROUP below.
user_pairs = FILTER snipped_groups BY user_a_id != user_b_id;   -- map

-- (user_a_id, user_b_id, |A intersect B|)
grouped_groups = GROUP user_pairs BY (user_a_id, user_b_id);    -- reduce
intersection   = FOREACH grouped_groups GENERATE                -- map
                     FLATTEN(group)    AS (user_a_id, user_b_id),
                     COUNT(user_pairs) AS intersection_size;

-- Append each user's set size.                                 -- reduce/map
joined_a = JOIN intersection BY $0, outgoing_links BY user_id;  -- append (user_a_id, |A|)
joined_b = JOIN joined_a     BY $1, outgoing_links BY user_id;  -- append (user_b_id, |B|)

-- joined_b columns:
--   $0 user_a_id, $1 user_b_id, $2 |A intersect B|,
--   $3 user_a_id, $4 |A|,
--   $5 user_b_id, $6 |B|
jaccard = FOREACH joined_b GENERATE                             -- map
              $0 AS user_a_id,
              $1 AS user_b_id,
              -- |A int B| / (|A| + |B| - |A int B|); the denominator is |A union B|
              (float)$2 / ((int)$4 + (int)$6 - (int)$2) AS jaccard_sim;

-- For each user keep only the single most similar other user. No ORDER BY
-- is needed first: ordering is not guaranteed to survive a GROUP, and TOP
-- selects the maximum on its own.
grouped = GROUP jaccard BY user_a_id;                           -- reduce
topped  = FOREACH grouped {                                     -- map
              -- TOP(n, column, bag): 1 tuple, ranked by column 2 (jaccard_sim)
              top_jac = TOP(1, 2, jaccard);
              GENERATE FLATTEN(top_jac);
          };

describe topped;
STORE topped INTO '$OUT';
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment