Skip to content

Instantly share code, notes, and snippets.

@kerinin
Created February 18, 2013 19:21
Show Gist options
  • Save kerinin/4979864 to your computer and use it in GitHub Desktop.
Save kerinin/4979864 to your computer and use it in GitHub Desktop.
set default_parallel 92;
read_rate_records = LOAD 's3://oib-mapreduce/output/folder_scan_rollups/2013/06/2013-02-15' AS (
scanned_at:chararray,
received_at:chararray,
mm_id,
source:chararray,
folder:chararray,
read,
subject:chararray,
ip:chararray,
header:chararray,
domain:chararray,
user_id:chararray,
ham:int,
address_book,
important,
tags:chararray,
replied,
forwarded,
spam:int,
has_priority,
is_public,
rp_tag:chararray,
on_content_redaction_whitelist:int,
to_domain:chararray,
eea_address:chararray,
to_address:chararray,
message_id:chararray,
real_folder_name:chararray);
raw_domains = FOREACH read_rate_records GENERATE domain AS domain;
domains = DISTINCT raw_domains;
-- Full Counts
-- Count how many Yahoo/Gmail records we have for each domain
yahoo_messages = FILTER read_rate_records BY ((source == 'YMAIL') AND (domain is not null));
yahoo_grouped = GROUP yahoo_messages BY domain;
yahoo_count_raw = FOREACH yahoo_grouped GENERATE group AS domain, (int)COUNT(yahoo_messages) AS yahoo_count;
yahoo_count_joined = JOIN domains BY domain FULL, yahoo_count_raw BY domain;
yahoo_counts = FOREACH yahoo_count_joined GENERATE domains::domain AS domain, (yahoo_count IS NULL ? 0 : yahoo_count) AS yahoo_count;
gmail_messages = FILTER read_rate_records BY ((source == 'GMAIL') AND (domain is not null));
gmail_grouped = GROUP gmail_messages BY domain;
gmail_count_raw = FOREACH gmail_grouped GENERATE group AS domain, (int)COUNT(gmail_messages) AS gmail_count;
gmail_count_joined = JOIN domains BY domain FULL, gmail_count_raw BY domain;
gmail_counts = FOREACH gmail_count_joined GENERATE domains::domain AS domain, (gmail_count IS NULL ? 0 : gmail_count) AS gmail_count;
-- TIS Counts
-- Count how many Yahoo/Gmail TIS records we have for each domain
yahoo_tis = FILTER yahoo_messages BY (spam == 1);
yahoo_tis_grouped = GROUP yahoo_tis BY domain;
yahoo_tis_raw_counts = FOREACH yahoo_tis_grouped GENERATE group AS domain, (int)COUNT(yahoo_tis) AS yahoo_tis_count;
yahoo_tis_all_domains = JOIN domains BY domain FULL, yahoo_tis_raw_counts BY domain;
yahoo_tis_counts = FOREACH yahoo_tis_all_domains GENERATE domains::domain AS domain, (yahoo_tis_raw_counts::yahoo_tis_count IS NULL ? 0 : yahoo_tis_raw_counts::yahoo_tis_count) AS yahoo_tis_count;
gmail_tis = FILTER gmail_messages BY (spam == 1);
gmail_tis_grouped = GROUP gmail_tis BY domain;
gmail_tis_raw_counts = FOREACH gmail_tis_grouped GENERATE group AS domain, (int)COUNT(gmail_tis) AS gmail_tis_count;
gmail_tis_all_domains = JOIN domains BY domain FULL, gmail_tis_raw_counts BY domain;
gmail_tis_counts = FOREACH gmail_tis_all_domains GENERATE domains::domain AS domain, (gmail_tis_raw_counts::gmail_tis_count IS NULL ? 0 : gmail_tis_raw_counts::gmail_tis_count) AS gmail_tis_count;
-- Unsubscribe Counts
-- Count how many Yahoo/Gmail Unsubscribe records we have for each domain
yahoo_unsubscribe = FILTER yahoo_messages BY (folder == 'Unsubscribe');
yahoo_unsubscribe_grouped = GROUP yahoo_unsubscribe BY domain;
yahoo_unsubscribe_raw_counts = FOREACH yahoo_unsubscribe_grouped GENERATE group AS domain, (int)COUNT(yahoo_unsubscribe) AS yahoo_unsubscribe_count;
yahoo_unsubscribe_all_domains = JOIN domains BY domain FULL, yahoo_unsubscribe_raw_counts BY domain;
yahoo_unsubscribe_counts = FOREACH yahoo_unsubscribe_all_domains GENERATE domains::domain AS domain, (yahoo_unsubscribe_raw_counts::yahoo_unsubscribe_count IS NULL ? 0 : yahoo_unsubscribe_raw_counts::yahoo_unsubscribe_count) AS yahoo_unsubscribe_count;
gmail_unsubscribe = FILTER gmail_messages BY (folder == 'Unsubscribe');
gmail_unsubscribe_grouped = GROUP gmail_unsubscribe BY domain;
gmail_unsubscribe_raw_counts = FOREACH gmail_unsubscribe_grouped GENERATE group AS domain, (int)COUNT(gmail_unsubscribe) AS gmail_unsubscribe_count;
gmail_unsubscribe_all_domains = JOIN domains BY domain FULL, gmail_unsubscribe_raw_counts BY domain;
gmail_unsubscribe_counts = FOREACH gmail_unsubscribe_all_domains GENERATE domains::domain AS domain, (gmail_unsubscribe_raw_counts::gmail_unsubscribe_count IS NULL ? 0 : gmail_unsubscribe_raw_counts::gmail_unsubscribe_count) AS gmail_unsubscribe_count;
-- Wrap-up
joined_raw = JOIN
yahoo_tis_counts BY domain,
yahoo_unsubscribe_counts BY domain,
gmail_tis_counts BY domain,
gmail_unsubscribe_counts BY domain,
yahoo_counts BY domain,
gmail_counts BY domain;
joined = FOREACH joined_raw GENERATE
yahoo_tis_counts::domain AS domain,
yahoo_count AS yahoo_count,
yahoo_tis_count AS yahoo_tis_count,
(yahoo_count == 0 ? NULL : ((float)yahoo_tis_count / (float)yahoo_count)) AS yahoo_tis_rate,
yahoo_unsubscribe_count AS yahoo_unsubscribe_count,
(yahoo_count == 0 ? NULL : ((float)yahoo_unsubscribe_count / (float)yahoo_count)) AS yahoo_unsubscribe_rate,
gmail_count AS gmail_count,
gmail_tis_count AS gmail_tis_count,
(gmail_count == 0 ? NULL : ((float)gmail_tis_count / (float)gmail_count)) AS gmail_tis_rate,
gmail_unsubscribe_count AS gmail_unsubscribe_count,
(gmail_count == 0 ? NULL : ((float)gmail_unsubscribe_count / (float)gmail_count)) AS gmail_unsubscribe_rate;
useful = FILTER joined BY (yahoo_tis_count > 0 OR yahoo_unsubscribe_count > 0 OR gmail_tis_count > 0 OR gmail_unsubscribe_count > 0);
STORE useful INTO 's3://oib-mapreduce-rmichael/gmail_tis';
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment