Created
February 18, 2013 19:21
-
-
Save kerinin/4979864 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
set default_parallel 92; | |
read_rate_records = LOAD 's3://oib-mapreduce/output/folder_scan_rollups/2013/06/2013-02-15' AS ( | |
scanned_at:chararray, | |
received_at:chararray, | |
mm_id, | |
source:chararray, | |
folder:chararray, | |
read, | |
subject:chararray, | |
ip:chararray, | |
header:chararray, | |
domain:chararray, | |
user_id:chararray, | |
ham:int, | |
address_book, | |
important, | |
tags:chararray, | |
replied, | |
forwarded, | |
spam:int, | |
has_priority, | |
is_public, | |
rp_tag:chararray, | |
on_content_redaction_whitelist:int, | |
to_domain:chararray, | |
eea_address:chararray, | |
to_address:chararray, | |
message_id:chararray, | |
real_folder_name:chararray); | |
raw_domains = FOREACH read_rate_records GENERATE domain AS domain; | |
domains = DISTINCT raw_domains; | |
-- Full Counts | |
-- Count how many Yahoo/Gmail records we have for each domain | |
yahoo_messages = FILTER read_rate_records BY ((source == 'YMAIL') AND (domain is not null)); | |
yahoo_grouped = GROUP yahoo_messages BY domain; | |
yahoo_count_raw = FOREACH yahoo_grouped GENERATE group AS domain, (int)COUNT(yahoo_messages) AS yahoo_count; | |
yahoo_count_joined = JOIN domains BY domain FULL, yahoo_count_raw BY domain; | |
yahoo_counts = FOREACH yahoo_count_joined GENERATE domains::domain AS domain, (yahoo_count IS NULL ? 0 : yahoo_count) AS yahoo_count; | |
gmail_messages = FILTER read_rate_records BY ((source == 'GMAIL') AND (domain is not null)); | |
gmail_grouped = GROUP gmail_messages BY domain; | |
gmail_count_raw = FOREACH gmail_grouped GENERATE group AS domain, (int)COUNT(gmail_messages) AS gmail_count; | |
gmail_count_joined = JOIN domains BY domain FULL, gmail_count_raw BY domain; | |
gmail_counts = FOREACH gmail_count_joined GENERATE domains::domain AS domain, (gmail_count IS NULL ? 0 : gmail_count) AS gmail_count; | |
-- TIS Counts | |
-- Count how many Yahoo/Gmail TIS records we have for each domain | |
yahoo_tis = FILTER yahoo_messages BY (spam == 1); | |
yahoo_tis_grouped = GROUP yahoo_tis BY domain; | |
yahoo_tis_raw_counts = FOREACH yahoo_tis_grouped GENERATE group AS domain, (int)COUNT(yahoo_tis) AS yahoo_tis_count; | |
yahoo_tis_all_domains = JOIN domains BY domain FULL, yahoo_tis_raw_counts BY domain; | |
yahoo_tis_counts = FOREACH yahoo_tis_all_domains GENERATE domains::domain AS domain, (yahoo_tis_raw_counts::yahoo_tis_count IS NULL ? 0 : yahoo_tis_raw_counts::yahoo_tis_count) AS yahoo_tis_count; | |
gmail_tis = FILTER gmail_messages BY (spam == 1); | |
gmail_tis_grouped = GROUP gmail_tis BY domain; | |
gmail_tis_raw_counts = FOREACH gmail_tis_grouped GENERATE group AS domain, (int)COUNT(gmail_tis) AS gmail_tis_count; | |
gmail_tis_all_domains = JOIN domains BY domain FULL, gmail_tis_raw_counts BY domain; | |
gmail_tis_counts = FOREACH gmail_tis_all_domains GENERATE domains::domain AS domain, (gmail_tis_raw_counts::gmail_tis_count IS NULL ? 0 : gmail_tis_raw_counts::gmail_tis_count) AS gmail_tis_count; | |
-- Unsubscribe Counts | |
-- Count how many Yahoo/Gmail Unsubscribe records we have for each domain | |
yahoo_unsubscribe = FILTER yahoo_messages BY (folder == 'Unsubscribe'); | |
yahoo_unsubscribe_grouped = GROUP yahoo_unsubscribe BY domain; | |
yahoo_unsubscribe_raw_counts = FOREACH yahoo_unsubscribe_grouped GENERATE group AS domain, (int)COUNT(yahoo_unsubscribe) AS yahoo_unsubscribe_count; | |
yahoo_unsubscribe_all_domains = JOIN domains BY domain FULL, yahoo_unsubscribe_raw_counts BY domain; | |
yahoo_unsubscribe_counts = FOREACH yahoo_unsubscribe_all_domains GENERATE domains::domain AS domain, (yahoo_unsubscribe_raw_counts::yahoo_unsubscribe_count IS NULL ? 0 : yahoo_unsubscribe_raw_counts::yahoo_unsubscribe_count) AS yahoo_unsubscribe_count; | |
gmail_unsubscribe = FILTER gmail_messages BY (folder == 'Unsubscribe'); | |
gmail_unsubscribe_grouped = GROUP gmail_unsubscribe BY domain; | |
gmail_unsubscribe_raw_counts = FOREACH gmail_unsubscribe_grouped GENERATE group AS domain, (int)COUNT(gmail_unsubscribe) AS gmail_unsubscribe_count; | |
gmail_unsubscribe_all_domains = JOIN domains BY domain FULL, gmail_unsubscribe_raw_counts BY domain; | |
gmail_unsubscribe_counts = FOREACH gmail_unsubscribe_all_domains GENERATE domains::domain AS domain, (gmail_unsubscribe_raw_counts::gmail_unsubscribe_count IS NULL ? 0 : gmail_unsubscribe_raw_counts::gmail_unsubscribe_count) AS gmail_unsubscribe_count; | |
-- Wrap-up | |
joined_raw = JOIN | |
yahoo_tis_counts BY domain, | |
yahoo_unsubscribe_counts BY domain, | |
gmail_tis_counts BY domain, | |
gmail_unsubscribe_counts BY domain, | |
yahoo_counts BY domain, | |
gmail_counts BY domain; | |
joined = FOREACH joined_raw GENERATE | |
yahoo_tis_counts::domain AS domain, | |
yahoo_count AS yahoo_count, | |
yahoo_tis_count AS yahoo_tis_count, | |
(yahoo_count == 0 ? NULL : ((float)yahoo_tis_count / (float)yahoo_count)) AS yahoo_tis_rate, | |
yahoo_unsubscribe_count AS yahoo_unsubscribe_count, | |
(yahoo_count == 0 ? NULL : ((float)yahoo_unsubscribe_count / (float)yahoo_count)) AS yahoo_unsubscribe_rate, | |
gmail_count AS gmail_count, | |
gmail_tis_count AS gmail_tis_count, | |
(gmail_count == 0 ? NULL : ((float)gmail_tis_count / (float)gmail_count)) AS gmail_tis_rate, | |
gmail_unsubscribe_count AS gmail_unsubscribe_count, | |
(gmail_count == 0 ? NULL : ((float)gmail_unsubscribe_count / (float)gmail_count)) AS gmail_unsubscribe_rate; | |
useful = FILTER joined BY (yahoo_tis_count > 0 OR yahoo_unsubscribe_count > 0 OR gmail_tis_count > 0 OR gmail_unsubscribe_count > 0); | |
STORE useful INTO 's3://oib-mapreduce-rmichael/gmail_tis'; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment