Last active
August 29, 2015 14:02
-
-
Save kerinin/98e0e620c490e1dcd976 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- This will need to be built and put on the machine you're running Hive from | |
add jar json-serde-1.1.9.3-SNAPSHOT-jar-with-dependencies.jar; | |
-- Creates the table based on the JSON schema | |
CREATE TABLE json_nested_test ( | |
metadata struct<uuid:string,md5_email:string,received_at:string,message_id:string,from_domain:string,message_signature:string,provider:string>, | |
headers map<string,struct<length:string,match_counts:map<string,string>,text_hash_counts:array<array<string>>,address_hashes:array<string>>>, | |
parts array<struct<mime_type:string,length:string,stripped_length:string,match_counts:map<string,string>,urls:array<struct<domain:string,sha:string>>,images:array<struct<domain:string,sha:string,pixel_count:string>>,text_hash_counts:array<array<string>>>>, | |
attachments array<struct<mime_type:string,bytesize:string,filename_sha:string,filetype:string>> | |
) | |
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe' | |
STORED AS TEXTFILE; | |
-- Eliminates exceptions if the JSON has issues (duplicate keys, etc) | |
ALTER TABLE json_nested_test SET SERDEPROPERTIES ( "ignore.malformed.json" = "true"); | |
-- Loads one of the files | |
LOAD DATA LOCAL INPATH '/na/mapr/data/extracted_message_features/v1/2014/21/2014-06-01/p_39_1648813793180' OVERWRITE INTO TABLE json_nested_test ; | |
-- Select the match counts where the first part is 'text/html' | |
SELECT parts[0].match_counts | |
FROM json_nested_test | |
WHERE parts[0].mime_type == 'text/html' | |
LIMIT 10; | |
-- Select [uuid, hash, count] triplets for all 'text/plain' parts | |
SELECT metadata.uuid, part_hash[0], part_hash[1] | |
FROM | |
json_nested_test | |
LATERAL VIEW explode(parts.text_hash_counts) LineHashes as line_hash | |
LATERAL VIEW explode(parts.mime_type) MimeTypes as mime_type | |
LATERAL VIEW explode(line_hash) PartHashes as part_hash | |
WHERE mime_type == 'text/plain' | |
LIMIT 10; | |
DROP TABLE json_nested_test; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment