Last active
August 29, 2015 13:57
-
-
Save mwinkle/9808807 to your computer and use it in GitHub Desktop.
This is the result of work done with https://issues.apache.org/jira/browse/HIVE-3554 and https://issues.apache.org/jira/browse/HIVE-5795 that enable easy traversal of subdirectories, and exclusion of header rows. This makes Hive a turnkey way to query any type of log data partitioned by time series, without having to explicitly program the parti…
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
CREATE EXTERNAL TABLE websites_logs_raw (datereq STRING, | |
timereq STRING, | |
s_sitename STRING, | |
cs_method STRING, | |
cs_uri_stem STRING, | |
cs_uri_query STRING, | |
s_port STRING, | |
cs_username STRING, | |
c_ip STRING, | |
cs_User_Agent STRING, | |
cs_Cookie STRING, | |
cs_Referer STRING, | |
cs_host STRING, | |
sc_status INT, | |
sc_substatus STRING, | |
sc_win32_status STRING, | |
sc_bytes INT, | |
cs_bytes INT, | |
time_taken INT | |
) | |
ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' | |
STORED AS TEXTFILE | |
LOCATION 'wasb://[email protected]/path_to_logs' | |
tblproperties ("skip.header.line.count"="1"); | |
set mapred.input.dir.recursive=true; | |
set hive.mapred.supports.subdirectories=true; | |
select cs_Referer, count(*) as totalCount, avg(time_taken) as averageTime, sum(sc_bytes) as totalBytes | |
from websites_logs_raw | |
where sc_status =200 | |
group by cs_Referer | |
order by totalCount DESC | |
limit 25; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment