-
-
Save mattbornski/4062959 to your computer and use it in GitHub Desktop.
Apache log analysis with Hadoop, Hive and HBase
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- This is a Hive program. Hive is an SQL-like language that compiles | |
-- into Hadoop Map/Reduce jobs. It's very popular among analysts at | |
-- Facebook, because it allows them to query enormous Hadoop data | |
-- stores using a language much like SQL. | |
-- Our logs are stored on the Hadoop Distributed File System, in the | |
-- directory /logs/randomhacks.net/access. They're ordinary Apache | |
-- logs in *.gz format. | |
-- | |
-- We want to pretend that these gzipped log files are a database table, | |
-- and use a regular expression to split lines into database columns. | |
CREATE EXTERNAL TABLE access( | |
host STRING, | |
identity STRING, | |
user STRING, | |
time STRING, | |
request STRING, | |
status STRING, | |
size STRING, | |
referer STRING, | |
agent STRING) | |
ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe' | |
WITH SERDEPROPERTIES ( | |
"input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*) (-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?", | |
"output.format.string" = "%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s" | |
) | |
STORED AS TEXTFILE | |
LOCATION '/logs/randomhacks.net/access'; | |
-- We want to store our logs in HBase, which is designed to hold tables | |
-- with billions of rows and millions of columns. HBase stores rows | |
-- sorted by primary key, so you can efficiently read all records within | |
-- a given range of keys. | |
-- | |
-- Here, our key is a Unix time stamp, and we assume that it always has | |
-- the same number of digits (hey, this was a dodgy late night hack). | |
-- So we could easily grab all the records from a specific time period. | |
-- | |
-- We store our data in 3 column families: "m" for metadata, "r" for | |
-- referrer data, and "a" for user-agent data. This allows us to only | |
-- load a subset of columns for a given query. | |
CREATE EXTERNAL TABLE access_hbase( | |
key STRING, -- Unix time + ":" + unique identifier. | |
host STRING, -- The IP address of the host making the request. | |
identity STRING, -- ??? (raw log data) | |
user STRING, -- ??? (raw log data) | |
time BIGINT, -- Unix time, UTC. | |
method STRING, -- "GET", etc. | |
path STRING, -- "/logo.png", etc. | |
protocol STRING, -- "HTTP/1.1", etc. | |
status SMALLINT, -- 200, 404, etc. | |
size BIGINT, -- Response size, in bytes. | |
referer_host STRING, -- "www.google.com", etc. | |
referer STRING, -- Full referrer string. | |
agent STRING) -- Full agent string. | |
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler' | |
WITH SERDEPROPERTIES ( | |
"hbase.columns.mapping" = ":key,m:host,m:identity,m:user,m:time,m:method,m:path,m:protocol,m:status,m:size,r:referer_host,r:referer,a:agent" | |
) | |
TBLPROPERTIES ("hbase.table.name" = "randomhacks_access"); | |
-- Copy our data from raw Apache log files to HBase, cleaning it up as we go. This is basically | |
-- a pseudo-SQL query which calls a few Java helpers. | |
-- | |
-- Note the "TABLESAMPLE" clause, which says to pick one of every 20 records at random. | |
INSERT OVERWRITE TABLE access_hbase | |
SELECT concat(cast(unix_timestamp(time, "[dd/MMM/yyyy:HH:mm:ss Z]") AS STRING), ":", guid()) AS key, | |
host, | |
unquote_apache(identity), | |
unquote_apache(user), | |
unix_timestamp(time, "[dd/MMM/yyyy:HH:mm:ss Z]"), | |
re_extract(unquote_apache(request), "([^ ]*) ([^ ]*) ([^\"]*)", 1) AS method, | |
re_extract(unquote_apache(request), "([^ ]*) ([^ ]*) ([^\"]*)", 2) AS path, | |
re_extract(unquote_apache(request), "([^ ]*) ([^ ]*) ([^\"]*)", 3) AS protocol, | |
cast(status AS SMALLINT) AS status, | |
cast(size AS BIGINT) AS size, | |
re_extract(unquote_apache(referer), "[^:]+:?/+([^/]*).*", 1) AS referer_host, | |
unquote_apache(referer) AS referer, | |
unquote_apache(agent) | |
FROM access TABLESAMPLE(BUCKET 1 OUT OF 20 ON rand()) | |
WHERE unix_timestamp(time, "[dd/MMM/yyyy:HH:mm:ss Z]") IS NOT NULL; | |
-- Find the 50 most popular pages on the site. | |
SELECT path, count(*) AS cnt | |
FROM access_hbase GROUP BY path | |
ORDER BY cnt DESC LIMIT 50; | |
-- Categorize our articles by year and count how many hits each year received. | |
SELECT pubyear, count(*) | |
FROM (SELECT re_extract(path, "/articles/([0-9]+)/.*", 1) AS pubyear | |
FROM access_hbase) access | |
WHERE pubyear IS NOT NULL | |
GROUP BY pubyear; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment