azzaxp · May 2, 2012 10:12 · Jan 3, 2012
diff --git a/apache-logs-hive.sql b/apache-logs-hive.sql
@@ -0,0 +1,93 @@
+-- This is a Hive program. Hive is an SQL-like language that compiles
+-- into Hadoop Map/Reduce jobs. It's very popular among analysts at
+-- Facebook, because it allows them to query enormous Hadoop data
+-- stores using a language much like SQL.
+
+-- Our logs are stored on the Hadoop Distributed File System, in the
+-- directory /logs/randomhacks.net/access.  They're ordinary Apache
+-- logs in *.gz format.
+--
+-- We want to pretend that these gzipped log files are a database table,
+-- and use a regular expression to split lines into database columns.
+CREATE EXTERNAL TABLE access(
+  host STRING,
+  identity STRING,
+  user STRING,
+  time STRING,
+  request STRING,
+  status STRING,
+  size STRING,
+  referer STRING,
+  agent STRING)
+ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
+WITH SERDEPROPERTIES (
+  "input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*) (-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?",
+  "output.format.string" = "%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s"
+)
+STORED AS TEXTFILE
+LOCATION '/logs/randomhacks.net/access';
+
+-- We want to store our logs in HBase, which is designed to hold tables
+-- with billions of rows and millions of columns. HBase stores rows
+-- sorted by primary key, so you can efficiently read all records within
+-- a given range of keys.
+--
+-- Here, our key is a Unix time stamp, and we assume that it always has
+-- the same number of digits (hey, this was a dodgy late night hack).
+-- So we could easily grab all the records from a specific time period.
+--
+-- We store our data in 3 column families: "m" for metadata, "r" for
+-- referrer data, and "a" for user-agent data. This allows us to only
+-- load a subset of columns for a given query.
+CREATE EXTERNAL TABLE access_hbase(
+  key STRING,          -- Unix time + ":" + unique identifier.
+  host STRING,         -- The IP address of the host making the request.
+  identity STRING,     -- ??? (raw log data)
+  user STRING,         -- ??? (raw log data)
+  time BIGINT,         -- Unix time, UTC.
+  method STRING,       -- "GET", etc.
+  path STRING,         -- "/logo.png", etc.
+  protocol STRING,     -- "HTTP/1.1", etc.
+  status SMALLINT,     -- 200, 404, etc.
+  size BIGINT,         -- Response size, in bytes.
+  referer_host STRING, -- "www.google.com", etc.
+  referer STRING,      -- Full referrer string.
+  agent STRING)        -- Full agent string.
+STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
+WITH SERDEPROPERTIES (
+  "hbase.columns.mapping" = ":key,m:host,m:identity,m:user,m:time,m:method,m:path,m:protocol,m:status,m:size,r:referer_host,r:referer,a:agent"
+)
+TBLPROPERTIES ("hbase.table.name" = "randomhacks_access");
+
+-- Copy our data from raw Apache log files to HBase, cleaning it up as we go.  This is basically
+-- a pseudo-SQL query which calls a few Java helpers.
+--
+-- Note the "TABLESAMPLE" clause, which says to pick one of every 20 records at random.
+INSERT OVERWRITE TABLE access_hbase
+  SELECT concat(cast(unix_timestamp(time, "[dd/MMM/yyyy:HH:mm:ss Z]") AS STRING), ":", guid()) AS key,
+         host,
+         unquote_apache(identity),
+         unquote_apache(user),
+         unix_timestamp(time, "[dd/MMM/yyyy:HH:mm:ss Z]"),
+         re_extract(unquote_apache(request), "([^ ]*) ([^ ]*) ([^\"]*)", 1) AS method,
+         re_extract(unquote_apache(request), "([^ ]*) ([^ ]*) ([^\"]*)", 2) AS path,
+         re_extract(unquote_apache(request), "([^ ]*) ([^ ]*) ([^\"]*)", 3) AS protocol,
+         cast(status AS SMALLINT) AS status,
+         cast(size AS BIGINT) AS size,
+         re_extract(unquote_apache(referer), "[^:]+:?/+([^/]*).*", 1) AS referer_host,
+         unquote_apache(referer) AS referer,
+         unquote_apache(agent)
+    FROM access TABLESAMPLE(BUCKET 1 OUT OF 20 ON rand())
+    WHERE unix_timestamp(time, "[dd/MMM/yyyy:HH:mm:ss Z]") IS NOT NULL;
+
+-- Find the 50 most popular pages on the site.
+SELECT path, count(*) AS cnt
+  FROM access_hbase GROUP BY path
+  ORDER BY cnt DESC LIMIT 50;
+
+-- Categorize our articles by year and count how many hits each year received.
+SELECT pubyear, count(*)
+  FROM (SELECT re_extract(path, "/articles/([0-9]+)/.*", 1) AS pubyear
+          FROM access_hbase) access
+  WHERE pubyear IS NOT NULL
+  GROUP BY pubyear;