Skip to content

Instantly share code, notes, and snippets.

@cjmatta
Last active August 29, 2015 14:14
Show Gist options
  • Save cjmatta/d4e5f0423b964707b511 to your computer and use it in GitHub Desktop.
Save cjmatta/d4e5f0423b964707b511 to your computer and use it in GitHub Desktop.
Basic Drill Demo

Drill Demo - MapR

Retail data

Show logs flat JSON:

head -n 5 /mapr/*/data/flat/logs/2012/1/log.json

Show how it can be queried:

select * from mfs.flat.`logs` limit 10;

Show how the directory can be included in the query:

select * from mfs.flat.logs where dir0 = 2012 limit 10; 
select * from mfs.flat.logs where dir0 = 2012 and dir1 >=9 limit 10;

Nested JSON

select * from mfs.nested.clicks limit 10;

Notice that the user_info and trans_info are themselves objects, or maps.

select
c.`trans_id`,
c.`date`,
c.`time`,
c.`user_info`.`cust_id` as `cust_id`,
c.`user_info`.`device` as `device`,
c.`user_info`.`state` as `state`,
c.`trans_info`.`prod_id` as `prod_id`,
c.`trans_info`.`purch_flag` as `purchase_flag`
from mfs.nested.clicks c limit 10;

Casting

SELECT
CAST(`trans_id` as BIGINT) as `trans_id`,
CAST(`date` as DATE) as `date`,
CAST(`time` as TIME) as `time`,
CAST(user_info['cust_id'] as INT) as cust_id,
CAST(user_info['device'] as VARCHAR(255)) as device_type,
CAST(user_info['state'] as VARCHAR(255)) as state,
CAST(ad_info['camp_id'] as VARCHAR(255)) as camp_id,
CAST(trans_info['prod_id'][0] as INT) as prod_id,
CAST(trans_info['purch_flag'] as VARCHAR(255)) as purch_flag
FROM `mfs`.`nested`.`./clicks/clicks.campaign.json` limit 10;

View Creation

Create the nested clickview

CREATE OR REPLACE VIEW mfs.views.`nestedclickview` AS
SELECT
CAST(`trans_id` as BIGINT) as `trans_id`,
CAST(`date` as DATE) as `date`,
CAST(`time` as TIME) as `time`,
CAST(user_info['cust_id'] as INT) as cust_id,
CAST(user_info['device'] as VARCHAR(255)) as device_type,
CAST(user_info['state'] as VARCHAR(255)) as state,
CAST(ad_info['camp_id'] as VARCHAR(255)) as camp_id,
CAST(trans_info['prod_id'][0] as INT) as prod_id,
CAST(trans_info['purch_flag'] as BOOLEAN) as purch_flag
FROM `mfs`.`nested`.`./clicks/clicks.campaign.json`;
After view creation
SELECT t.`date`, count(*) as `count` FROM mfs.views.`nestedclickview` t GROUP BY t.`date` ORDER BY `count` LIMIT 10;

test customer view

select 
cast(row_key as int) as row_key,
cast(`address`['state'] as VARCHAR(255)) as state,
cast(`loyalty`['agg_rev'] as VARCHAR(255)) as agg_rev, 
cast(`loyalty`['membership'] as VARCHAR(255)) as membership,
cast(`personal`['name'] as VARCHAR(255)) as name,
cast(`personal`['age'] as VARCHAR(255)) as age,
cast(`personal`['gender'] as VARCHAR(255)) as gender
from `maprdb`.`customers` limit 10;

Create the customer view

create or replace view mfs.views.custview as select 
cast(row_key as bigint) as cust_id,
cast(`address`['state'] as VARCHAR(255)) as state,
cast(`loyalty`['agg_rev'] as VARCHAR(255)) as agg_rev, 
cast(`loyalty`['membership'] as VARCHAR(255)) as membership,
cast(`personal`['name'] as VARCHAR(255)) as name,
cast(`personal`['age'] as VARCHAR(255)) as age,
cast(`personal`['gender'] as VARCHAR(255)) as gender
from `maprdb`.`customers`;

Membership numbers

select membership, count(1) from mfs.views.custview group by membership;

    
#### Create the clickview

CREATE OR REPLACE VIEW mfs.views.clickview AS SELECT
* FROM `mfs`.`flat`.`logs`;

Test Products table

SELECT
CAST(row_key as INT) as prod_id,
CAST(details['category'] AS VARCHAR(255)) as category,
CAST(details['name'] as VARCHAR(255)) as product_name,
CAST(pricing['price'] as FLOAT) as price
FROM maprdb.`products` LIMIT 100;

Create products view

create or replace view mfs.views.productview as SELECT
CAST(row_key as BIGINT) as prod_id,
CAST(details['category'] AS VARCHAR(255)) as category,
CAST(details['name'] as VARCHAR(255)) as product_name,
CAST(pricing['price'] as FLOAT) as price
FROM maprdb.`products`;

Joins

Join click view to products:

SELECT n.*, p.* FROM `nestedclickview` n JOIN `productview` p ON n.prod_id = p.prod_id limit 10;

Unified view

create or replace view mfs.views.unified as SELECT * 
FROM mfs.views.`nestedclickview` as a
JOIN mfs.views.`productview` b ON a.prod_id = b.prod_id
JOIN mfs.views.`custview` c ON c.cust_id = a.cust_id
JOIN hive.`default`.`orders` d ON d.cust_id = c.cust_id AND d.prod_id = b.prod_id;

Twitter Data

Shell exploration of data

Show that tweet data is stored on MapR-FS has .json files:

$ cd /mapr/cmatta.democluster/user/cmatta/tweets/
$ find . | head -10
.
./blackfriday
./blackfriday/2014
./blackfriday/2014/11
./blackfriday/2014/11/27
./blackfriday/2014/11/27/01
./blackfriday/2014/11/27/01/tweets.json
./blackfriday/2014/11/27/20
./blackfriday/2014/11/27/20/tweets.json
./blackfriday/2014/11/27/15

Show that we can explore this data with the usual unix tools (find, cat, head, even python):

$ head -n 1 ./blackfriday/2014/11/27/01/tweets.json
---messy json---

Show that we can use python's json.tool to pretty print it:

$ head -n 1 ./blackfriday/2014/11/27/01/tweets.json | python -m json.tool
--- pretty json ---

SQLLine

Get just the tweets from the json files:

> select text from `tweets/blackfriday` limit 10;

Get username and tweet:

> select t.`user`.`name`, t.`text` from `tweets/blackfriday` t limit 10;

Twitter views:

Tweets:

create or replace view mfs.views.twitter_tweets as select
CAST(t.`dir3` as INT) as `hour`,
CAST(t.`dir2` as INT) as `day`,
CAST(t.`dir1` as INT) as `month`,
CAST(t.`dir0` as INT) as `year`,
CAST(t.`id` as BIGINT) as `id`,
CAST(t.`user`.`id` as BIGINT) as `user_id`,
CAST(t.`text` as VARCHAR(140)) as `tweet`,
to_timestamp(t.`created_at`, 'EEE MMM dd HH:mm:ss Z YYYY') as `created_at`,
CAST(t.`favorited` as BOOLEAN) as `favorited`,
CAST(t.`favorite_count` as INT) as `favorite_count`,
CAST(t.`retweeted` as BOOLEAN) as `retweeted`,
CAST(t.`retweet_count` as INT) as `retweet_count`,
CAST(t.`possibly_sensitive` as BOOLEAN) as `possibly_sensetive`,
CAST(t.`place` as VARCHAR(140)) as `place`,
t.`coordinates`,
CAST(t.`source` as VARCHAR(140)) as `source`
from mfs.`/user/cmatta/tweets/blackfriday` t;

Users:

create or replace view mfs.views.twitter_users as
select CAST(t.`user`.`id` as BIGINT) as `user_id`,
CAST(t.`user`.`name` as VARCHAR(128)) as `name`,
CAST(t.`user`.`screen_name` as VARCHAR(128)) as `screen_name`,
CAST(t.`user`.`location` as VARCHAR(128)) as `location`,
CAST(t.`user`.`lang` as VARCHAR(128)) as `language`
from mfs.`/user/cmatta/tweets` t;

Yelp data

show files in `yelp`;

select * from mfs.cmatta.`yelp/yelp_academic_dataset_business.json` limit 10;

select name, hours from mfs.cmatta.`yelp/yelp_academic_dataset_business.json` limit 10;

select name, kvgen(hours) from mfs.cmatta.`yelp/yelp_academic_dataset_business.json` limit 10;

select name, flatten(kvgen(hours)) from mfs.cmatta.`yelp/yelp_academic_dataset_business.json` limit 10;

Philly crime database

source: https://www.opendataphilly.org/dataset/philadelphia-police-part-one-crime-incidents

Show CSV data

head -5 /mapr/cmatta.democluster/data/other/police/police_inct.csv

Show raw select of CSV file

SELECT * FROM mfs.root.`other/police/police_inct.csv` limit 10;

View create:

CREATE OR REPLACE VIEW mfs.views.`crime` AS SELECT
CAST(`columns`[0] AS INTEGER) AS `DC_DIST`,
CAST(`columns`[2] AS TIMESTAMP) AS `datetime`,
CAST(`columns`[7] AS VARCHAR(128)) AS `location`,
CAST(`columns`[10] AS VARCHAR(128)) AS `general_code`,
CAST(CASE WHEN `columns`[11] = '' THEN '0' ELSE `columns`[11] END AS DOUBLE) AS `latitude`,
CAST(CASE WHEN `columns`[12] = '' THEN '0' ELSE `columns`[12] END AS DOUBLE) AS `longitude`
FROM `mfs`.`root`.`other/police/police_inct.csv` AS `p`

Queries:

Common Crimes

select `general_code`, count(*) as `count` FROM `crime` GROUP BY `general_code` ORDER BY `count` DESC limit 10;

High crime areas, and the crimes

select `location`, `general_code`, count(*) as `count` FROM `crime` GROUP BY `location`, `general_code` ORDER BY `count` DESC limit 10;

Crimes by Day of Week

SELECT to_char(`datetime`, 'E') as `weekday`, COUNT(*) as `count` FROM `crime` GROUP BY to_char(`datetime`, 'E') ORDER BY `count` DESC;

Crimes by Hour

SELECT DATE_PART('HOUR', `datetime`) as `Hour`, COUNT(*) as `count` FROM `crime` GROUP BY DATE_PART('HOUR', `datetime`) ORDER BY `count` DESC;

Weather

CREATE OR REPLACE VIEW mfs.views.`weather` AS SELECT
CAST(CASE WHEN `columns`[1] = 'N/A' THEN NULL WHEN `columns`[1] = '' THEN NULL ELSE `columns`[1] END as FLOAT) as `temperaturef`,
CAST(CASE WHEN `columns`[2] = 'N/A' THEN NULL WHEN `columns`[2] = '' THEN NULL ELSE `columns`[2] END as FLOAT) as `dewpointf`,
CAST(CASE WHEN `columns`[3] = 'N/A' THEN NULL WHEN `columns`[3] = '' THEN NULL ELSE `columns`[3] END as FLOAT) as `humidity`,
CAST(CASE WHEN `columns`[9] = 'N/A' THEN NULL WHEN `columns`[9] = '' THEN NULL ELSE `columns`[9] END AS FLOAT) AS `precipitationin`,
CAST(CASE WHEN `columns`[10] = 'N/A' THEN NULL WHEN `columns`[10] = '' THEN NULL ELSE `columns`[10] END as VARCHAR(128)) as `events`,
CAST(CASE WHEN `columns`[11] = 'N/A' THEN NULL WHEN `columns`[11] = '' THEN NULL ELSE `columns`[11] END as VARCHAR(128)) as `conditions`,
CAST(CASE WHEN `columns`[12] = 'N/A' THEN NULL WHEN `columns`[12] = '' THEN NULL ELSE `columns`[12] END as FLOAT) as `winddirectiondegrees`,
CAST(CASE WHEN `columns`[13] = 'N/A' THEN NULL WHEN `columns`[13] = '' THEN NULL ELSE `columns`[13] END as TIMESTAMP) as `datetime`,
CAST(`dir0` as VARCHAR(8)) as `airportcode`,
CAST(`dir1` as INT) as `year`,
CAST(`dir2` as INT) as `month`
FROM mfs.root.`other/weather`;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment