Skip to content

Instantly share code, notes, and snippets.

@nsabharwal
Last active June 21, 2016 21:45
Show Gist options
  • Save nsabharwal/806441d8b92a5635240ba04ae0e16dfd to your computer and use it in GitHub Desktop.
Save nsabharwal/806441d8b92a5635240ba04ae0e16dfd to your computer and use it in GitHub Desktop.
su - hdfs
git clone https://github.com/pivotalsoftware/pivotal-samples.git
# I copied the files on /mnt
cd /mnt/pivotal-samples/sample-data
# load data into HDFS
sh -x load_data_to_HDFS.sh ## it will show you the output of the commands the script is running
hdfs dfs -ls /retail_demo
#### Hive table #####
https://github.com/pivotalsoftware/pivotal-samples/blob/master/hive/create_hive_tables.sql
cd /mnt/pivotal-samples/hive
hive -f create_hive_tables.sql
hive -e 'use retail_demo; show tables;' ### 9 tables in my case
#
#load data into hive tables
LOAD DATA LOCAL INPATH '/mnt/pivotal-samples/sample-data/order_lineitems.tsv.gz' OVERWRITE INTO TABLE retail_demo.order_lineitems_hive;
LOAD DATA LOCAL INPATH '/mnt/pivotal-samples/sample-data/orders.tsv.gz' OVERWRITE INTO TABLE retail_demo.orders_hive;
LOAD DATA LOCAL INPATH '/mnt/pivotal-samples/sample-data/products_dim.tsv.gz' OVERWRITE INTO TABLE retail_demo.products_dim_hive;
LOAD DATA LOCAL INPATH '/mnt/pivotal-samples/sample-data/categories_dim.tsv.gz' OVERWRITE INTO TABLE retail_demo.categories_dim_hive;
LOAD DATA LOCAL INPATH '/mnt/pivotal-samples/sample-data/email_addresses_dim.tsv.gz' OVERWRITE INTO TABLE retail_demo.email_addresses_dim_hive;
LOAD DATA LOCAL INPATH '/mnt/pivotal-samples/sample-data/date_dim.tsv.gz' OVERWRITE INTO TABLE retail_demo.date_dim_hive;
LOAD DATA LOCAL INPATH '/mnt/pivotal-samples/sample-data/customers_dim.tsv.gz' OVERWRITE INTO TABLE retail_demo.customers_dim_hive;
LOAD DATA LOCAL INPATH '/mnt/pivotal-samples/sample-data/payment_methods.tsv.gz' OVERWRITE INTO TABLE retail_demo.payment_methods_hive;
LOAD DATA LOCAL INPATH '/mnt/pivotal-samples/sample-data/customer_addresses_dim.tsv.gz' OVERWRITE INTO TABLE retail_demo.customer_addresses_dim_hive;
###########################
Queries
https://github.com/pivotalsoftware/pivotal-samples/blob/master/Labs/Lab_7/Queries.sql
-- Every customer (actually, I limited it to 10) with their first order ID/date and last order ID/date
select customer_id
, first_order_date
, first_order_id
, last_order_date
, last_order_id
from (select customer_id
, first_value(order_datetime) over (partition by customer_id order by order_datetime asc) as first_order_date
, first_value(order_id) over (partition by customer_id order by order_datetime asc) as first_order_id
, last_value(order_datetime) over (partition by customer_id order by order_datetime asc) as last_order_date
, last_value(order_id) over (partition by customer_id order by order_datetime asc) as last_order_id
from hcatalog.retail_demo.orders_hive
) base
limit 10
;
select billing_address_postal_code, sum(total_paid_amount::float8) as total, sum(total_tax_amount::float8) as tax
from hcatalog.retail_demo.orders
group by billing_address_postal_code
order by total desc limit 10;
-- Top 10 categories in terms of items sold for all time
SELECT product_id
, product_category_id
, product_count
, category_rank
FROM (SELECT product_id, product_category_id
, SUM(item_quantity::int) AS product_count
, row_number() OVER (PARTITION BY product_category_id ORDER BY SUM(item_quantity::int) DESC) AS category_rank
FROM hcatalog.retail_demo.order_lineitems_hive
GROUP BY product_id, product_category_id
) AS lineitems
WHERE category_rank <= 10
ORDER BY product_category_id, category_rank
;
-- Week over week sales numbers
SELECT CASE WHEN order_datetime::timestamp < timestamp '2010-10-08' THEN date_trunc('day', order_datetime::timestamp + interval ' 1 week')
ELSE date_trunc('day', order_datetime::timestamp)
END::date AS order_day
, SUM(CASE WHEN order_datetime >= timestamp '2010-01-08' THEN 1 ELSE 0 END) AS num__orders_current
, SUM(CASE WHEN order_datetime < timestamp '2010-01-08' THEN 1 ELSE 0 END) AS num__orders_last_week
FROM retail_demo.order_lineitems_hive
WHERE hcatalog.order_datetime BETWEEN timestamp '2010-10-01' AND timestamp '2010-10-15 23:59:59'
GROUP BY 1
ORDER BY 1
;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment