Last active
June 21, 2016 21:45
-
-
Save nsabharwal/806441d8b92a5635240ba04ae0e16dfd to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
su - hdfs | |
git clone https://github.com/pivotalsoftware/pivotal-samples.git | |
# I copied the files on /mnt | |
cd /mnt/pivotal-samples/sample-data | |
# load data into HDFS | |
sh -x load_data_to_HDFS.sh ## it will show you the output of the commands the script is running | |
hdfs dfs -ls /retail_demo | |
#### Hive table ##### | |
https://github.com/pivotalsoftware/pivotal-samples/blob/master/hive/create_hive_tables.sql | |
cd /mnt/pivotal-samples/hive | |
hive -f create_hive_tables.sql | |
hive -e 'use retail_demo; show tables;' ### 9 tables in my case | |
# | |
#load data into hive tables | |
LOAD DATA LOCAL INPATH '/mnt/pivotal-samples/sample-data/order_lineitems.tsv.gz' OVERWRITE INTO TABLE retail_demo.order_lineitems_hive; | |
LOAD DATA LOCAL INPATH '/mnt/pivotal-samples/sample-data/orders.tsv.gz' OVERWRITE INTO TABLE retail_demo.orders_hive; | |
LOAD DATA LOCAL INPATH '/mnt/pivotal-samples/sample-data/products_dim.tsv.gz' OVERWRITE INTO TABLE retail_demo.products_dim_hive; | |
LOAD DATA LOCAL INPATH '/mnt/pivotal-samples/sample-data/categories_dim.tsv.gz' OVERWRITE INTO TABLE retail_demo.categories_dim_hive; | |
LOAD DATA LOCAL INPATH '/mnt/pivotal-samples/sample-data/email_addresses_dim.tsv.gz' OVERWRITE INTO TABLE retail_demo.email_addresses_dim_hive; | |
LOAD DATA LOCAL INPATH '/mnt/pivotal-samples/sample-data/date_dim.tsv.gz' OVERWRITE INTO TABLE retail_demo.date_dim_hive; | |
LOAD DATA LOCAL INPATH '/mnt/pivotal-samples/sample-data/customers_dim.tsv.gz' OVERWRITE INTO TABLE retail_demo.customers_dim_hive; | |
LOAD DATA LOCAL INPATH '/mnt/pivotal-samples/sample-data/payment_methods.tsv.gz' OVERWRITE INTO TABLE retail_demo.payment_methods_hive; | |
LOAD DATA LOCAL INPATH '/mnt/pivotal-samples/sample-data/customer_addresses_dim.tsv.gz' OVERWRITE INTO TABLE retail_demo.customer_addresses_dim_hive; | |
########################### | |
Queries | |
https://github.com/pivotalsoftware/pivotal-samples/blob/master/Labs/Lab_7/Queries.sql | |
-- Every customer (actually, I limited it to 10) with their first order ID/date and last order ID/date | |
select customer_id | |
, first_order_date | |
, first_order_id | |
, last_order_date | |
, last_order_id | |
from (select customer_id | |
, first_value(order_datetime) over (partition by customer_id order by order_datetime asc) as first_order_date | |
, first_value(order_id) over (partition by customer_id order by order_datetime asc) as first_order_id | |
, last_value(order_datetime) over (partition by customer_id order by order_datetime asc) as last_order_date | |
, last_value(order_id) over (partition by customer_id order by order_datetime asc) as last_order_id | |
from hcatalog.retail_demo.orders_hive | |
) base | |
limit 10 | |
; | |
select billing_address_postal_code, sum(total_paid_amount::float8) as total, sum(total_tax_amount::float8) as tax | |
from hcatalog.retail_demo.orders | |
group by billing_address_postal_code | |
order by total desc limit 10; | |
-- Top 10 categories in terms of items sold for all time | |
SELECT product_id | |
, product_category_id | |
, product_count | |
, category_rank | |
FROM (SELECT product_id, product_category_id | |
, SUM(item_quantity::int) AS product_count | |
, row_number() OVER (PARTITION BY product_category_id ORDER BY SUM(item_quantity::int) DESC) AS category_rank | |
FROM hcatalog.retail_demo.order_lineitems_hive | |
GROUP BY product_id, product_category_id | |
) AS lineitems | |
WHERE category_rank <= 10 | |
ORDER BY product_category_id, category_rank | |
; | |
-- Week over week sales numbers | |
SELECT CASE WHEN order_datetime::timestamp < timestamp '2010-10-08' THEN date_trunc('day', order_datetime::timestamp + interval ' 1 week') | |
ELSE date_trunc('day', order_datetime::timestamp) | |
END::date AS order_day | |
, SUM(CASE WHEN order_datetime >= timestamp '2010-01-08' THEN 1 ELSE 0 END) AS num__orders_current | |
, SUM(CASE WHEN order_datetime < timestamp '2010-01-08' THEN 1 ELSE 0 END) AS num__orders_last_week | |
FROM retail_demo.order_lineitems_hive | |
WHERE hcatalog.order_datetime BETWEEN timestamp '2010-10-01' AND timestamp '2010-10-15 23:59:59' | |
GROUP BY 1 | |
ORDER BY 1 | |
; | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment