Created
July 16, 2012 23:37
-
-
Save kiyoto/3125858 to your computer and use it in GitHub Desktop.
Book Crossing Dataset Status Queries
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Queries 1 | |
td query -w -d book_crossing_dataset " | |
SELECT t AS type, cnt | |
FROM | |
( | |
SELECT COUNT(*) AS cnt, 'only in users' AS t | |
FROM | |
( | |
SELECT user_id | |
FROM users | |
GROUP BY user_id | |
) t1 | |
LEFT OUTER JOIN | |
( | |
SELECT user_id | |
FROM ratings | |
GROUP BY user_id | |
) t2 | |
ON | |
(t1.user_id=t2.user_id) | |
WHERE t2.user_id IS NULL | |
UNION ALL | |
SELECT COUNT(*) AS cnt, 'only in ratings' AS t | |
FROM | |
( | |
SELECT user_id | |
FROM ratings | |
GROUP BY user_id | |
) t1 | |
LEFT OUTER JOIN | |
( | |
SELECT user_id | |
FROM users | |
GROUP BY user_id | |
) t2 | |
ON | |
(t1.user_id=t2.user_id) | |
WHERE t2.user_id IS NULL | |
UNION ALL | |
SELECT COUNT(*) AS cnt, 'both' AS t | |
FROM | |
( | |
SELECT user_id | |
FROM ratings | |
GROUP BY user_id | |
) t1 | |
JOIN | |
( | |
SELECT user_id | |
FROM users | |
GROUP BY user_id | |
) t2 | |
ON | |
(t1.user_id=t2.user_id) | |
) o1 | |
" | |
td query -w -d book_crossing_dataset " | |
SELECT t1.cnt AS all_books, t2.cnt AS active_books, ROUND(t2.cnt/t1.cnt*100) AS active_rate | |
FROM | |
( | |
SELECT COUNT(distinct isbn) as cnt, 1 AS one | |
FROM books | |
) t1 | |
JOIN | |
( | |
SELECT COUNT(distinct isbn) as cnt, 1 AS one | |
FROM ratings | |
) t2 | |
ON | |
(t1.one=t2.one) | |
" | |
td query -w -d book_crossing_dataset " | |
SELECT t AS type, cnt | |
FROM | |
( | |
SELECT COUNT(*) AS cnt, 'only in books' AS t | |
FROM | |
( | |
SELECT isbn | |
FROM books | |
GROUP BY isbn | |
) t1 | |
LEFT OUTER JOIN | |
( | |
SELECT isbn | |
FROM ratings | |
GROUP BY isbn | |
) t2 | |
ON | |
(t1.isbn=t2.isbn) | |
WHERE t2.isbn IS NULL | |
UNION ALL | |
SELECT COUNT(*) AS cnt, 'only in ratings' AS t | |
FROM | |
( | |
SELECT isbn | |
FROM ratings | |
GROUP BY isbn | |
) t1 | |
LEFT OUTER JOIN | |
( | |
SELECT isbn | |
FROM books | |
GROUP BY isbn | |
) t2 | |
ON | |
(t1.isbn=t2.isbn) | |
WHERE t2.isbn IS NULL | |
UNION ALL | |
SELECT COUNT(*) AS cnt, 'both' AS t | |
FROM | |
( | |
SELECT isbn | |
FROM ratings | |
GROUP BY isbn | |
) t1 | |
JOIN | |
( | |
SELECT isbn | |
FROM books | |
GROUP BY isbn | |
) t2 | |
ON | |
(t1.isbn=t2.isbn) | |
) o1 | |
" | |
-- Queries 2 | |
-- Per country | |
td query -w -d book_crossing_dataset -f csv -o user_dist_of_country.csv " | |
SELECT country, cnt, ROUND(cnt/total_cnt*10000)/100 AS rate | |
FROM | |
( | |
SELECT country, COUNT(*) as cnt, 1 AS one | |
FROM users | |
WHERE country != '' AND country != 'n/a' | |
GROUP BY country | |
ORDER BY cnt DESC | |
LIMIT 30 | |
) t1 | |
JOIN | |
( | |
SELECT COUNT(country) as total_cnt, 1 AS one | |
FROM users | |
WHERE country != '' AND country != 'n/a' | |
) t2 | |
ON | |
(t1.one=t2.one) | |
" | |
-- Per state in the US | |
td query -w -d book_crossing_dataset -f csv -o user_dist_of_location.csv " | |
SELECT location1 AS state, cnt, ROUND(cnt/total_cnt*10000)/100 AS rate | |
FROM | |
( | |
SELECT location1, COUNT(*) as cnt, 1 AS one | |
FROM users | |
WHERE country = 'usa' | |
GROUP BY location1 | |
ORDER BY cnt DESC | |
LIMIT 30 | |
) t1 | |
JOIN | |
( | |
SELECT COUNT(*) as total_cnt, 1 AS one | |
FROM users | |
WHERE country = 'usa' | |
) t2 | |
ON | |
(t1.one=t2.one) | |
" | |
-- Queries 3 | |
td query -w -d book_crossing_dataset " | |
SELECT generation, cnt | |
FROM | |
( | |
SELECT CEIL(IF(age='NULL',0,CAST(age AS INT))/5)*5 AS generation, COUNT(*) as cnt | |
FROM users | |
WHERE IF(age='NULL',0,CAST(age AS INT)) <= 65 | |
GROUP BY CEIL(IF(age='NULL',0,CAST(age AS INT))/5)*5 | |
UNION ALL | |
SELECT CAST(65 AS BIGINT) AS generation, COUNT(*) as cnt | |
FROM users | |
WHERE 65 < CAST(age AS INT) | |
) t1 | |
ORDER BY generation | |
" | |
-- filtered histogram | |
td query -w -d book_crossing_dataset -f csv -o user_dist_of_generation.csv " | |
SELECT generation, cnt, ROUND(cnt/total_cnt*1000)/10 AS rate | |
FROM | |
( | |
SELECT CEIL(IF(age='NULL',0,CAST(age AS INT))/5)*5 AS generation, COUNT(*) as cnt, 1 AS one | |
FROM users | |
WHERE 10 < IF(age='NULL',0,CAST(age AS INT)) | |
AND IF(age='NULL',0,CAST(age AS INT)) <= 65 | |
GROUP BY CEIL(IF(age='NULL',0,CAST(age AS INT))/5)*5 | |
) t1 | |
JOIN | |
( | |
SELECT COUNT(*) as total_cnt, 1 AS one | |
FROM users | |
WHERE 10 < IF(age='NULL',0,CAST(age AS INT)) | |
AND IF(age='NULL',0,CAST(age AS INT)) <= 65 | |
) t2 | |
ON | |
(t1.one=t2.one) | |
ORDER BY generation | |
" | |
-- Queries 4 | |
-- Average # of reviews per user | |
td query -w -d book_crossing_dataset " | |
SELECT ROUND(AVG(cnt)) AS avg | |
FROM | |
( | |
SELECT user_id, COUNT(book_rating) AS cnt | |
FROM ratings | |
GROUP BY user_id | |
) t1 | |
" | |
-- Distribution of the review ratings | |
td query -w -d book_crossing_dataset -f csv -o review_user_dist_of_review_count.csv " | |
SELECT cnt, user_cnt, ROUND(user_cnt/total_user_cnt*1000)/10 AS rate | |
FROM | |
( | |
SELECT cnt, COUNT(*) AS user_cnt, 1 AS one | |
FROM | |
( | |
SELECT user_id, COUNT(book_rating) AS cnt | |
FROM ratings | |
GROUP BY user_id | |
) t1 | |
GROUP BY cnt | |
ORDER BY cnt | |
LIMIT 50 | |
) o1 | |
JOIN | |
( | |
SELECT COUNT(distinct user_id) AS total_user_cnt, 1 AS one | |
FROM ratings | |
) o2 | |
ON | |
(o1.one=o2.one) | |
" | |
-- (0<book_rating, 5<=count(book_rating)) -- | |
td query -w -d book_crossing_dataset -f csv -o review_user_dist_of_review_avg.csv " | |
SELECT avg, COUNT(*) | |
FROM | |
( | |
SELECT user_id, ROUND(AVG(book_rating)/0.5)*0.5 AS avg, COUNT(book_rating) AS cnt | |
FROM ratings | |
WHERE 0 < book_rating | |
GROUP BY user_id | |
HAVING 5 <= COUNT(book_rating) | |
) t1 | |
GROUP BY avg | |
ORDER BY avg | |
" | |
-- Query 5 | |
-- Segmenting the users by Ratings and Age (a pretty big query) | |
td query -w -d book_crossing_dataset -f csv -o review_user_dist_of_review_avg_by_generation.csv " | |
SELECT o1.generation, avg, ROUND(cnt/total_cnt*1000)/10 AS rate | |
FROM | |
( | |
SELECT generation, avg, COUNT(*) AS cnt | |
FROM | |
( | |
SELECT user_id, ROUND(AVG(book_rating)/0.5)*0.5 AS avg, COUNT(book_rating) AS cnt | |
FROM ratings | |
WHERE 0 < book_rating | |
GROUP BY user_id | |
HAVING 5 <= COUNT(book_rating) | |
) t1 | |
JOIN | |
( | |
SELECT user_id, CEIL(IF(age='NULL',0,CAST(age AS INT))/5)*5 AS generation | |
FROM users | |
WHERE 10 < IF(age='NULL',0,CAST(age AS INT)) | |
AND IF(age='NULL',0,CAST(age AS INT)) <= 65 | |
) t2 | |
ON | |
(t1.user_id=t2.user_id) | |
GROUP BY generation,avg | |
) o1 | |
JOIN | |
( | |
SELECT generation, COUNT(*) AS total_cnt | |
FROM | |
( | |
SELECT user_id, COUNT(book_rating) | |
FROM ratings | |
WHERE 0 < book_rating | |
GROUP BY user_id | |
HAVING 5 <= COUNT(book_rating) | |
) t1 | |
JOIN | |
( | |
SELECT user_id, CEIL(IF(age='NULL',0,CAST(age AS INT))/5)*5 AS generation | |
FROM users | |
WHERE 10 < IF(age='NULL',0,CAST(age AS INT)) | |
AND IF(age='NULL',0,CAST(age AS INT)) <= 65 | |
) t2 | |
ON | |
(t1.user_id=t2.user_id) | |
GROUP BY generation | |
) o2 | |
ON | |
(o1.generation=o2.generation) | |
ORDER BY generation, avg | |
" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment