Created
July 23, 2012 22:13
-
-
Save kiyoto/3166578 to your computer and use it in GitHub Desktop.
Book Crossing Dataset Action Queries
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Query 1: Counting Harsh, Generous and Lazy | |
td query -w -d book_crossing_dataset " | |
SELECT rating_type, COUNT(*) As cnt | |
FROM | |
( | |
SELECT user_id, MIN(book_rating) AS stat, COUNT(book_rating) AS cnt, 'Generous' AS rating_type | |
FROM ratings | |
WHERE 0 < book_rating | |
GROUP BY user_id | |
HAVING 5 < COUNT(book_rating) | |
AND 9 <= MIN(book_rating) | |
UNION ALL | |
SELECT user_id, MAX(book_rating) AS stat, COUNT(book_rating) AS cnt, 'Harsh' AS rating_type | |
FROM ratings | |
WHERE 0 < book_rating | |
GROUP BY user_id | |
HAVING 5 < COUNT(book_rating) | |
AND MAX(book_rating) <= 5 | |
UNION ALL | |
SELECT user_id, CAST(VARIANCE(book_rating) AS INT) AS stat, COUNT(book_rating) AS cnt, 'Lazy' AS rating_type | |
FROM ratings | |
WHERE 0 < book_rating | |
GROUP BY user_id | |
HAVING 5 < COUNT(book_rating) | |
AND VARIANCE(book_rating) = 0 | |
) t1 | |
GROUP BY rating_type | |
ORDER BY rating_type | |
" | |
-- Query 2: Distribution of lazy users. | |
td query -w -d book_crossing_dataset " | |
SELECT book_rating, COUNT(*) AS cnt | |
FROM | |
( | |
SELECT user_id, CAST(VARIANCE(book_rating) AS INT) AS var, COUNT(book_rating) AS cnt, AVG(book_rating) AS book_rating | |
FROM ratings | |
WHERE 0 < book_rating | |
GROUP BY user_id | |
HAVING 5 < COUNT(book_rating) | |
AND VARIANCE(book_rating) = 0 | |
) t1 | |
GROUP BY book_rating | |
ORDER BY book_rating | |
" | |
-- Query 3: Books impacted the most by the Generous | |
td query -w -d book_crossing_dataset -f csv -o ranking_of_books_influenced_by_overestimated_users.csv " | |
SELECT total.isbn, | |
title, | |
total.cnt AS total_cnt, | |
outlier.cnt AS outlier_cnt, | |
total.s AS total_s, | |
outlier.s AS outlier_s, | |
total.s-outlier.s AS diff_s, | |
total.cnt-outlier.cnt AS diff_cnt, | |
total.avg AS total_avg, | |
IF(total.cnt=outlier.cnt,total.avg,(total.s-outlier.s)/(total.cnt-outlier.cnt)) AS eliminated_avg, | |
total.avg - IF(total.cnt=outlier.cnt,total.avg,(total.s-outlier.s)/(total.cnt-outlier.cnt)) AS diff_avg | |
FROM | |
( | |
SELECT t1.isbn AS isbn, title, cnt, s, avg | |
FROM | |
( | |
SELECT isbn, AVG(book_rating) AS avg, COUNT(book_rating) AS cnt, SUM(book_rating) AS s | |
FROM ratings | |
WHERE 0 < book_rating | |
GROUP BY isbn | |
HAVING 10 <= COUNT(book_rating) | |
) t1 | |
JOIN | |
( | |
SELECT isbn, v['book_title'] AS title | |
FROM books | |
) t2 | |
ON | |
(t1.isbn=t2.isbn) | |
) total | |
JOIN | |
( | |
SELECT t2.isbn AS isbn, COUNT(book_rating) AS cnt, SUM(book_rating) AS s | |
FROM | |
( | |
SELECT user_id, MIN(book_rating) AS mn, COUNT(book_rating) AS cnt | |
FROM ratings | |
WHERE 0 < book_rating | |
GROUP BY user_id | |
HAVING 5 < COUNT(book_rating) | |
AND 8 <= MIN(book_rating) | |
) t1 | |
JOIN | |
( | |
SELECT isbn, user_id, book_rating | |
FROM ratings | |
WHERE 0 < book_rating | |
) t2 | |
ON | |
(t1.user_id=t2.user_id) | |
GROUP BY t2.isbn | |
) outlier | |
ON | |
(total.isbn=outlier.isbn) | |
ORDER BY diff_avg DESC | |
LIMIT 20 | |
" | |
-- Query 4: Books impacted the most by the Harsh | |
td query -w -d book_crossing_dataset -f csv -o ranking_of_books_influenced_by_underestimated_users.csv " | |
SELECT total.isbn, | |
title, | |
total.cnt AS total_cnt, | |
outlier.cnt AS outlier_cnt, | |
total.s AS total_s, | |
outlier.s AS outlier_s, | |
total.s-outlier.s AS diff_s, | |
total.cnt-outlier.cnt AS diff_cnt, | |
total.avg AS total_avg, | |
IF(total.cnt=outlier.cnt,total.avg,(total.s-outlier.s)/(total.cnt-outlier.cnt)) AS eliminated_avg, | |
IF(total.cnt=outlier.cnt,total.avg,(total.s-outlier.s)/(total.cnt-outlier.cnt)) - total.avg AS diff_avg | |
FROM | |
( | |
SELECT t1.isbn AS isbn, title, cnt, s, avg | |
FROM | |
( | |
SELECT isbn, AVG(book_rating) AS avg, COUNT(book_rating) AS cnt, SUM(book_rating) AS s | |
FROM ratings | |
WHERE 0 < book_rating | |
GROUP BY isbn | |
HAVING 10 <= COUNT(book_rating) | |
) t1 | |
JOIN | |
( | |
SELECT isbn, v['book_title'] AS title | |
FROM books | |
) t2 | |
ON | |
(t1.isbn=t2.isbn) | |
) total | |
JOIN | |
( | |
SELECT t2.isbn AS isbn, COUNT(book_rating) AS cnt, SUM(book_rating) AS s | |
FROM | |
( | |
SELECT user_id, MAX(book_rating) AS mn, COUNT(book_rating) AS cnt | |
FROM ratings | |
WHERE 0 < book_rating | |
GROUP BY user_id | |
HAVING 5 < COUNT(book_rating) | |
AND MAX(book_rating) <= 6 | |
) t1 | |
JOIN | |
( | |
SELECT isbn, user_id, book_rating | |
FROM ratings | |
WHERE 0 < book_rating | |
) t2 | |
ON | |
(t1.user_id=t2.user_id) | |
GROUP BY t2.isbn | |
) outlier | |
ON | |
(total.isbn=outlier.isbn) | |
ORDER BY diff_avg DESC | |
LIMIT 20 | |
" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment