Skip to content

Instantly share code, notes, and snippets.

@kiyoto
Created July 23, 2012 22:13
Show Gist options
  • Save kiyoto/3166578 to your computer and use it in GitHub Desktop.
Save kiyoto/3166578 to your computer and use it in GitHub Desktop.
Book Crossing Dataset Action Queries
-- Query 1: Counting Harsh, Generous and Lazy
td query -w -d book_crossing_dataset "
SELECT rating_type, COUNT(*) As cnt
FROM
(
SELECT user_id, MIN(book_rating) AS stat, COUNT(book_rating) AS cnt, 'Generous' AS rating_type
FROM ratings
WHERE 0 < book_rating
GROUP BY user_id
HAVING 5 < COUNT(book_rating)
AND 9 <= MIN(book_rating)
UNION ALL
SELECT user_id, MAX(book_rating) AS stat, COUNT(book_rating) AS cnt, 'Harsh' AS rating_type
FROM ratings
WHERE 0 < book_rating
GROUP BY user_id
HAVING 5 < COUNT(book_rating)
AND MAX(book_rating) <= 5
UNION ALL
SELECT user_id, CAST(VARIANCE(book_rating) AS INT) AS stat, COUNT(book_rating) AS cnt, 'Lazy' AS rating_type
FROM ratings
WHERE 0 < book_rating
GROUP BY user_id
HAVING 5 < COUNT(book_rating)
AND VARIANCE(book_rating) = 0
) t1
GROUP BY rating_type
ORDER BY rating_type
"
-- Query 2: Distribution of lazy users.
td query -w -d book_crossing_dataset "
SELECT book_rating, COUNT(*) AS cnt
FROM
(
SELECT user_id, CAST(VARIANCE(book_rating) AS INT) AS var, COUNT(book_rating) AS cnt, AVG(book_rating) AS book_rating
FROM ratings
WHERE 0 < book_rating
GROUP BY user_id
HAVING 5 < COUNT(book_rating)
AND VARIANCE(book_rating) = 0
) t1
GROUP BY book_rating
ORDER BY book_rating
"
-- Query 3: Books impacted the most by the Generous
td query -w -d book_crossing_dataset -f csv -o ranking_of_books_influenced_by_overestimated_users.csv "
SELECT total.isbn,
title,
total.cnt AS total_cnt,
outlier.cnt AS outlier_cnt,
total.s AS total_s,
outlier.s AS outlier_s,
total.s-outlier.s AS diff_s,
total.cnt-outlier.cnt AS diff_cnt,
total.avg AS total_avg,
IF(total.cnt=outlier.cnt,total.avg,(total.s-outlier.s)/(total.cnt-outlier.cnt)) AS eliminated_avg,
total.avg - IF(total.cnt=outlier.cnt,total.avg,(total.s-outlier.s)/(total.cnt-outlier.cnt)) AS diff_avg
FROM
(
SELECT t1.isbn AS isbn, title, cnt, s, avg
FROM
(
SELECT isbn, AVG(book_rating) AS avg, COUNT(book_rating) AS cnt, SUM(book_rating) AS s
FROM ratings
WHERE 0 < book_rating
GROUP BY isbn
HAVING 10 <= COUNT(book_rating)
) t1
JOIN
(
SELECT isbn, v['book_title'] AS title
FROM books
) t2
ON
(t1.isbn=t2.isbn)
) total
JOIN
(
SELECT t2.isbn AS isbn, COUNT(book_rating) AS cnt, SUM(book_rating) AS s
FROM
(
SELECT user_id, MIN(book_rating) AS mn, COUNT(book_rating) AS cnt
FROM ratings
WHERE 0 < book_rating
GROUP BY user_id
HAVING 5 < COUNT(book_rating)
AND 8 <= MIN(book_rating)
) t1
JOIN
(
SELECT isbn, user_id, book_rating
FROM ratings
WHERE 0 < book_rating
) t2
ON
(t1.user_id=t2.user_id)
GROUP BY t2.isbn
) outlier
ON
(total.isbn=outlier.isbn)
ORDER BY diff_avg DESC
LIMIT 20
"
-- Query 4: Books impacted the most by the Harsh
td query -w -d book_crossing_dataset -f csv -o ranking_of_books_influenced_by_underestimated_users.csv "
SELECT total.isbn,
title,
total.cnt AS total_cnt,
outlier.cnt AS outlier_cnt,
total.s AS total_s,
outlier.s AS outlier_s,
total.s-outlier.s AS diff_s,
total.cnt-outlier.cnt AS diff_cnt,
total.avg AS total_avg,
IF(total.cnt=outlier.cnt,total.avg,(total.s-outlier.s)/(total.cnt-outlier.cnt)) AS eliminated_avg,
IF(total.cnt=outlier.cnt,total.avg,(total.s-outlier.s)/(total.cnt-outlier.cnt)) - total.avg AS diff_avg
FROM
(
SELECT t1.isbn AS isbn, title, cnt, s, avg
FROM
(
SELECT isbn, AVG(book_rating) AS avg, COUNT(book_rating) AS cnt, SUM(book_rating) AS s
FROM ratings
WHERE 0 < book_rating
GROUP BY isbn
HAVING 10 <= COUNT(book_rating)
) t1
JOIN
(
SELECT isbn, v['book_title'] AS title
FROM books
) t2
ON
(t1.isbn=t2.isbn)
) total
JOIN
(
SELECT t2.isbn AS isbn, COUNT(book_rating) AS cnt, SUM(book_rating) AS s
FROM
(
SELECT user_id, MAX(book_rating) AS mn, COUNT(book_rating) AS cnt
FROM ratings
WHERE 0 < book_rating
GROUP BY user_id
HAVING 5 < COUNT(book_rating)
AND MAX(book_rating) <= 6
) t1
JOIN
(
SELECT isbn, user_id, book_rating
FROM ratings
WHERE 0 < book_rating
) t2
ON
(t1.user_id=t2.user_id)
GROUP BY t2.isbn
) outlier
ON
(total.isbn=outlier.isbn)
ORDER BY diff_avg DESC
LIMIT 20
"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment