Skip to content

Instantly share code, notes, and snippets.

@kiyoto
Created July 16, 2012 23:37
Show Gist options
  • Save kiyoto/3125858 to your computer and use it in GitHub Desktop.
Save kiyoto/3125858 to your computer and use it in GitHub Desktop.
Book Crossing Dataset Status Queries
-- Queries 1
td query -w -d book_crossing_dataset "
SELECT t AS type, cnt
FROM
(
SELECT COUNT(*) AS cnt, 'only in users' AS t
FROM
(
SELECT user_id
FROM users
GROUP BY user_id
) t1
LEFT OUTER JOIN
(
SELECT user_id
FROM ratings
GROUP BY user_id
) t2
ON
(t1.user_id=t2.user_id)
WHERE t2.user_id IS NULL
UNION ALL
SELECT COUNT(*) AS cnt, 'only in ratings' AS t
FROM
(
SELECT user_id
FROM ratings
GROUP BY user_id
) t1
LEFT OUTER JOIN
(
SELECT user_id
FROM users
GROUP BY user_id
) t2
ON
(t1.user_id=t2.user_id)
WHERE t2.user_id IS NULL
UNION ALL
SELECT COUNT(*) AS cnt, 'both' AS t
FROM
(
SELECT user_id
FROM ratings
GROUP BY user_id
) t1
JOIN
(
SELECT user_id
FROM users
GROUP BY user_id
) t2
ON
(t1.user_id=t2.user_id)
) o1
"
td query -w -d book_crossing_dataset "
SELECT t1.cnt AS all_books, t2.cnt AS active_books, ROUND(t2.cnt/t1.cnt*100) AS active_rate
FROM
(
SELECT COUNT(distinct isbn) as cnt, 1 AS one
FROM books
) t1
JOIN
(
SELECT COUNT(distinct isbn) as cnt, 1 AS one
FROM ratings
) t2
ON
(t1.one=t2.one)
"
td query -w -d book_crossing_dataset "
SELECT t AS type, cnt
FROM
(
SELECT COUNT(*) AS cnt, 'only in books' AS t
FROM
(
SELECT isbn
FROM books
GROUP BY isbn
) t1
LEFT OUTER JOIN
(
SELECT isbn
FROM ratings
GROUP BY isbn
) t2
ON
(t1.isbn=t2.isbn)
WHERE t2.isbn IS NULL
UNION ALL
SELECT COUNT(*) AS cnt, 'only in ratings' AS t
FROM
(
SELECT isbn
FROM ratings
GROUP BY isbn
) t1
LEFT OUTER JOIN
(
SELECT isbn
FROM books
GROUP BY isbn
) t2
ON
(t1.isbn=t2.isbn)
WHERE t2.isbn IS NULL
UNION ALL
SELECT COUNT(*) AS cnt, 'both' AS t
FROM
(
SELECT isbn
FROM ratings
GROUP BY isbn
) t1
JOIN
(
SELECT isbn
FROM books
GROUP BY isbn
) t2
ON
(t1.isbn=t2.isbn)
) o1
"
-- Queries 2
-- Per country
td query -w -d book_crossing_dataset -f csv -o user_dist_of_country.csv "
SELECT country, cnt, ROUND(cnt/total_cnt*10000)/100 AS rate
FROM
(
SELECT country, COUNT(*) as cnt, 1 AS one
FROM users
WHERE country != '' AND country != 'n/a'
GROUP BY country
ORDER BY cnt DESC
LIMIT 30
) t1
JOIN
(
SELECT COUNT(country) as total_cnt, 1 AS one
FROM users
WHERE country != '' AND country != 'n/a'
) t2
ON
(t1.one=t2.one)
"
-- Per state in the US
td query -w -d book_crossing_dataset -f csv -o user_dist_of_location.csv "
SELECT location1 AS state, cnt, ROUND(cnt/total_cnt*10000)/100 AS rate
FROM
(
SELECT location1, COUNT(*) as cnt, 1 AS one
FROM users
WHERE country = 'usa'
GROUP BY location1
ORDER BY cnt DESC
LIMIT 30
) t1
JOIN
(
SELECT COUNT(*) as total_cnt, 1 AS one
FROM users
WHERE country = 'usa'
) t2
ON
(t1.one=t2.one)
"
-- Queries 3
td query -w -d book_crossing_dataset "
SELECT generation, cnt
FROM
(
SELECT CEIL(IF(age='NULL',0,CAST(age AS INT))/5)*5 AS generation, COUNT(*) as cnt
FROM users
WHERE IF(age='NULL',0,CAST(age AS INT)) <= 65
GROUP BY CEIL(IF(age='NULL',0,CAST(age AS INT))/5)*5
UNION ALL
SELECT CAST(65 AS BIGINT) AS generation, COUNT(*) as cnt
FROM users
WHERE 65 < CAST(age AS INT)
) t1
ORDER BY generation
"
-- filtered histogram
td query -w -d book_crossing_dataset -f csv -o user_dist_of_generation.csv "
SELECT generation, cnt, ROUND(cnt/total_cnt*1000)/10 AS rate
FROM
(
SELECT CEIL(IF(age='NULL',0,CAST(age AS INT))/5)*5 AS generation, COUNT(*) as cnt, 1 AS one
FROM users
WHERE 10 < IF(age='NULL',0,CAST(age AS INT))
AND IF(age='NULL',0,CAST(age AS INT)) <= 65
GROUP BY CEIL(IF(age='NULL',0,CAST(age AS INT))/5)*5
) t1
JOIN
(
SELECT COUNT(*) as total_cnt, 1 AS one
FROM users
WHERE 10 < IF(age='NULL',0,CAST(age AS INT))
AND IF(age='NULL',0,CAST(age AS INT)) <= 65
) t2
ON
(t1.one=t2.one)
ORDER BY generation
"
-- Queries 4
-- Average # of reviews per user
td query -w -d book_crossing_dataset "
SELECT ROUND(AVG(cnt)) AS avg
FROM
(
SELECT user_id, COUNT(book_rating) AS cnt
FROM ratings
GROUP BY user_id
) t1
"
-- Distribution of the review ratings
td query -w -d book_crossing_dataset -f csv -o review_user_dist_of_review_count.csv "
SELECT cnt, user_cnt, ROUND(user_cnt/total_user_cnt*1000)/10 AS rate
FROM
(
SELECT cnt, COUNT(*) AS user_cnt, 1 AS one
FROM
(
SELECT user_id, COUNT(book_rating) AS cnt
FROM ratings
GROUP BY user_id
) t1
GROUP BY cnt
ORDER BY cnt
LIMIT 50
) o1
JOIN
(
SELECT COUNT(distinct user_id) AS total_user_cnt, 1 AS one
FROM ratings
) o2
ON
(o1.one=o2.one)
"
-- (0<book_rating, 5<=count(book_rating)) --
td query -w -d book_crossing_dataset -f csv -o review_user_dist_of_review_avg.csv "
SELECT avg, COUNT(*)
FROM
(
SELECT user_id, ROUND(AVG(book_rating)/0.5)*0.5 AS avg, COUNT(book_rating) AS cnt
FROM ratings
WHERE 0 < book_rating
GROUP BY user_id
HAVING 5 <= COUNT(book_rating)
) t1
GROUP BY avg
ORDER BY avg
"
-- Query 5
-- Segmenting the users by Ratings and Age (a pretty big query)
td query -w -d book_crossing_dataset -f csv -o review_user_dist_of_review_avg_by_generation.csv "
SELECT o1.generation, avg, ROUND(cnt/total_cnt*1000)/10 AS rate
FROM
(
SELECT generation, avg, COUNT(*) AS cnt
FROM
(
SELECT user_id, ROUND(AVG(book_rating)/0.5)*0.5 AS avg, COUNT(book_rating) AS cnt
FROM ratings
WHERE 0 < book_rating
GROUP BY user_id
HAVING 5 <= COUNT(book_rating)
) t1
JOIN
(
SELECT user_id, CEIL(IF(age='NULL',0,CAST(age AS INT))/5)*5 AS generation
FROM users
WHERE 10 < IF(age='NULL',0,CAST(age AS INT))
AND IF(age='NULL',0,CAST(age AS INT)) <= 65
) t2
ON
(t1.user_id=t2.user_id)
GROUP BY generation,avg
) o1
JOIN
(
SELECT generation, COUNT(*) AS total_cnt
FROM
(
SELECT user_id, COUNT(book_rating)
FROM ratings
WHERE 0 < book_rating
GROUP BY user_id
HAVING 5 <= COUNT(book_rating)
) t1
JOIN
(
SELECT user_id, CEIL(IF(age='NULL',0,CAST(age AS INT))/5)*5 AS generation
FROM users
WHERE 10 < IF(age='NULL',0,CAST(age AS INT))
AND IF(age='NULL',0,CAST(age AS INT)) <= 65
) t2
ON
(t1.user_id=t2.user_id)
GROUP BY generation
) o2
ON
(o1.generation=o2.generation)
ORDER BY generation, avg
"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment