Last active
August 29, 2015 14:11
-
-
Save jamesthomson/00fcf7b57550d6f03a9d to your computer and use it in GitHub Desktop.
msd song length analysis
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
DROP TABLE IF EXISTS msd_data; | |
CREATE EXTERNAL TABLE msd_data | |
( | |
ref string, | |
analysis_sample_rate float , | |
artist_7digitalid int , | |
artist_familiarity float , | |
artist_hotttnesss float , | |
artist_id string , | |
artist_latitude float , | |
artist_location string , | |
artist_longitude float , | |
artist_mbid string , | |
artist_mbtags string , | |
artist_mbtags_count int , | |
artist_name string , | |
artist_playmeid int , | |
artist_terms string , | |
artist_terms_freq string , | |
artist_terms_weight string , | |
audio_md5 string , | |
bars_confidence string , | |
bars_start string , | |
beats_confidence string , | |
beats_start string , | |
danceability float , | |
duration float , | |
end_of_fade_in float , | |
energy float , | |
key int , | |
key_confidence float , | |
loudness float , | |
mode int , | |
mode_confidence float , | |
release string , | |
release_7digitalid int , | |
sections_confidence string , | |
sections_start string , | |
segments_confidence string , | |
segments_loudness_max string , | |
segments_loudness_max_time string , | |
segments_loudness_max_start string , | |
segments_pitches string , | |
segments_start string , | |
segments_timbre string , | |
similar_artists string , | |
song_hotttnesss string , | |
song_id string , | |
start_of_fade_out float , | |
tatums_confidence string , | |
tatums_start string , | |
tempo float , | |
time_signature int , | |
time_signature_confidence float , | |
title string , | |
track_id string , | |
year int | |
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' | |
STORED AS TEXTFILE | |
LOCATION '/data/files' | |
; | |
/* | |
select * from msd_data limit 100; | |
select count(1) from msd_data; | |
select artist_name, title from msd_data limit 100; | |
*/ | |
DROP TABLE IF EXISTS msd_song_length_breakdown; | |
CREATE EXTERNAL TABLE msd_song_length_breakdown | |
( | |
DURATION_GROUP STRING, | |
COUNT BIGINT | |
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' | |
STORED AS TEXTFILE | |
LOCATION 's3n://folder/project/file/filename/msd_song_length_breakdown' | |
; | |
INSERT OVERWRITE TABLE msd_song_length_breakdown | |
select | |
case | |
when duration<=60 then '01DUR_0to1' | |
when duration >60 and duration<=120 then '02DUR_1to2' | |
when duration >120 and duration<=150 then '03DUR_2to2.5' | |
when duration >150 and duration<=180 then '04DUR_2.5to3' | |
when duration >180 and duration<=210 then '05DUR_3to3.5' | |
when duration >210 and duration<=240 then '06DUR_3.5to4' | |
when duration >240 and duration<=270 then '07DUR_4to4.5' | |
when duration >270 and duration<=300 then '08DUR_4.5to5' | |
when duration >270 and duration<=330 then '09DUR_5to6' | |
when duration >330 and duration<=390 then '10DUR_6to7' | |
when duration >390 then '11DUR_7PLUS' | |
else NULL | |
end as DURATION_GROUP, | |
count(1) as count | |
from msd_data | |
group by | |
case | |
when duration<=60 then '01DUR_0to1' | |
when duration >60 and duration<=120 then '02DUR_1to2' | |
when duration >120 and duration<=150 then '03DUR_2to2.5' | |
when duration >150 and duration<=180 then '04DUR_2.5to3' | |
when duration >180 and duration<=210 then '05DUR_3to3.5' | |
when duration >210 and duration<=240 then '06DUR_3.5to4' | |
when duration >240 and duration<=270 then '07DUR_4to4.5' | |
when duration >270 and duration<=300 then '08DUR_4.5to5' | |
when duration >270 and duration<=330 then '09DUR_5to6' | |
when duration >330 and duration<=390 then '10DUR_6to7' | |
when duration >390 then '11DUR_7PLUS' | |
else NULL | |
end; | |
select * from msd_song_length_breakdown order by DURATION_GROUP; | |
DROP TABLE IF EXISTS msd_avg_song_length_by_year; | |
CREATE EXTERNAL TABLE msd_avg_song_length_by_year | |
( | |
year int, | |
avg_duration float, | |
total_songs bigint, | |
DUR_0_1 bigint, | |
DUR_1_2 bigint, | |
DUR_2_2P5 bigint, | |
DUR_2P5_3 bigint, | |
DUR_3_3P5 bigint, | |
DUR_3P5_4 bigint, | |
DUR_4_4P5 bigint, | |
DUR_4P5_5 bigint, | |
DUR_5_6 bigint, | |
DUR_6_7 bigint, | |
DUR_7PLUS bigint | |
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' | |
STORED AS TEXTFILE | |
LOCATION 's3n://folder/project/file/filename/msd_avg_song_length_by_year' | |
; | |
INSERT OVERWRITE TABLE msd_avg_song_length_by_year | |
select | |
year, | |
avg(duration), | |
count(1), | |
sum(case when duration<=60 then 1 else 0 end) as DUR_0_1, | |
sum(case when duration >60 and duration<=120 then 1 else 0 end) as DUR_1_2, | |
sum(case when duration >120 and duration<=150 then 1 else 0 end) as DUR_2_2P5, | |
sum(case when duration >150 and duration<=180 then 1 else 0 end) as DUR_2P5_3, | |
sum(case when duration >180 and duration<=210 then 1 else 0 end) as DUR_3_3P5, | |
sum(case when duration >210 and duration<=240 then 1 else 0 end) as DUR_3P5_4, | |
sum(case when duration >240 and duration<=270 then 1 else 0 end) as DUR_4_4P5, | |
sum(case when duration >270 and duration<=300 then 1 else 0 end) as DUR_4P5_5, | |
sum(case when duration >300 and duration<=360 then 1 else 0 end) as DUR_5_6, | |
sum(case when duration >360 and duration<=420 then 1 else 0 end) as DUR_6_7, | |
sum(case when duration >420 then 1 else 0 end) as DUR_7PLUS | |
from msd_data | |
group by year | |
order by year; | |
select * from msd_avg_song_length_by_year; | |
--Genres | |
/* | |
select | |
substr(artist_terms, 1, instr(artist_terms, ",")-1) as first_term, | |
count(1) as freq | |
from msd_sample | |
group by substr(artist_terms, 1, instr(artist_terms, ",")-1) | |
order by freq desc | |
select | |
artist_name, | |
artist_terms, | |
title, | |
genres | |
from msd_sample LATERAL VIEW explode(split(artist_terms, ",")) genretable as genres | |
limit 100; | |
*/ | |
DROP TABLE IF EXISTS msd_avg_song_length_by_genre; | |
CREATE EXTERNAL TABLE msd_avg_song_length_by_genre | |
( | |
genre string, | |
avg_duration float, | |
total_songs bigint, | |
DUR_0_1 bigint, | |
DUR_1_2 bigint, | |
DUR_2_2P5 bigint, | |
DUR_2P5_3 bigint, | |
DUR_3_3P5 bigint, | |
DUR_3P5_4 bigint, | |
DUR_4_4P5 bigint, | |
DUR_4P5_5 bigint, | |
DUR_5_6 bigint, | |
DUR_6_7 bigint, | |
DUR_7PLUS bigint | |
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' | |
STORED AS TEXTFILE | |
LOCATION 's3n://folder/project/file/filename/msd_avg_song_length_by_genre' | |
; | |
INSERT OVERWRITE TABLE msd_avg_song_length_by_genre | |
select | |
genres, | |
avg(duration) as duration, | |
count(1) as freq, | |
sum(case when duration<=60 then 1 else 0 end) as DUR_0_1, | |
sum(case when duration >60 and duration<=120 then 1 else 0 end) as DUR_1_2, | |
sum(case when duration >120 and duration<=150 then 1 else 0 end) as DUR_2_2P5, | |
sum(case when duration >150 and duration<=180 then 1 else 0 end) as DUR_2P5_3, | |
sum(case when duration >180 and duration<=210 then 1 else 0 end) as DUR_3_3P5, | |
sum(case when duration >210 and duration<=240 then 1 else 0 end) as DUR_3P5_4, | |
sum(case when duration >240 and duration<=270 then 1 else 0 end) as DUR_4_4P5, | |
sum(case when duration >270 and duration<=300 then 1 else 0 end) as DUR_4P5_5, | |
sum(case when duration >300 and duration<=360 then 1 else 0 end) as DUR_5_6, | |
sum(case when duration >360 and duration<=420 then 1 else 0 end) as DUR_6_7, | |
sum(case when duration >420 then 1 else 0 end) as DUR_7PLUS | |
from msd_data LATERAL VIEW explode(split(artist_terms, ",")) genretable as genres | |
group by genres | |
--order by duration desc | |
; | |
select * from msd_avg_song_length_by_genre limit 100; | |
--artist lifestage | |
DROP TABLE IF EXISTS msd_artist_career_stage_7min; | |
CREATE EXTERNAL TABLE msd_artist_career_stage_7min | |
( | |
artist_name string, | |
songs bigint, | |
first_song_year bigint, | |
last_song_year bigint, | |
years_active bigint, | |
songs_over_7mins bigint, | |
prop_songs_over_7mins float, | |
first_song_year_over_7mins bigint, | |
last_song_year_over_7mins bigint, | |
perc_into_career_first_record_7min_song float | |
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' | |
STORED AS TEXTFILE | |
LOCATION 's3n://folder/project/file/filename/msd_artist_career_stage_7min' | |
; | |
INSERT OVERWRITE TABLE msd_artist_career_stage_7min | |
select artist_name, | |
count(1) as songs, | |
min(year) as first_song_year, | |
max(year) as last_song_year, | |
max(year)-min(year) as years_active, | |
sum(case when duration >420 then 1 else 0 end) as songs_over_7mins, | |
sum(case when duration >420 then 1 else 0 end)/count(1) as prop_songs_over_7mins, | |
min(case when duration >420 then year end) as first_song_year_over_7mins, | |
max(case when duration >420 then year end) as last_song_year_over_7mins, | |
case | |
when min(case when duration >420 then year end)=min(year) then 0.0 | |
else (min(case when duration >420 then year end)-min(year))/(max(year)-min(year)) | |
end as perc_into_career_first_record_7min_song | |
from msd_data | |
where year<>0 | |
group by artist_name | |
; | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment