Forked from hannesdatta/deprecated-classify-labels.R
Last active
March 25, 2020 09:00
-
-
Save hannesdatta-research/aac9b1ddf8e69794f3f402292cd76b25 to your computer and use it in GitHub Desktop.
Classifying music labels into major- and independent labels
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#################################################
#                                               #
# Classify music labels                         #
# into major labels (Sony, Warner, Universal),  #
# and independent labels (all others)           #
#                                               #
# Maintained by:                                #
# [email protected]                             #
#                                               #
# COMMENTS AND FEEDBACK WELCOME!                #
#                                               #
# Unzip labels.zip first before using script.   #
#################################################
# In the script below, I use a series of
# regular expressions (for an extensive tutorial,
# see https://www.hackerearth.com/practice/machine-learning/advanced-techniques/regular-expressions-string-manipulation-r/tutorial/)
# to identify major labels (versus independent
# labels) in a list of label names (as available on
# Spotify).
# Load the list of label names into the data frame `labels`.
# The file is read as tab-separated text with a header row. quote = "" turns
# off quote handling, so quote characters inside a label name stay literal.
# comment.char = "" turns off read.table()'s default "#" comment marker, which
# would otherwise cut a line off at the first "#" occurring in a label name.
labels <- read.table(
  'labels.csv',
  header = TRUE,
  sep = "\t",
  quote = "",
  comment.char = "",
  encoding = "UTF-8"
)
# Regular expressions identifying labels that belong to Warner.
# Named character vector: each name is a label (group), each value is one
# regular expression made of alternatives separated by "|".
# Inside the patterns, "[|]" is a literal pipe character and "([|]|^)" /
# "([|]|$)" anchor a name to the start/end of the string or to such a pipe
# (presumably the separator between several label names in one field -- TODO
# confirm against labels.csv).
labels_warner <- c(
  'Warner Music' = 'warner[ ]music|warner[ ]records|warner[ ]home|warner[ ]special|warner[ ]strategic|warner[.]esp|Warner[ ][A-Z]',
  'Asylum Records' = '^asylum$|asylum[/]|[/]asylum|asylum[ ]records|([|]|^)atlantic[ ]records|elektra asylum|atlantic[ ]|([|]|^)atlantic([|]|$)|elektra[ ]records|[/]elektra|elektra[/]|([|]|^)elektra([|]|$)|warner[ ]music[ ]nashville|warner[ ]bros|elektra[ ]nashville',
  'Big Beats Records' = 'big[ ]beat[ ]|([|]|^)big beat([|]|$)|[/]big beat|big beat[/]|big beat$',
  'Canvasback Music' = 'canvasback',
  # "FFRR?" accepts both "FFRR Records" and the previously listed spelling
  # "FFR Records"; the earlier "FFR[ ]records" could not match "FFRR Records".
  'Parlophone Label Group' = 'parlophone|FFRR?[ ]records|([|]|^)FFRR([|]|$)|virgin[ ]classics|emi[ ]classics|[ ]erato[ ]|([|]|^)erato|warner[ ]classics|([|]|^)erato[ ]|[/]erato|erato[/]',
  'Reprise Records' = '([|]|^)reprise[ ]|[/]reprise|reprise[/]',
  'Fueled By Ramen' = 'Fueled[ ]by[ ]ramen',
  'Nonesuch Records' = 'nonesuch[ ]records|([|]|^)nonesuch([|]|$)|[/]nonesuch|nonesuch[/]',
  'Rhino Entertainment' = 'rhino[ ]entertainment|([|]|^)rhino|[/]rhino|rhino[/]',
  'Roadrunner Records' = '([|]|^)roadrunner|[/]roadrunner|roadrunner[/]',
  'Sire Records' = '([|]|^)sire[ ]records|([|]|^)sire[ ]|([|]|^)sire[ ]([|]|$)|[/]sire|sire[/]',
  'East West' = 'east[ ]west|eastwest',
  'Warner (all combined)' = '([|]|^)warner|[(]warner[)]|asylum[ ]records|big[ ]beat[ ]records|canvasback[ ]music|parlophone[ ]label[ ]group|reprise[ ]records|fueled[ ]by[ ]ramen|nonesuch[ ]records|rhino[ ]entertainment|roadrunner[ ]records|sire[ ]records|east[ ]west'
)
# Regular expressions identifying labels that belong to Universal.
# Named character vector: each name is a label (group), each value is one
# regular expression made of alternatives separated by "|".
# Inside the patterns, "[|]" is a literal pipe character and "([|]|^)" /
# "([|]|$)" anchor a name to the start/end of the string or to such a pipe
# (presumably the separator between several label names in one field -- TODO
# confirm against labels.csv).
labels_universal <- c(
  'Universal Music Group' = '([|]|^)universal|[/]universal|universal[/]|([|]|^)universal[ ]music[ ]japan|([|]|^)universal[ ]sigma|([|]|^)universal[ ]international|([|]|^)geneon[ ]universal|nbcuniversal|universal[ ]licensing[ ]music|([|]|^)universal[ ]music[ ]|universal[ ]music[ ]spain|universal[ ]m..z.k|([|]|^)universal records|([|]|^)universal[ ]records[ ]|([|]|^)universal[ ]republic[ ]records',
  'Capitol Music Group' = 'capitol|astralwerks|blue[ ]{0,2}note|([|]|^)caroline[ ]|deep[ ]{0,2}well|([|]|^)harvest|([|]|^)metamorphosis|motown|quality[ ]{0,2}control|([|]|^)virgin[ ]|([|]|^)virgin([|]|$)|[/]virgin|virgin[/]',
  'Decca Classics' = 'decca|ecm([ ]|$|[|])|([|]|^)mercury|([|]|^)mercury[ ]classics|([|]|^)mercury[ ]records|mercury[ ]records',
  'Def Jam Recordings' = 'def[ ]{0,2}jam|artium|g.o.o.d|([|]|^)good([|]|$)|good[ ]records',
  'Deutsche Grammophon' = 'deutsche[ ]grammophon|grammophon',
  'Eagle Rock Entertainment' = 'eagle[ ]rock|eagle[ ]records',
  'EMI' = 'emi[ ]|([|]|^)emi|([|]|^)emi[ ]music|emi[-]|[/]emi|emi[/]',
  'Interscope' = 'interscope|geffen|A[&]M|([|]|^)222([|]|$)|aftermath|darkroom|dreamville|LVRN Records|Mad Love|insomniac[ ]|kidinakorner|shady[ ]{0,2}records|([|]|^)shady',
  'Island Records' = '([|]|^)island[ ]records|4th & Broadway|universal[ ]island|([|]|^)island[ ]|([|]|^)island([|]|$)|[/]island|island[/]',
  'Polydor Records' = '([|]|^)polydor[ ]|([|]|^)polydor[ ]([|]|$)|([|]|^)fiction[ ]records|([|]|^)fiction([|]|$)|polydor',
  'Republic Records' = '([|]|^)republic[ ]records|[/]republic|republic[/]|universal[ ]republic|([|]|^)american[ ]recordings|([|]|^)Brushfire[ ]records|([|]|^)casablanca[ ]records|([|]|^)cash[ ]money[ ]records|john[ ]varvatos|([|]|^)lava[ ]records|lightworkers|([|]|^)the[ ]voice',
  'Republic Records 2' = '([|]|^)republic[ ]records|([|]|^)american[ ]recordings|([|]|^)Brushfire[ ]records|([|]|^)casablanca[ ]records|([|]|^)cash[ ]money[ ]records|john[ ]varvatos|([|]|^)lava[ ]records|lightworkers|([|]|^)the[ ]voice',
  'Universal Music Enterprises' = '([|]|^)universal[ ]|([|]|^)universal([|]|$)|T[-]{0,1}boy',
  'Universal Music Group Nashville' = 'capitol[ ]{0,1}records[ ]{0,1}nashville|emi[ ]{0,1}records[ ]{0,1}nashville|mca[ ]{0,1}nashville|mercury[ ]{0,1}nashville|show[-]{0,1}dog',
  'Universal Music Latin Entertainment' = 'capitol[ ]{0,1}latin|disa[ ]{0,1}records|fonovisa|machete[ ]{0,1}music|universal[ ]{0,1}music[ ]{0,1}latino',
  # The stand-alone "verve" alternative is anchored with "^" like every other
  # stand-alone name in this file; it previously read "([|]|&)", which could
  # never match a label consisting only of "Verve".
  'Verve Label Group' = 'verve[ ]label[ ]group|verve[/]|[/]verve|verve[ ]{0,1}records|decca[ ]{0,1}gold|universal[ ]{0,1}music[ ]{0,1}classics|decca[ ]{0,1}broadway|verve[ ]{0,1}group|([|]|^)verve([|]|$)|verve[ ]{0,1}music[ ]{0,1}group',
  'PM:AM Recordings' = 'PM[:]AM|pm[ ]{0,1}am',
  'Spinefarm Records' = 'spinefarm',
  'SpinnUp' = 'SpinnUp',
  'Disques Barclay' = 'disques[ ]{0,1}barclay|([|]|^)barclay',
  # "." stands for any single character (presumably to cover the accented
  # spelling of the name -- confirm).
  'Varese Sarabande' = 'var.se',
  'Digital Distribution Trinidad and Tobago' = 'Digital Distribution Trinidad and Tobago',
  # NOTE(review): the bare "emi" alternative below matches every label name
  # that merely contains these three letters (e.g. "Gemini", "Premier").
  # Left unchanged here -- confirm whether that breadth is intended.
  'Universal Music (combined)' = 'varese[ ]sarabande|disques[ ]barclay|spinnup|spinefarm[ ]records|pm:am[ ]recordings|verve[ ]label[ ]group|universal[ ]music[ ]latin[ ]entertainment|universal[ ]music[ ]group[ ]nashville|universal[ ]music[ ]enterprises|republic[ ]records|polydor[ ]records|island[ ]records|interscope|emi|eagle[ ]rock[ ]entertainment|deutsche[ ]grammophon|def[ ]jam[ ]recordings|decca[ ]classics|capitol[ ]music[ ]group|universal[ ]music[ ]group',
  'Abbey Road' = 'abbey[ ]road|bravad',
  'others' = 'xo records|young money|shady records|spinefarm rec|u[-]live'
)
# Regular expressions identifying labels that belong to Sony.
# Named character vector: each name is a label (group), each value is one
# regular expression made of alternatives separated by "|".
# Inside the patterns, "[|]" is a literal pipe character and "([|]|^)" /
# "([|]|$)" anchor a name to the start/end of the string or to such a pipe
# (presumably the separator between several label names in one field -- TODO
# confirm against labels.csv).
labels_sony <- c(
  'Columbia Records' = 'CBS[ ]columbia|([|]|^)columbia|hypnotize[ ]minds|[/]columbia|columbia[/]',
  'Columbia Records 2' = 'dreamville[ ]entertainment|small.*giant|startime[ ]international|blue[ ]propaganda',
  'RCA Records' = '([|]|^)rca|([|]|^)bystorm.*entertainment|([|]|^)nappy[ ]boy|([|]|^)j[ ]records',
  'Epic Records' = '([|]|^)epic|([|]|^)battery|([|]|^)freebandz|([|]|^)bad[ ]{0,1}boy[ ]records|([|]|^)volcano|vested[ ]in[ ]culture',
  'Sony Music Nashville' = 'sony[ ]music|([|]|^)arista|([|]|^)columbia[ ]nashville|rca[ ]records[ ]nashville',
  'Zomba Music Group' = '([|]|^)zomba|([|]|^)jive[ ]records|([|]|^)verity|([|]|^)silvertone',
  'RED Music Distribution' = '([|]|^)red[ ]music[ ]|odd[ ]future|([|]|^)red[ ]ink|cinematic[ ]music|([|]|^)reach[ ]records',
  'Legacy Recordings' = '([|]|^)legacy[ ]recordings|([|]|^)laface',
  'Sony Music Latin' = 'sony.*latin',
  'Ariola Records' = 'ariola',
  'Sony Masterworks' = 'sony[ ]masterworks|([|]|^)bluebird|([|]|^)okeh|portrait[ ]records|([|]|^)portrait|([|]|^)arte[ ]nova|sony[ ]classical|flying[ ]buddha|([|]|^)masterworks',
  'Provident Label Group' = '([|]|^)provident|essential[ ]records|flicker[ ]records|beach[ ]street|reunion[ ]records|essential[ ]worship',
  'Century Media Records' = 'century[ ]media|([|]|^)century record|people[ ]like[ ]you|insideout[ ]music|superball[ ]music',
  'Sony Music Entertainment' = 'Sony[ ]BMG|([|]|^)BMG|columbia[ ]music|sony[ ]music|Columbia[ ]records|RCA[ ]Records|Epic[ ]Records|Sony[ ]Music[ ]Nashville|Zomba[ ]Music[ ]Group|RED[ ]Music[ ]Distribution[ ]|Legacy[ ]Recordings|Sony[ ]Music[ ]Latin|Ariola[ ]Records|Sony[ ]Masterworks|Provident[ ]Label[ ]Group|Century[ ]Media[ ]Records'
)
# Map the name of each major to its vector of regular expressions.
label_iter <- list(
  warner = labels_warner,
  universal = labels_universal,
  sony = labels_sony
)

# Every label starts out unclassified (""); a label that matches none of the
# patterns below keeps the empty string.
labels$label_classified <- ""

# Apply every pattern of every major, case-insensitively, to the label names.
# Later assignments overwrite earlier ones, so a label matched by several
# majors ends up with the one processed last (order: warner, universal, sony).
for (lbl in names(label_iter)) {
  for (l in label_iter[[lbl]]) {
    # Assign through the column, not through a row subset: the form
    # `labels[rows, ]$label_classified <- lbl` stops with
    # "replacement has 1 row, data has 0" when a pattern matches no row.
    labels$label_classified[grepl(l, labels$label, ignore.case = TRUE)] <- lbl
  }
}
# Save the label list together with the new `label_classified` column,
# without row names.
# NOTE(review): no `sep` is given, so the file uses write.table()'s default
# separator despite the ".csv" extension -- confirm that downstream readers
# expect this format before changing it.
write.table(labels, "labels-classified.csv", row.names = FALSE)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment