Last active
July 23, 2020 06:05
-
-
Save hannesdatta/19508cba3ab80bf0b2a648bec2480d0e to your computer and use it in GitHub Desktop.
Classifying music labels into major- and independent labels
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This gist has been replaced by an R package with an updated list of labels. | |
Get it on GitHub: https://github.com/hannesdatta/musicMetadata | |
LEGACY CODE | |
################################################# | |
# # | |
# Classify music labels # | |
# into major labels (Sony, Warner, Universal), # | |
# and independent labels (all others) # | |
# # | |
# Maintained by: # | |
# [email protected] # | |
# # | |
# COMMENTS AND FEEDBACK WELCOME! # | |
# # | |
# Acknowledgements: # | |
# Thanks to Robbert Oudelaar # | |
# (https://tinyurl.com/r36u6ly) for debugging # | |
# # | |
# # | |
# # | |
# # | |
################################################# | |
# In the script below, I use a series of
# regular expressions (for an extensive tutorial,
# see https://www.hackerearth.com/practice/machine-learning/advanced-techniques/regular-expressions-string-manipulation-r/tutorial/)
# to identify major labels (versus independent
# labels) in a list of label names (as available on
# Spotify).
# For a guide on how to contribute to this Gist, see http://tilburgsciencehub.com/workflow/collaboration/
# Read the label list: 'labels.csv' inside 'labels.zip', tab-separated with a
# header row. The classification code further down expects a `label` column.
# Quoting and comment handling are both switched off so that quote characters
# or a '#' inside a label name cannot split or truncate a row.
labels <- read.table(
  unz("labels.zip", "labels.csv"),
  header = TRUE,
  quote = "",
  comment.char = "",
  encoding = "UTF-8",
  sep = "\t"
)
# Regular expressions identifying labels that belong to Warner.
#
# Named character vector: each name is a (sub-)label, each value is the
# regular expression for it. The classification loop applies every pattern
# with grepl(..., ignore.case = TRUE), so letter case inside the patterns
# does not matter. `[|]` matches a literal pipe character and `[/]` a literal
# slash (presumably separators inside multi-label names -- TODO confirm).
labels_warner <- c(
  "Warner Music" = "warner[ ]music|warner[ ]records|warner[ ]home|warner[ ]special|warner[ ]strategic|warner[.]esp|Warner[ ][A-Z]",
  "Asylum Records" = "^asylum$|asylum[/]|[/]asylum|asylum[ ]records|([|]|^)atlantic[ ]records|elektra asylum|atlantic[ ]|([|]|^)atlantic([|]|$)|elektra[ ]records|[/]elektra|elektra[/]|([|]|^)elektra([|]|$)|warner[ ]music[ ]nashville|warner[ ]bros|elektra[ ]nashville",
  "Big Beats Records" = "big[ ]beat[ ]|([|]|^)big beat([|]|$)|[/]big beat|big beat[/]|big beat$",
  "Canvasback Music" = "canvasback",
  "Parlophone Label Group" = "parlophone|FFR[ ]records|([|]|^)FFRR([|]|$)|virgin[ ]classics|emi[ ]classics|[ ]erato[ ]|([|]|^)erato|warner[ ]classics|([|]|^)erato[ ]|[/]erato|erato[/]",
  "Reprise Records" = "([|]|^)reprise[ ]|[/]reprise|reprise[/]",
  "Fueled By Ramen" = "Fueled[ ]by[ ]ramen",
  "Nonesuch Records" = "nonesuch[ ]records|([|]|^)nonesuch([|]|$)|[/]nonesuch|nonesuch[/]",
  "Rhino Entertainment" = "rhino[ ]entertainment|([|]|^)rhino|[/]rhino|rhino[/]",
  "Roadrunner Records" = "([|]|^)roadrunner|[/]roadrunner|roadrunner[/]",
  "Sire Records" = "([|]|^)sire[ ]records|([|]|^)sire[ ]|([|]|^)sire[ ]([|]|$)|[/]sire|sire[/]",
  "East West" = "east[ ]west|eastwest",
  # Catch-all: the group name itself plus the full sub-label names.
  "Warner (all combined)" = "([|]|^)warner|[(]warner[)]|asylum[ ]records|big[ ]beat[ ]records|canvasback[ ]music|parlophone[ ]label[ ]group|reprise[ ]records|fueled[ ]by[ ]ramen|nonesuch[ ]records|rhino[ ]entertainment|roadrunner[ ]records|sire[ ]records|east[ ]west",
  "WM" = "WM[ ]|WM[/]|[/]WMI|[ ]WMI|WMI[ ]"
)
# Regular expressions identifying labels that belong to Universal.
#
# Named character vector: each name is a (sub-)label, each value is the
# regular expression for it. The classification loop applies every pattern
# with grepl(..., ignore.case = TRUE), so letter case inside the patterns
# does not matter. `[|]` matches a literal pipe character and `[/]` a literal
# slash (presumably separators inside multi-label names -- TODO confirm).
#
# Fix relative to the earlier version: the Verve pattern used `([|]|&)verve`
# where every sibling pattern uses `([|]|^)`; the `&` was a typo for the
# start-of-string anchor, so a label that is exactly "Verve" never matched.
labels_universal <- c(
  "Universal Music Group" = "([|]|^)universal|[/]universal|universal[/]|([|]|^)universal[ ]music[ ]japan|([|]|^)universal[ ]sigma|([|]|^)universal[ ]international|([|]|^)geneon[ ]universal|nbcuniversal|universal[ ]licensing[ ]music|([|]|^)universal[ ]music[ ]|universal[ ]music[ ]spain|universal[ ]m..z.k|([|]|^)universal records|([|]|^)universal[ ]records[ ]|([|]|^)universal[ ]republic[ ]records",
  "Capitol Music Group" = "capitol|astralwerks|blue[ ]{0,2}note|([|]|^)caroline[ ]|deep[ ]{0,2}well|([|]|^)harvest|([|]|^)metamorphosis|motown|quality[ ]{0,2}control|([|]|^)virgin[ ]|([|]|^)virgin([|]|$)|[/]virgin|virgin[/]",
  "Decca Classics" = "decca|ecm([ ]|$|[|])|([|]|^)mercury|([|]|^)mercury[ ]classics|([|]|^)mercury[ ]records|mercury[ ]records",
  # NOTE(review): the dots in `g.o.o.d` are unescaped and match any character;
  # left unchanged here -- confirm whether `g[.]o[.]o[.]d` was intended.
  "Def Jam Recordings" = "def[ ]{0,2}jam|artium|g.o.o.d|([|]|^)good([|]|$)|good[ ]records",
  "Deutsche Grammophon" = "deutsche[ ]grammophon|grammophon",
  "Eagle Rock Entertainment" = "eagle[ ]rock|eagle[ ]records",
  # original EMI 'EMI' = 'emi[ ]|([|]|^)emi|([|]|^)emi[ ]music|emi[-]|[/]emi|emi[/]',
  "EMI" = "emi[ ]|([|]|^)emi[ ]music|[/]emi|emi[/]|[-]emi|([|]|^)emi([|]|$)|[ ]emi([|]|$)",
  "Interscope" = "interscope|geffen|A[&]M|([|]|^)222([|]|$)|aftermath|darkroom|dreamville|LVRN Records|Mad Love|insomniac[ ]|kidinakorner|shady[ ]{0,2}records|([|]|^)shady",
  "Island Records" = "([|]|^)island[ ]records|4th & Broadway|universal[ ]island|([|]|^)island[ ]|([|]|^)island([|]|$)|[/]island|island[/]",
  "Polydor Records" = "([|]|^)polydor[ ]|([|]|^)polydor[ ]([|]|$)|([|]|^)fiction[ ]records|([|]|^)fiction([|]|$)|polydor",
  "Republic Records" = "([|]|^)republic[ ]records|[/]republic|republic[/]|universal[ ]republic|([|]|^)american[ ]recordings|([|]|^)Brushfire[ ]records|([|]|^)casablanca[ ]records|([|]|^)cash[ ]money[ ]records|john[ ]varvatos|([|]|^)lava[ ]records|lightworkers|([|]|^)the[ ]voice",
  "Republic Records 2" = "([|]|^)republic[ ]records|([|]|^)american[ ]recordings|([|]|^)Brushfire[ ]records|([|]|^)casablanca[ ]records|([|]|^)cash[ ]money[ ]records|john[ ]varvatos|([|]|^)lava[ ]records|lightworkers|([|]|^)the[ ]voice",
  "Universal Music Enterprises" = "([|]|^)universal[ ]|([|]|^)universal([|]|$)|T[-]{0,1}boy",
  "Universal Music Group Nashville" = "capitol[ ]{0,1}records[ ]{0,1}nashville|emi[ ]{0,1}records[ ]{0,1}nashville|mca[ ]{0,1}nashville|mercury[ ]{0,1}nashville|show[-]{0,1}dog",
  "Universal Music Latin Entertainment" = "capitol[ ]{0,1}latin|disa[ ]{0,1}records|fonovisa|machete[ ]{0,1}music|universal[ ]{0,1}music[ ]{0,1}latino",
  # `([|]|^)verve([|]|$)`: "verve" as a complete name or complete pipe-delimited
  # segment (previously mistyped with `&` in place of `^`).
  "Verve Label Group" = "verve[ ]label[ ]group|verve[/]|[/]verve|verve[ ]{0,1}records|decca[ ]{0,1}gold|universal[ ]{0,1}music[ ]{0,1}classics|decca[ ]{0,1}broadway|verve[ ]{0,1}group|([|]|^)verve([|]|$)|verve[ ]{0,1}music[ ]{0,1}group",
  "PM:AM Recordings" = "PM[:]AM|pm[ ]{0,1}am",
  "Spinefarm Records" = "spinefarm",
  "SpinnUp" = "SpinnUp",
  "Disques Barclay" = "disques[ ]{0,1}barclay|([|]|^)barclay",
  # `.` presumably stands in for the accented character in the name -- TODO confirm.
  "Varese Sarabande" = "var.se",
  "Digital Distribution Trinidad and Tobago" = "Digital Distribution Trinidad and Tobago",
  # Catch-all: the full sub-label names.
  "Universal Music (combined)" = "varese[ ]sarabande|disques[ ]barclay|spinnup|spinefarm[ ]records|pm:am[ ]recordings|verve[ ]label[ ]group|universal[ ]music[ ]latin[ ]entertainment|universal[ ]music[ ]group[ ]nashville|universal[ ]music[ ]enterprises|republic[ ]records|polydor[ ]records|island[ ]records|interscope|eagle[ ]rock[ ]entertainment|deutsche[ ]grammophon|def[ ]jam[ ]recordings|decca[ ]classics|capitol[ ]music[ ]group|universal[ ]music[ ]group",
  "Abbey Road" = "abbey[ ]road|bravad",
  "others" = "xo records|young money|shady records|spinefarm rec|u[-]live",
  "UMe" = "([|]|^)UMe[ ]|Avenue Records/UMe|Alpha Dog 2T/UMe|([|]|^)UMe([|]|$)"
)
# Regular expressions identifying labels that belong to Sony.
#
# Named character vector: each name is a (sub-)label, each value is the
# regular expression for it. The classification loop applies every pattern
# with grepl(..., ignore.case = TRUE), so letter case inside the patterns
# does not matter. `[|]` matches a literal pipe character and `[/]` a literal
# slash (presumably separators inside multi-label names -- TODO confirm).
labels_sony <- c(
  "Columbia Records" = "CBS[ ]columbia|([|]|^)columbia|hypnotize[ ]minds|[/]columbia|columbia[/]",
  "Columbia Records 2" = "dreamville[ ]entertainment|small.*giant|startime[ ]international|blue[ ]propaganda",
  "RCA Records" = "([|]|^)rca|([|]|^)bystorm.*entertainment|([|]|^)nappy[ ]boy|([|]|^)j[ ]records",
  "Epic Records" = "([|]|^)epic|([|]|^)battery|([|]|^)freebandz|([|]|^)bad[ ]{0,1}boy[ ]records|([|]|^)volcano|vested[ ]in[ ]culture",
  "Sony Music Nashville" = "sony[ ]music|([|]|^)arista|([|]|^)columbia[ ]nashville|rca[ ]records[ ]nashville",
  "Zomba Music Group" = "([|]|^)zomba|([|]|^)jive[ ]records|([|]|^)verity|([|]|^)silvertone",
  "RED Music Distribution" = "([|]|^)red[ ]music[ ]|odd[ ]future|([|]|^)red[ ]ink|cinematic[ ]music|([|]|^)reach[ ]records",
  "Legacy Recordings" = "([|]|^)legacy[ ]recordings|([|]|^)laface",
  "Sony Music Latin" = "sony.*latin",
  "Ariola Records" = "ariola",
  "Sony Masterworks" = "sony[ ]masterworks|([|]|^)bluebird|([|]|^)okeh|portrait[ ]records|([|]|^)portrait|([|]|^)arte[ ]nova|sony[ ]classical|flying[ ]buddha|([|]|^)masterworks",
  "Provident Label Group" = "([|]|^)provident|essential[ ]records|flicker[ ]records|beach[ ]street|reunion[ ]records|essential[ ]worship",
  "Century Media Records" = "century[ ]media|([|]|^)century record|people[ ]like[ ]you|insideout[ ]music|superball[ ]music",
  # Catch-all: group-level names plus the full sub-label names.
  "Sony Music Entertainment" = "Sony[ ]BMG|([|]|^)BMG|columbia[ ]music|sony[ ]music|Columbia[ ]records|RCA[ ]Records|Epic[ ]Records|Sony[ ]Music[ ]Nashville|Zomba[ ]Music[ ]Group|RED[ ]Music[ ]Distribution[ ]|Legacy[ ]Recordings|Sony[ ]Music[ ]Latin|Ariola[ ]Records|Sony[ ]Masterworks|Provident[ ]Label[ ]Group|Century[ ]Media[ ]Records",
  "sony/ATV music publishing" = "Sony/ATV|Sony ATV",
  "Filtr" = "([|]|^)Filtr([|]|$)|([|]|^)Filtr Kids",
  # Single-quoted because the pattern itself contains a double quote.
  "Ultra records (llc)" = '([|]|^)Ultra Records|([|]|^)"Ultra Records',
  "disco:wax" = "disco:wax"
)
# Classify every row of `labels` as "warner", "universal", "sony" or ""
# (no pattern matched; per the file header, all others are independent labels).
#
# Groups are processed in the order listed and each match overwrites the
# previous value, so when a label matches patterns from several groups the
# group listed last wins (sony over universal over warner).
label_iter <- list(
  warner = labels_warner,
  universal = labels_universal,
  sony = labels_sony
)

# rep() instead of a bare "" so that a table with zero rows is also accepted.
labels$label_classified <- rep("", nrow(labels))

for (lbl in names(label_iter)) {
  for (l in label_iter[[lbl]]) {
    # Assign into the column vector through a logical index. The earlier form,
    # labels[<rows>, ]$label_classified = lbl, stops with "replacement has 1
    # row, data has 0" as soon as a pattern matches no row at all.
    labels$label_classified[grepl(l, labels$label, ignore.case = TRUE)] <- lbl
  }
}
# Export the classified table as a tab-separated file (no row names) and
# report how many labels were assigned to each major-label group.
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned.
write.table(
  labels,
  "labels-classified_edit.csv",
  row.names = FALSE,
  sep = "\t"
)

sum(labels$label_classified == "sony")
sum(labels$label_classified == "universal")
sum(labels$label_classified == "warner")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
hi! i've done some changes to the script. will you please integrate them? See: https://gist.github.com/hannesdatta-research/aac9b1ddf8e69794f3f402292cd76b25