Skip to content

Instantly share code, notes, and snippets.

@hannesdatta
Last active July 23, 2020 06:05
Show Gist options
  • Save hannesdatta/19508cba3ab80bf0b2a648bec2480d0e to your computer and use it in GitHub Desktop.
Save hannesdatta/19508cba3ab80bf0b2a648bec2480d0e to your computer and use it in GitHub Desktop.
Classifying music labels into major- and independent labels
This gist has been replaced by an R package with an updated list of labels.
Get it on GitHub: https://github.com/hannesdatta/musicMetadata
LEGACY CODE
#################################################
# #
# Classify music labels #
# into major labels (Sony, Warner, Universal), #
# and independent labels (all others) #
# #
# Maintained by: #
# [email protected] #
# #
# COMMENTS AND FEEDBACK WELCOME! #
# #
# Acknowledgements: #
# Thanks to Robbert Oudelaar #
# (https://tinyurl.com/r36u6ly) for debugging #
# #
# #
# #
# #
#################################################
# In the script below, I use a series of
# regular expressions (for an extensive tutorial,
# see https://www.hackerearth.com/practice/machine-learning/advanced-techniques/regular-expressions-string-manipulation-r/tutorial/)
# to identify major labels (versus independent
# labels) in a list of label names (as available on
# Spotify).
# For a guide on how to contribute to this Gist, see http://tilburgsciencehub.com/workflow/collaboration/
# Read the tab-separated label list straight out of the zip archive.
# The classification step further down matches against the `label` column.
#
# `quote = ""` switches off quote handling and `comment.char = ""` switches
# off comment handling, so label names are read verbatim. With read.table's
# default `comment.char = "#"`, everything after a "#" inside a label name
# would be dropped silently.
labels <- read.table(
  unz("labels.zip", "labels.csv"),
  header = TRUE,
  sep = "\t",
  quote = "",
  comment.char = "",
  encoding = "UTF-8"
)
# Regular expressions identifying labels that belong to Warner Music Group.
#
# Each element is one regex (alternatives separated by `|`); the element names
# are only descriptive. The classification loop applies every pattern with
# `grepl(..., ignore.case = TRUE)`, so letter case in the patterns is cosmetic.
#
# Pattern conventions used throughout:
#   [ ] [/] [.] [|]  a literal space, slash, dot, or pipe character
#   ([|]|^)          match at the start of the string or right after a pipe
#   ([|]|$)          match at the end of the string or right before a pipe
# NOTE(review): the pipe anchors presumably exist because one field can list
# several label names separated by "|" -- confirm against the data.
labels_warner <- c(
  "Warner Music" = "warner[ ]music|warner[ ]records|warner[ ]home|warner[ ]special|warner[ ]strategic|warner[.]esp|Warner[ ][A-Z]",
  "Asylum Records" = "^asylum$|asylum[/]|[/]asylum|asylum[ ]records|([|]|^)atlantic[ ]records|elektra asylum|atlantic[ ]|([|]|^)atlantic([|]|$)|elektra[ ]records|[/]elektra|elektra[/]|([|]|^)elektra([|]|$)|warner[ ]music[ ]nashville|warner[ ]bros|elektra[ ]nashville",
  "Big Beats Records" = "big[ ]beat[ ]|([|]|^)big beat([|]|$)|[/]big beat|big beat[/]|big beat$",
  "Canvasback Music" = "canvasback",
  # `FFRR?[ ]records` replaces `FFR[ ]records`, which could never match
  # "FFRR Records"; the old spelling "FFR Records" is still matched.
  "Parlophone Label Group" = "parlophone|FFRR?[ ]records|([|]|^)FFRR([|]|$)|virgin[ ]classics|emi[ ]classics|[ ]erato[ ]|([|]|^)erato|warner[ ]classics|([|]|^)erato[ ]|[/]erato|erato[/]",
  "Reprise Records" = "([|]|^)reprise[ ]|[/]reprise|reprise[/]",
  "Fueled By Ramen" = "Fueled[ ]by[ ]ramen",
  "Nonesuch Records" = "nonesuch[ ]records|([|]|^)nonesuch([|]|$)|[/]nonesuch|nonesuch[/]",
  "Rhino Entertainment" = "rhino[ ]entertainment|([|]|^)rhino|[/]rhino|rhino[/]",
  "Roadrunner Records" = "([|]|^)roadrunner|[/]roadrunner|roadrunner[/]",
  # `([|]|^)sire([|]|$)` replaces `([|]|^)sire[ ]([|]|$)`. The old alternative
  # was already covered by `([|]|^)sire[ ]`, so a bare "Sire" was never matched.
  "Sire Records" = "([|]|^)sire[ ]records|([|]|^)sire[ ]|([|]|^)sire([|]|$)|[/]sire|sire[/]",
  "East West" = "east[ ]west|eastwest",
  "Warner (all combined)" = "([|]|^)warner|[(]warner[)]|asylum[ ]records|big[ ]beat[ ]records|canvasback[ ]music|parlophone[ ]label[ ]group|reprise[ ]records|fueled[ ]by[ ]ramen|nonesuch[ ]records|rhino[ ]entertainment|roadrunner[ ]records|sire[ ]records|east[ ]west",
  "WM" = "WM[ ]|WM[/]|[/]WMI|[ ]WMI|WMI[ ]"
)
# Regular expressions identifying labels that belong to Universal Music Group.
#
# Each element is one regex (alternatives separated by `|`); the element names
# are only descriptive. The classification loop applies every pattern with
# `grepl(..., ignore.case = TRUE)`, so letter case in the patterns is cosmetic.
#
# Pattern conventions used throughout:
#   [ ] [/] [-] [|]  a literal space, slash, hyphen, or pipe character
#   ([|]|^)          match at the start of the string or right after a pipe
#   ([|]|$)          match at the end of the string or right before a pipe
#   .                any single character (e.g. `var.se`, `m..z.k`)
# NOTE(review): the pipe anchors presumably exist because one field can list
# several label names separated by "|" -- confirm against the data.
labels_universal <- c(
  "Universal Music Group" = "([|]|^)universal|[/]universal|universal[/]|([|]|^)universal[ ]music[ ]japan|([|]|^)universal[ ]sigma|([|]|^)universal[ ]international|([|]|^)geneon[ ]universal|nbcuniversal|universal[ ]licensing[ ]music|([|]|^)universal[ ]music[ ]|universal[ ]music[ ]spain|universal[ ]m..z.k|([|]|^)universal records|([|]|^)universal[ ]records[ ]|([|]|^)universal[ ]republic[ ]records",
  "Capitol Music Group" = "capitol|astralwerks|blue[ ]{0,2}note|([|]|^)caroline[ ]|deep[ ]{0,2}well|([|]|^)harvest|([|]|^)metamorphosis|motown|quality[ ]{0,2}control|([|]|^)virgin[ ]|([|]|^)virgin([|]|$)|[/]virgin|virgin[/]",
  "Decca Classics" = "decca|ecm([ ]|$|[|])|([|]|^)mercury|([|]|^)mercury[ ]classics|([|]|^)mercury[ ]records|mercury[ ]records",
  "Def Jam Recordings" = "def[ ]{0,2}jam|artium|g.o.o.d|([|]|^)good([|]|$)|good[ ]records",
  "Deutsche Grammophon" = "deutsche[ ]grammophon|grammophon",
  "Eagle Rock Entertainment" = "eagle[ ]rock|eagle[ ]records",
  # original EMI 'EMI' = 'emi[ ]|([|]|^)emi|([|]|^)emi[ ]music|emi[-]|[/]emi|emi[/]',
  "EMI" = "emi[ ]|([|]|^)emi[ ]music|[/]emi|emi[/]|[-]emi|([|]|^)emi([|]|$)|[ ]emi([|]|$)",
  "Interscope" = "interscope|geffen|A[&]M|([|]|^)222([|]|$)|aftermath|darkroom|dreamville|LVRN Records|Mad Love|insomniac[ ]|kidinakorner|shady[ ]{0,2}records|([|]|^)shady",
  "Island Records" = "([|]|^)island[ ]records|4th & Broadway|universal[ ]island|([|]|^)island[ ]|([|]|^)island([|]|$)|[/]island|island[/]",
  "Polydor Records" = "([|]|^)polydor[ ]|([|]|^)polydor[ ]([|]|$)|([|]|^)fiction[ ]records|([|]|^)fiction([|]|$)|polydor",
  "Republic Records" = "([|]|^)republic[ ]records|[/]republic|republic[/]|universal[ ]republic|([|]|^)american[ ]recordings|([|]|^)Brushfire[ ]records|([|]|^)casablanca[ ]records|([|]|^)cash[ ]money[ ]records|john[ ]varvatos|([|]|^)lava[ ]records|lightworkers|([|]|^)the[ ]voice",
  "Republic Records 2" = "([|]|^)republic[ ]records|([|]|^)american[ ]recordings|([|]|^)Brushfire[ ]records|([|]|^)casablanca[ ]records|([|]|^)cash[ ]money[ ]records|john[ ]varvatos|([|]|^)lava[ ]records|lightworkers|([|]|^)the[ ]voice",
  "Universal Music Enterprises" = "([|]|^)universal[ ]|([|]|^)universal([|]|$)|T[-]{0,1}boy",
  "Universal Music Group Nashville" = "capitol[ ]{0,1}records[ ]{0,1}nashville|emi[ ]{0,1}records[ ]{0,1}nashville|mca[ ]{0,1}nashville|mercury[ ]{0,1}nashville|show[-]{0,1}dog",
  "Universal Music Latin Entertainment" = "capitol[ ]{0,1}latin|disa[ ]{0,1}records|fonovisa|machete[ ]{0,1}music|universal[ ]{0,1}music[ ]{0,1}latino",
  # `([|]|^)verve([|]|$)` replaces `([|]|&)verve([|]|$)`: the `&` was a typo
  # for the start-of-string anchor `^`, so a bare "Verve" was never matched.
  "Verve Label Group" = "verve[ ]label[ ]group|verve[/]|[/]verve|verve[ ]{0,1}records|decca[ ]{0,1}gold|universal[ ]{0,1}music[ ]{0,1}classics|decca[ ]{0,1}broadway|verve[ ]{0,1}group|([|]|^)verve([|]|$)|verve[ ]{0,1}music[ ]{0,1}group",
  "PM:AM Recordings" = "PM[:]AM|pm[ ]{0,1}am",
  "Spinefarm Records" = "spinefarm",
  "SpinnUp" = "SpinnUp",
  "Disques Barclay" = "disques[ ]{0,1}barclay|([|]|^)barclay",
  "Varese Sarabande" = "var.se",
  "Digital Distribution Trinidad and Tobago" = "Digital Distribution Trinidad and Tobago",
  "Universal Music (combined)" = "varese[ ]sarabande|disques[ ]barclay|spinnup|spinefarm[ ]records|pm:am[ ]recordings|verve[ ]label[ ]group|universal[ ]music[ ]latin[ ]entertainment|universal[ ]music[ ]group[ ]nashville|universal[ ]music[ ]enterprises|republic[ ]records|polydor[ ]records|island[ ]records|interscope|eagle[ ]rock[ ]entertainment|deutsche[ ]grammophon|def[ ]jam[ ]recordings|decca[ ]classics|capitol[ ]music[ ]group|universal[ ]music[ ]group",
  "Abbey Road" = "abbey[ ]road|bravad",
  "others" = "xo records|young money|shady records|spinefarm rec|u[-]live",
  "UMe" = "([|]|^)UMe[ ]|Avenue Records/UMe|Alpha Dog 2T/UMe|([|]|^)UMe([|]|$)"
)
# Regular expressions identifying labels that belong to Sony Music.
#
# One regex per element (alternatives separated by `|`); the element names are
# descriptive only. The classification loop evaluates each pattern with
# `grepl(..., ignore.case = TRUE)`.
#
#   [ ] [/] [|]  a literal space, slash, or pipe character
#   ([|]|^)      start of the string, or directly after a pipe
#   ([|]|$)      end of the string, or directly before a pipe
labels_sony <- c(
  "Columbia Records" = "CBS[ ]columbia|([|]|^)columbia|hypnotize[ ]minds|[/]columbia|columbia[/]",
  "Columbia Records 2" = "dreamville[ ]entertainment|small.*giant|startime[ ]international|blue[ ]propaganda",
  "RCA Records" = "([|]|^)rca|([|]|^)bystorm.*entertainment|([|]|^)nappy[ ]boy|([|]|^)j[ ]records",
  "Epic Records" = "([|]|^)epic|([|]|^)battery|([|]|^)freebandz|([|]|^)bad[ ]{0,1}boy[ ]records|([|]|^)volcano|vested[ ]in[ ]culture",
  "Sony Music Nashville" = "sony[ ]music|([|]|^)arista|([|]|^)columbia[ ]nashville|rca[ ]records[ ]nashville",
  "Zomba Music Group" = "([|]|^)zomba|([|]|^)jive[ ]records|([|]|^)verity|([|]|^)silvertone",
  "RED Music Distribution" = "([|]|^)red[ ]music[ ]|odd[ ]future|([|]|^)red[ ]ink|cinematic[ ]music|([|]|^)reach[ ]records",
  "Legacy Recordings" = "([|]|^)legacy[ ]recordings|([|]|^)laface",
  "Sony Music Latin" = "sony.*latin",
  "Ariola Records" = "ariola",
  "Sony Masterworks" = "sony[ ]masterworks|([|]|^)bluebird|([|]|^)okeh|portrait[ ]records|([|]|^)portrait|([|]|^)arte[ ]nova|sony[ ]classical|flying[ ]buddha|([|]|^)masterworks",
  "Provident Label Group" = "([|]|^)provident|essential[ ]records|flicker[ ]records|beach[ ]street|reunion[ ]records|essential[ ]worship",
  "Century Media Records" = "century[ ]media|([|]|^)century record|people[ ]like[ ]you|insideout[ ]music|superball[ ]music",
  "Sony Music Entertainment" = "Sony[ ]BMG|([|]|^)BMG|columbia[ ]music|sony[ ]music|Columbia[ ]records|RCA[ ]Records|Epic[ ]Records|Sony[ ]Music[ ]Nashville|Zomba[ ]Music[ ]Group|RED[ ]Music[ ]Distribution[ ]|Legacy[ ]Recordings|Sony[ ]Music[ ]Latin|Ariola[ ]Records|Sony[ ]Masterworks|Provident[ ]Label[ ]Group|Century[ ]Media[ ]Records",
  "sony/ATV music publishing" = "Sony/ATV|Sony ATV",
  "Filtr" = "([|]|^)Filtr([|]|$)|([|]|^)Filtr Kids",
  # Single-quoted because the pattern itself contains a double quote.
  "Ultra records (llc)" = '([|]|^)Ultra Records|([|]|^)"Ultra Records',
  "disco:wax" = "disco:wax"
)
# Map each major-label group to its vector of regular expressions.
# Groups are applied in this order and a later match overwrites an earlier
# one, so a label hit by several groups ends up with the last group's name.
label_iter <- list(
  warner = labels_warner,
  universal = labels_universal,
  sony = labels_sony
)

# Labels that match none of the patterns keep the empty string.
labels$label_classified <- ""

for (lbl in names(label_iter)) {
  for (l in label_iter[[lbl]]) {
    matched <- grepl(l, labels$label, ignore.case = TRUE)
    # Assign through a logical index on the column. The previous form,
    # `labels[matched, ]$label_classified = lbl`, stops with "replacement has
    # 1 row, data has 0" as soon as a pattern matches no row at all.
    labels$label_classified[matched] <- lbl
  }
}
# Save the classified table as a tab-separated file without row names.
write.table(
  labels,
  file = "labels-classified_edit.csv",
  sep = "\t",
  row.names = FALSE
)

# Number of rows assigned to each major-label group
# (printed when the script is run interactively or via Rscript).
sum(labels$label_classified == "sony")
sum(labels$label_classified == "universal")
sum(labels$label_classified == "warner")
This file has been truncated, but you can view the full file.
@hannesdatta-research
Copy link

hi! i've done some changes to the script. will you please integrate them? See: https://gist.github.com/hannesdatta-research/aac9b1ddf8e69794f3f402292cd76b25

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment