Created
March 24, 2020 15:56
-
-
Save hannesdatta/aa26a5feecc75bc6a6f43d62117d38ff to your computer and use it in GitHub Desktop.
clean clear-text artist names from collaborations and secondary artists
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require(stringi) | |
#' Normalise clear-text artist names to a canonical spelling
#'
#' Lower-cases the names, strips articles, leading track numbers, date-like
#' tokens, wrapping punctuation and (optionally) parenthesised suffixes and
#' collaboration qualifiers, then drops every character that is not an ASCII
#' letter, digit or space and collapses repeated spaces.
#'
#' Only base R is used, so the function works without stringr/stringi
#' being attached.
#'
#' @param x Character vector of artist names.
#' @param remove_collabs If `TRUE`, cut the name at the first collaboration
#'   qualifier (e.g. " feat. ", " vs ", "&", ",", " and ").
#' @param remove_parentheses If `TRUE`, drop everything from the first "("
#'   onwards, unless that "(" is the very first character.
#' @return Character vector of the same length as `x` with the cleaned names.
spelling_variants <- function(x,
                              remove_collabs = FALSE,
                              remove_parentheses = TRUE) {
  # Trim leading/trailing whitespace, including Unicode horizontal/vertical
  # space, using base R only.
  trim <- function(s) trimws(s, which = "both", whitespace = "[\\h\\v]")

  # Patterns that introduce a secondary artist; everything from the match to
  # the end of the string is removed when `remove_collabs` is TRUE.
  qualifiers <- c(
    " feat .*", " feat[.].*", " ft.*", " ft[.].*", " featuring.*",
    " vs[.].*", " vs.*", " versus.*", " with.*", "[-].*", " / .*",
    "/.*", "[|].*", "[[].*[]]", "[)].*", ";.*", "[+].*", "[&] .*",
    "[&].*", ",.*", " and .*", " con .*", " e .*", " et .*", " x .*"
  )

  # Remove the article "a".
  # NOTE(review): the replacement is the empty string, so the neighbouring
  # words are joined ("once a day" -> "onceday"); kept as in the original.
  ret <- gsub(" a ", "", tolower(trim(x)))

  removes <- c(
    # Article "the".
    # NOTE(review): not anchored to a word boundary, so it also hits word
    # endings such as "breathe "; kept as in the original.
    "the ",
    # Digits wrapped in parentheses at the beginning of the string.
    "^[(][[:digit:]]+[)]",
    # Digits led by "#" at the beginning of the string.
    "^[#][[:digit:]]{1,5}[[:punct:]|[:space:]]",
    # Digits led by "0" at the beginning of the string.
    "^[0][[:digit:]]{1,2}[[:punct:]|[:space:]]",
    # Leading digits followed by punctuation or a space
    # (e.g. "12 - Hello" keeps "- Hello").
    "^[[:digit:]]{1,3}[[:punct:]|[:space:]]",
    # Date-like tokens such as "12/05/2019".
    "[[:digit:]]{2}[[:punct:]][[:digit:]]{2}[[:punct:]][[:digit:]]{2,4}"
  )
  for (rem in removes) {
    ret <- trim(gsub(rem, "", ret))
  }

  # Remove a dash occurring within the first few characters (e.g. "jay-z").
  ret <- gsub("(^[a-z]{0,3})([-])", "\\1", ret)

  # Peel off up to five layers of punctuation that wrap the name on BOTH
  # sides (first and last character are both punctuation).
  for (i in seq_len(5L)) {
    n <- nchar(ret)
    wrapped <- grepl(
      "[[:punct:]]{2}",
      paste0(substr(ret, 1L, 1L), substr(ret, n, n))
    )
    # `wrapped` is logical: TRUE shifts the start forward / the end back by 1.
    ret <- trim(substr(ret, 1L + wrapped, n - wrapped))
  }

  if (remove_parentheses) {
    # Remove content from the first "(" onwards, unless it opens the string.
    ret <- gsub("(?<!^)[(].*", "", ret, perl = TRUE)
  }

  # Remove a single leading dash, plus or opening square bracket.
  ret <- gsub("^[-]|^[+]|^[[]", "", ret)

  if (remove_collabs) {
    # Cut the name at each collaboration qualifier, in order.
    for (qual in qualifiers) {
      ret <- sub(qual, "", trim(ret))
    }
  }

  ret <- trim(sub("30 ", "thirty ", ret))

  # Keep ASCII letters, digits and spaces only.
  ret <- trim(gsub("[^0-9A-Za-z ]", "", ret))

  # Collapse runs of spaces left behind by the removals above.
  trim(gsub(" +", " ", ret))
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment