Skip to content

Instantly share code, notes, and snippets.

@jlisic
Created January 19, 2021 03:50
Show Gist options
  • Save jlisic/4bda28ebe4cf0c10638f685c84aaba5e to your computer and use it in GitHub Desktop.
Save jlisic/4bda28ebe4cf0c10638f685c84aaba5e to your computer and use it in GitHub Desktop.
A quick R script that converts a 538 chat to an mp3 file using the voices built into macOS. It requires ffmpeg; you will also need to update some of the file locations.
library('rvest')
library('dplyr')
library('magrittr')
# Work from the project directory; the relative tmp/ paths used further down
# are resolved against it.
setwd("~/src/538/")

# Chat to convert. Previously used chats are kept commented out:
#URL <- "https://fivethirtyeight.com/features/who-killed-the-health-care-bill/"
#URL <- "https://fivethirtyeight.com/features/which-gop-senators-might-resurrect-the-health-care-bill/"
URL <- "https://fivethirtyeight.com/features/how-has-the-radical-right-evolved-under-trump/"

# The last path component of the URL names the episode (and the final mp3)
url_parts <- strsplit(URL, "/")[[1]]
episode <- last(url_parts)

# ---- get the blog post ----
blog_538 <- read_html(URL)

# ---- get politics talk ----
# Text of every <p> node found underneath the #content node
content_nodes <- html_nodes(blog_538, "#content")
paragraph_nodes <- html_nodes(content_nodes, "p")
politics_talk <- html_text(paragraph_nodes)
#######################################
# Layout of politics_talk as noted by the original author:
#   first element  : date and time
#   second element : "A FiveThirtyEightChat"
#   third element  : "Filed under Health Care"
#   fourth element : the intro (it may actually span several elements)
# The informational lines are cut off further down, at "Filed under".
#######################################
#######################################
# find the speaker introductions
#######################################
# Only paragraphs containing "):" can introduce a speaker. The original code
# computed this filter and then overwrote it by splitting every paragraph, so
# any paragraph starting with "word (" could be taken for a speaker.
has_intro <- grepl("):", politics_talk, fixed = TRUE)

# Keep the text in front of the first "):" of each candidate paragraph.
# vapply() guarantees a character vector even when there are no candidates.
people_names <- vapply(
  strsplit(politics_talk[has_intro], "):", fixed = TRUE),
  function(parts) parts[1],
  character(1)
)

# A speaker introduction starts with a single word (letters plus . - _ :)
# followed by a space and an opening parenthesis.
people_names <- people_names[
  grepl("^[a-zA-Z]([a-zA-Z]|[.]|[-]|[_]|[:])* [(]", people_names,
        ignore.case = TRUE)
]
names(people_names) <- NULL
#######################################
# get short names
#######################################
# The short name is everything before the first space of an introduction.
# vapply() always yields a character vector; sapply() returned list() when no
# speaker was found, which broke the later c(short_names, 'narrator').
# The result is named by the people_names entries (USE.NAMES default); those
# names are read back later to rebuild the full names.
short_names <- vapply(
  people_names,
  function(full_name) strsplit(full_name, " ", fixed = TRUE)[[1]][1],
  character(1)
)
#######################################
# place holder for reader
#######################################
# One slot per paragraph; NA means "speaker not determined yet".
politics_talk_reader <- rep(NA_character_, length(politics_talk))
for (short_name in short_names) {
  # Literal prefix match: the short name may contain ".", which the previous
  # regex-based match treated as a wildcard.
  politics_talk_reader[startsWith(politics_talk, short_name)] <- short_name
}
#######################################
# identify blanks
#######################################
# nchar() is vectorised, so the paragraphs can be tested in one call
politics_talk_reader[nchar(politics_talk) == 0] <- ""
#######################################
# remove informational lines at the end
#######################################
# Everything after the last empty paragraph that precedes the final
# "filed under" paragraph is marked as not spoken (""). When no empty
# paragraph precedes the marker, every paragraph is marked, as before.
footer_marker <- which(tolower(politics_talk) == "filed under")
if (length(footer_marker) == 0) {
  # Previously a missing marker silently blanked every paragraph; leave the
  # reader assignments untouched instead and tell the user.
  warning('No "filed under" paragraph found; trailing lines were not removed',
          call. = FALSE)
} else {
  last_marker <- max(footer_marker)
  blanks_before <- which(nchar(politics_talk[seq_len(last_marker)]) == 0)
  first_cut <- if (length(blanks_before) > 0) max(blanks_before) + 1 else 1
  politics_talk_reader[first_cut:length(politics_talk)] <- ""
}
#######################################
# pull down names
#######################################
# Carry the most recent speaker forward: paragraphs that are still NA belong
# to whoever spoke last. Entries already set to "" are skipped and do not
# reset the current speaker.
cur_name <- ""
for (i in seq_along(politics_talk)) {
  if (is.na(politics_talk_reader[i])) {
    politics_talk_reader[i] <- cur_name
  } else if (nchar(politics_talk_reader[i]) > 0) {
    cur_name <- politics_talk_reader[i]
  }
}

# The fourth paragraph is the intro and is read by the narrator.
# (The original misspelled the variable here, which stopped the script with
# "object not found".) The length guard keeps a short page from growing the
# vector beyond the number of paragraphs.
narrator_line <- 4L
if (length(politics_talk_reader) >= narrator_line) {
  politics_talk_reader[narrator_line] <- "narrator"
}

politics_talk_df <- data.frame(
  short_names = politics_talk_reader,
  stringsAsFactors = FALSE
)
print(politics_talk_df)
#######################################
# assign voices to people
#######################################
# English voices that can be handed out to speakers. Each trailing comment
# gives the locale tag and the phrase recorded next to it in the original
# listing.
mac_names <- c(
  "Alex",     # en_US       - Most people recognize me by my voice.
  "Daniel",   # en_GB       - Hello, my name is Daniel. I am a British-English voice.
  "Fiona",    # en-scotland - Hello, my name is Fiona. I am a Scottish-English voice.
  "Fred",     # en_US       - I sure like being inside this fancy computer
  "Karen",    # en_AU       - Hello, my name is Karen. I am an Australian-English voice.
  "Moira",    # en_IE       - Hello, my name is Moira. I am an Irish-English voice.
  "Rishi",    # en_IN       - Hello, my name is Rishi. I am an Indian-English voice.
  "Samantha", # en_US       - Hello, my name is Samantha. I am an American-English voice.
  "Tessa",    # en_ZA       - Hello, my name is Tessa. I am a South African-English voice.
  "Veena",    # en_IN       - Hello, my name is Veena. I am an Indian-English voice.
  "Victoria"  # en_US       - Isn't it nice to have a computer that will talk to you?
)
# Lookup table: one row per speaker plus the narrator.
# names(short_names) holds the introduction text up to (not including) the
# closing parenthesis, so ")" is appended to restore it.
# unlist()/unname() keep the column a plain character vector even when
# short_names arrives as an empty list or as a named vector.
name_df <- data.frame(
  full_names = c(sprintf("%s)", names(short_names)), "narrator"),
  short_names = c(unname(unlist(short_names)), "narrator"),
  stringsAsFactors = FALSE
)

# A repeated short name would duplicate rows in the join below and shift
# reader_df out of step with politics_talk, so keep the first occurrence only.
name_df <- name_df[!duplicated(name_df$short_names), , drop = FALSE]

# Give every speaker a random voice. Voices stay unique while there are enough
# of them; with more speakers than voices they are reused instead of failing.
n_speakers <- NROW(name_df)
name_df$mac_names <- sample(
  mac_names,
  size = n_speakers,
  replace = n_speakers > length(mac_names)
)
rownames(name_df) <- NULL

# Attach the voice to every paragraph; paragraphs without a speaker get NA
reader_df <- left_join(politics_talk_df, name_df, by = "short_names")
# Speak every attributed paragraph into its own AIFF clip with macOS `say`,
# then join the clips with ffmpeg and convert the result to <episode>.mp3.

# The clips and scratch files live in tmp/; create it when missing
dir.create("tmp", showWarnings = FALSE)

spoken <- logical(NROW(reader_df))
for (i in seq_len(NROW(reader_df))) {
  if (!is.na(politics_talk[[i]]) && !is.na(reader_df$mac_names[i])) {
    txt_path <- sprintf("tmp/tmp%d.txt", i)
    clip_path <- sprintf("tmp/out_%d.aiff", i)

    # `say` reads the paragraph from a scratch file
    writeLines(politics_talk[[i]], con = txt_path)
    status <- system(sprintf(
      "say -v %s -f %s -o %s",
      shQuote(reader_df$mac_names[i]), shQuote(txt_path), shQuote(clip_path)
    ))
    unlink(txt_path)

    # Count the clip only when `say` succeeded and the file exists; a clip
    # listed but missing (e.g. voice not installed) aborts the ffmpeg concat.
    spoken[i] <- isTRUE(status == 0) && file.exists(clip_path)
    if (!spoken[i]) {
      warning(sprintf("say failed for paragraph %d; clip skipped", i),
              call. = FALSE)
    }
  }
}
success <- which(spoken)

if (length(success) == 0) {
  warning("No audio clips were produced; skipping ffmpeg", call. = FALSE)
} else {
  # Clip paths in the concat list are relative to the list file itself
  writeLines(sprintf("file out_%d.aiff", success), con = "tmp/all_files.txt")

  # -y overwrites an output.aiff left behind by an earlier, interrupted run
  system("ffmpeg -y -f concat -i tmp/all_files.txt -c copy output.aiff")
  system(sprintf("ffmpeg -i output.aiff %s -y",
                 shQuote(paste0(episode, ".mp3"))))
  unlink("output.aiff")
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment