Created
January 19, 2021 03:50
-
-
Save jlisic/4bda28ebe4cf0c10638f685c84aaba5e to your computer and use it in GitHub Desktop.
A quick R script that converts a FiveThirtyEight (538) chat into an mp3 file using the built-in voices in macOS. It requires ffmpeg; you will also need to update some of the file locations.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library('rvest') | |
library('dplyr') | |
library('magrittr') | |
# Work from the project directory; every relative path below (tmp/, the final
# mp3) is resolved against it.
setwd("~/src/538/")

# The blog post to convert. Earlier targets are kept for reference.
#URL <- "https://fivethirtyeight.com/features/who-killed-the-health-care-bill/"
#URL <- "https://fivethirtyeight.com/features/which-gop-senators-might-resurrect-the-health-care-bill/"
URL <- "https://fivethirtyeight.com/features/how-has-the-radical-right-evolved-under-trump/"

# The episode name is the last path component of the URL (the post's slug).
url_parts <- strsplit(URL, "/")[[1]]
episode <- url_parts[length(url_parts)]

#######################################
# get the blog post
#######################################
blog_538 <- read_html(URL)

#######################################
# get politics talk
#######################################
# Text of every <p> element found under the node with id "content".
content_nodes <- html_nodes(blog_538, "#content")
paragraph_nodes <- html_nodes(content_nodes, "p")
politics_talk <- html_text(paragraph_nodes)
#######################################
# Layout of the scraped paragraphs as observed by the original author
# (not verified here):
#   1st: date and time
#   2nd: "A FiveThirtyEightChat"
#   3rd: "Filed under Health Care"
#   4th: the intro (it may actually span several paragraphs)
# The footer is detected later via "Filed under".
#######################################

# Keep only the paragraphs in which a speaker is introduced, i.e. those
# containing "):" as in "short (Full Name, title): text".
has_intro <- grepl("):", politics_talk, fixed = TRUE)
people_names <- politics_talk[has_intro]

# Keep the part before the first "):" -> "short (Full Name, title".
# This operates on the filtered paragraphs; the original re-read
# `politics_talk` here and silently discarded the filter above.
people_names <- vapply(
  strsplit(people_names, "):", fixed = TRUE),
  function(parts) parts[1],
  character(1)
)

# A valid introduction starts with a handle followed by " (".
people_names <- people_names[
  grepl("^[a-zA-Z]([a-zA-Z]|[.]|[-]|[_]|[:])* [(]", people_names,
        ignore.case = TRUE)
]
names(people_names) <- NULL

#######################################
# get short names
#######################################
# The short name is the handle before the first space. The vector is named by
# the full introduction string, which later code reads via names(short_names).
short_names <- vapply(
  strsplit(people_names, " ", fixed = TRUE),
  function(parts) parts[1],
  character(1)
)
names(short_names) <- people_names
#######################################
# place holder for reader
#######################################
# One speaker label per paragraph; NA means "speaker not known yet".
politics_talk_reader <- rep(NA, length.out = length(politics_talk))

# Label every paragraph that begins with one of the known short names.
for (speaker in short_names) {
  starts_with_speaker <- grepl(sprintf("^%s", speaker), politics_talk)
  politics_talk_reader[starts_with_speaker] <- speaker
}

#######################################
# identify blanks
#######################################
# Empty paragraphs get an empty label so they are never attributed to anyone.
is_blank <- nchar(politics_talk) == 0
politics_talk_reader[is_blank] <- ""
#######################################
# remove informational lines at the end
#######################################
# The footer is located through the last paragraph that reads exactly
# "filed under" (case-insensitive). Everything after the closest empty
# paragraph preceding that marker, through the end of the page, gets an empty
# label so it is never read aloud. If no empty paragraph precedes the marker,
# every paragraph is blanked (same as the original loop).
#
# When the marker is missing the labels are left untouched and a warning is
# raised; the original loop blanked the entire transcript in that case.
marker_hits <- which(tolower(politics_talk) == tolower("filed under"))

if (length(marker_hits) == 0) {
  warning("no 'filed under' paragraph found; footer lines were not removed",
          call. = FALSE)
} else {
  last_marker <- max(marker_hits)
  blank_before <- which(nchar(politics_talk) == 0)
  blank_before <- blank_before[blank_before < last_marker]
  footer_start <- if (length(blank_before) > 0) max(blank_before) + 1 else 1
  politics_talk_reader[footer_start:length(politics_talk)] <- ""
}
#######################################
# pull down names
#######################################
# Carry the most recent speaker forward: a paragraph with no label (NA)
# belongs to whoever spoke last. Empty labels ("") are left alone and do not
# change the current speaker.
cur_name <- ""
for (i in seq_along(politics_talk)) {
  if (is.na(politics_talk_reader[i])) {
    politics_talk_reader[i] <- cur_name
  } else if (nchar(politics_talk_reader[i]) > 0) {
    cur_name <- politics_talk_reader[i]
  }
}

# The fourth paragraph is assumed to be the chat's introduction and is read by
# the narrator. (The original assigned to a misspelled variable,
# `politicas_talk_reader`, which stops the script with "object not found".)
# The length guard keeps the label vector aligned with `politics_talk` on
# unexpectedly short pages.
if (length(politics_talk_reader) >= 4) {
  politics_talk_reader[4] <- "narrator"
}

politics_talk_df <- data.frame(
  short_names = politics_talk_reader,
  stringsAsFactors = FALSE
)
print(politics_talk_df)
#######################################
# assign voices to people
#######################################
# Voice names passed to `say -v`; the trailing comments give each voice's
# locale and sample sentence.
mac_names <- c(
  'Alex',      #en_US        # Most people recognize me by my voice.
  'Daniel',    #en_GB        # Hello, my name is Daniel. I am a British-English voice.
  'Fiona',     #en-scotland  # Hello, my name is Fiona. I am a Scottish-English voice.
  'Fred',      #en_US        # I sure like being inside this fancy computer
  'Karen',     #en_AU        # Hello, my name is Karen. I am an Australian-English voice.
  'Moira',     #en_IE        # Hello, my name is Moira. I am an Irish-English voice.
  'Rishi',     #en_IN        # Hello, my name is Rishi. I am an Indian-English voice.
  'Samantha',  #en_US        # Hello, my name is Samantha. I am an American-English voice.
  'Tessa',     #en_ZA        # Hello, my name is Tessa. I am a South African-English voice.
  'Veena',     #en_IN        # Hello, my name is Veena. I am an Indian-English voice.
  'Victoria'   #en_US        # Isn't it nice to have a computer that will talk to you?
)

# One row per reader: every speaker found in the chat plus the narrator.
name_df <- data.frame(
  full_names = c(sprintf("%s)", names(short_names)), "narrator"),
  short_names = c(as.character(unname(short_names)), "narrator"),
  stringsAsFactors = FALSE
)

# A short name must map to exactly one voice. Duplicate keys would multiply
# rows in the later join and break the row-to-paragraph alignment.
name_df <- name_df[!duplicated(name_df$short_names), , drop = FALSE]

# Give each reader a random voice. Voices are distinct while enough are
# available; with more readers than voices they are reused instead of letting
# sample() fail.
n_readers <- NROW(name_df)
name_df$mac_names <- sample(
  mac_names,
  size = n_readers,
  replace = n_readers > length(mac_names)
)
rownames(name_df) <- NULL
# Render every attributed paragraph to its own AIFF file with the macOS `say`
# command, then concatenate the pieces and encode the result as mp3 with
# ffmpeg.

# Attach a voice to every paragraph; rows stay in paragraph order.
reader_df <- left_join(politics_talk_df, name_df, by = "short_names")

# All intermediate files live in tmp/; make sure it exists.
dir.create("tmp", showWarnings = FALSE)

# spoken[i] is TRUE once paragraph i has been synthesized successfully.
spoken <- logical(NROW(reader_df))
for (i in seq_len(NROW(reader_df))) {
  if (!is.na(politics_talk[[i]]) && !is.na(reader_df$mac_names[i])) {
    # `say` reads its input from a file, so write the paragraph out first.
    txt_path <- sprintf("tmp/tmp%d.txt", i)
    writeLines(politics_talk[[i]], con = txt_path)
    status <- system(sprintf(
      "say -v %s -f tmp/tmp%d.txt -o tmp/out_%d.aiff",
      reader_df$mac_names[i], i, i
    ))
    unlink(txt_path)
    # Only list segments that were actually produced; a missing file would
    # make the concat step fail as a whole.
    if (status == 0) {
      spoken[i] <- TRUE
    } else {
      warning(sprintf("say failed for paragraph %d (exit status %d)", i, status),
              call. = FALSE)
    }
  }
}
success <- which(spoken)

if (length(success) == 0) {
  stop("no audio segments were produced; nothing to concatenate", call. = FALSE)
}

# File list for ffmpeg's concat demuxer; entries are relative to the list file.
writeLines(sprintf("file out_%d.aiff", success), con = "tmp/all_files.txt")

# -y overwrites a stale output.aiff left by an interrupted run instead of
# prompting for confirmation.
system("ffmpeg -y -f concat -i tmp/all_files.txt -c copy output.aiff")
# The episode name comes from the URL, so quote it for the shell.
system(sprintf("ffmpeg -i output.aiff %s -y", shQuote(paste0(episode, ".mp3"))))
unlink("output.aiff")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment