jlisic · January 19, 2021 03:50
diff --git a/538_pseudo_cast.R b/538_pseudo_cast.R
 library('rvest')
 library('dplyr')
 library('magrittr')

 # NOTE(review): setwd() pins the script to one machine; it is kept because the
 # relative tmp/ and output paths used further down are resolved against it.
 setwd('~/src/538/')

 # this is the blog post we are interested in
 #URL <- "https://fivethirtyeight.com/features/who-killed-the-health-care-bill/"
 #URL <- "https://fivethirtyeight.com/features/which-gop-senators-might-resurrect-the-health-care-bill/"
 URL <- "https://fivethirtyeight.com/features/how-has-the-radical-right-evolved-under-trump/"

 # last path component of the URL; used further down to name the mp3
 episode <- last(strsplit(URL, '/')[[1]])

 #######################################
 # get the blog post
 #######################################
 blog_538 <- read_html(URL)

 #######################################
 # get politics talk
 # (text of every <p> node under #content, one string per node)
 #######################################
 politics_talk <- blog_538 %>%
   html_nodes("#content") %>%
   html_nodes("p") %>%
   html_text()

 #######################################
 # first index is date and time
 # second index is "A FiveThirtyEightChat"
 # third index is "Filed under Health Care"
 # fourth is the intro, it may actually be multiple lines, neh
 #######################################

 #######################################
 # we will terminate on "Filed under"
 #######################################

 #######################################
 # find speaker introductions
 # an introduction is a paragraph that contains "):" and whose text in
 # front of it starts with a handle followed by " ("
 #######################################

 # keep only the paragraphs that contain "):" (matched literally)
 people_names <- politics_talk[grepl("):", politics_talk, fixed = TRUE)]

 # Cut each candidate down to the text in front of its first "):".
 # This must run on the filtered candidates, not on all of politics_talk:
 # otherwise any paragraph that merely starts with "word (" would pass the
 # pattern below and be mistaken for a speaker.
 people_names <- vapply(
   people_names,
   function(x) strsplit(x, "):", fixed = TRUE)[[1]][1],
   character(1),
   USE.NAMES = FALSE
 )

 # grepl() is vectorised, so this also works when there are no candidates
 people_names <- people_names[
   grepl("^[a-zA-Z]([a-zA-Z]|[.]|[-]|[_]|[:])* [(]", people_names, ignore.case = TRUE)
 ]

 # one entry per speaker, even if an introduction appears more than once
 people_names <- unique(people_names)
 names(people_names) <- NULL

 #######################################
 # get short names
 # the handle is the text before the first space; the names of this vector
 # keep the full "handle (Full Name" text, which is read again when the
 # voice table is built
 #######################################
 short_names <- vapply(
   people_names,
   function(x) strsplit(x, " ")[[1]][1],
   character(1)
 )

 #######################################
 # place holder for reader
 # NA  = speaker not decided yet
 # ""  = paragraph is not read aloud
 #######################################
 politics_talk_reader <- rep(NA_character_, length(politics_talk))

 for (short_name in short_names) {
   # literal prefix match: handles may contain "." which a regex would
   # treat as a wildcard
   politics_talk_reader[startsWith(politics_talk, short_name)] <- short_name
 }

 #######################################
 # identify blanks
 #######################################
 politics_talk_reader[nchar(politics_talk) == 0] <- ""

 #######################################
 # remove informational lines at the end
 # walk backwards blanking entries; once a paragraph equal to
 # "filed under" (any case) has been seen, stop at the next empty paragraph
 #######################################
 # NOTE(review): if no paragraph equals "filed under" the loop never breaks
 # and every entry is blanked -- confirm this is the intended fallback.
 wait_to_na <- FALSE
 for (i in rev(seq_along(politics_talk))) {
   if (tolower(politics_talk[[i]]) == "filed under") {
     wait_to_na <- TRUE
   }
   if (wait_to_na && nchar(politics_talk[[i]]) == 0) {
     break
   }

   politics_talk_reader[i] <- ""
 }

 #######################################
 # pull down names
 # undecided paragraphs inherit the most recent speaker above them
 #######################################
 cur_name <- ""
 for (i in seq_along(politics_talk)) {
   if (is.na(politics_talk_reader[i])) {
     politics_talk_reader[i] <- cur_name
   } else if (nchar(politics_talk_reader[i]) > 0) {
     cur_name <- politics_talk_reader[i]
   }
 }

 # The fourth paragraph (the intro, per the layout notes near the top of the
 # script) is read by the narrator. Guarded so a short page does not extend
 # the vector past the number of paragraphs.
 if (length(politics_talk_reader) >= 4) {
   politics_talk_reader[4] <- 'narrator'
 }

 politics_talk_df <- data.frame( short_names = politics_talk_reader, stringsAsFactors = FALSE)
 print(politics_talk_df)

 #######################################
 # assign voices to people
 #######################################
 # voices passed to `say -v`
 mac_names <- c(
   'Alex',                #en_US    # Most people recognize me by my voice.
   'Daniel',              #en_GB    # Hello, my name is Daniel. I am a British-English voice.
   'Fiona',               #en-scotland # Hello, my name is Fiona. I am a Scottish-English voice.
   'Fred',                #en_US    # I sure like being inside this fancy computer
   'Karen',               #en_AU    # Hello, my name is Karen. I am an Australian-English voice.
   'Moira',               #en_IE    # Hello, my name is Moira. I am an Irish-English voice.
   'Rishi',               #en_IN    # Hello, my name is Rishi. I am an Indian-English voice.
   'Samantha',            #en_US    # Hello, my name is Samantha. I am an American-English voice.
   'Tessa',               #en_ZA    # Hello, my name is Tessa. I am a South African-English voice.
   'Veena',               #en_IN    # Hello, my name is Veena. I am an Indian-English voice.
   'Victoria'             #en_US    # Isn't it nice to have a computer that will talk to you?
 )

 # create data frame: one row per speaker plus the narrator.
 # names(short_names) holds the text in front of "):" so the closing
 # parenthesis is put back here.
 name_df <- data.frame(
   full_names = c(sprintf("%s)", names(short_names)), 'narrator'),
   short_names = c(unname(short_names), 'narrator'),
   stringsAsFactors = FALSE)

 # A handle must appear only once, otherwise the join below would emit
 # several rows per paragraph and the rows would no longer line up with
 # politics_talk.
 name_df <- name_df[!duplicated(name_df$short_names), , drop = FALSE]
 rownames(name_df) <- NULL

 # Give every speaker a distinct voice while there are enough of them;
 # voices are reused only when speakers outnumber the voices, where sampling
 # without replacement would stop with an error.
 name_df$mac_names <- sample(
   mac_names,
   size = NROW(name_df),
   replace = NROW(name_df) > length(mac_names)
 )

 # record to aiff with play
 # attach a voice to every paragraph; paragraphs without a speaker get NA
 reader_df <- left_join( politics_talk_df, name_df, by="short_names")

 # Synthesize one aiff per spoken paragraph with `say`, then let ffmpeg
 # concatenate the pieces and convert the result to <episode>.mp3.

 # the scratch directory has to exist before anything is written into it
 dir.create('tmp', showWarnings = FALSE)

 n_rows <- NROW(reader_df)
 spoken <- logical(n_rows)   # TRUE where an aiff was actually produced

 for (i in seq_len(n_rows)) {
   if (is.na(politics_talk[[i]]) || is.na(reader_df$mac_names[i])) {
     next
   }

   txt_path <- sprintf('tmp/tmp%d.txt', i)
   aiff_path <- sprintf('tmp/out_%d.aiff', i)

   # write audio
   writeLines(politics_talk[[i]], con = txt_path)
   status <- system(sprintf(
     "say -v %s -f %s -o %s",
     shQuote(reader_df$mac_names[i]), shQuote(txt_path), shQuote(aiff_path)
   ))
   unlink(txt_path)

   # Only list files that really exist: a missing entry makes the ffmpeg
   # concat step fail for the whole episode.
   if (status == 0 && file.exists(aiff_path)) {
     spoken[i] <- TRUE
   } else {
     warning(sprintf("say failed for paragraph %d; skipping it", i), call. = FALSE)
   }
 }

 success <- which(spoken)
 if (length(success) == 0) {
   stop("no paragraph was synthesized, nothing to concatenate", call. = FALSE)
 }

 # entries are relative to the list file, which lives in tmp/ itself
 writeLines(sprintf("file out_%d.aiff", success), con = 'tmp/all_files.txt')

 # -y: overwrite a leftover output.aiff instead of waiting for an answer
 system( "ffmpeg -y -f concat -i tmp/all_files.txt -c copy output.aiff")
 system( sprintf("ffmpeg -i output.aiff %s -y", shQuote(sprintf("%s.mp3", episode))))
 unlink('output.aiff')
	library('rvest')
	library('dplyr')
	library('magrittr')

	# NOTE(review): setwd() pins the script to one machine; it is kept because the
	# relative tmp/ and output paths used further down are resolved against it.
	setwd('~/src/538/')

	# this is the blog post we are interested in
	#URL <- "https://fivethirtyeight.com/features/who-killed-the-health-care-bill/"
	#URL <- "https://fivethirtyeight.com/features/which-gop-senators-might-resurrect-the-health-care-bill/"
	URL <- "https://fivethirtyeight.com/features/how-has-the-radical-right-evolved-under-trump/"

	# last path component of the URL; used further down to name the mp3
	episode <- last(strsplit(URL, '/')[[1]])

	#######################################
	# get the blog post
	#######################################
	blog_538 <- read_html(URL)

	#######################################
	# get politics talk
	# (text of every <p> node under #content, one string per node)
	#######################################
	politics_talk <- blog_538 %>%
	  html_nodes("#content") %>%
	  html_nodes("p") %>%
	  html_text()

	#######################################
	# first index is date and time
	# second index is "A FiveThirtyEightChat"
	# third index is "Filed under Health Care"
	# fourth is the intro, it may actually be multiple lines, neh
	#######################################

	#######################################
	# we will terminate on "Filed under"
	#######################################

	#######################################
	# find speaker introductions
	# an introduction is a paragraph that contains "):" and whose text in
	# front of it starts with a handle followed by " ("
	#######################################

	# keep only the paragraphs that contain "):" (matched literally)
	people_names <- politics_talk[grepl("):", politics_talk, fixed = TRUE)]

	# Cut each candidate down to the text in front of its first "):".
	# This must run on the filtered candidates, not on all of politics_talk:
	# otherwise any paragraph that merely starts with "word (" would pass the
	# pattern below and be mistaken for a speaker.
	people_names <- vapply(
	  people_names,
	  function(x) strsplit(x, "):", fixed = TRUE)[[1]][1],
	  character(1),
	  USE.NAMES = FALSE
	)

	# Alternation is written with a plain "|": "\|" is not a valid escape in
	# an R string literal and stops the script from parsing.
	people_names <- people_names[
	  grepl("^[a-zA-Z]([a-zA-Z]|[.]|[-]|[_]|[:])* [(]", people_names, ignore.case = TRUE)
	]

	# one entry per speaker, even if an introduction appears more than once
	people_names <- unique(people_names)
	names(people_names) <- NULL

	#######################################
	# get short names
	# the handle is the text before the first space; the names of this vector
	# keep the full "handle (Full Name" text, which is read again when the
	# voice table is built
	#######################################
	short_names <- vapply(
	  people_names,
	  function(x) strsplit(x, " ")[[1]][1],
	  character(1)
	)

	#######################################
	# place holder for reader
	# NA  = speaker not decided yet
	# ""  = paragraph is not read aloud
	#######################################
	politics_talk_reader <- rep(NA_character_, length(politics_talk))

	for (short_name in short_names) {
	  # literal prefix match: handles may contain "." which a regex would
	  # treat as a wildcard
	  politics_talk_reader[startsWith(politics_talk, short_name)] <- short_name
	}

	#######################################
	# identify blanks
	#######################################
	politics_talk_reader[nchar(politics_talk) == 0] <- ""

	#######################################
	# remove informational lines at the end
	# walk backwards blanking entries; once a paragraph equal to
	# "filed under" (any case) has been seen, stop at the next empty paragraph
	#######################################
	# NOTE(review): if no paragraph equals "filed under" the loop never breaks
	# and every entry is blanked -- confirm this is the intended fallback.
	wait_to_na <- FALSE
	for (i in rev(seq_along(politics_talk))) {
	  if (tolower(politics_talk[[i]]) == "filed under") {
	    wait_to_na <- TRUE
	  }
	  if (wait_to_na && nchar(politics_talk[[i]]) == 0) {
	    break
	  }

	  politics_talk_reader[i] <- ""
	}

	#######################################
	# pull down names
	# undecided paragraphs inherit the most recent speaker above them
	#######################################
	cur_name <- ""
	for (i in seq_along(politics_talk)) {
	  if (is.na(politics_talk_reader[i])) {
	    politics_talk_reader[i] <- cur_name
	  } else if (nchar(politics_talk_reader[i]) > 0) {
	    cur_name <- politics_talk_reader[i]
	  }
	}

	# The fourth paragraph (the intro, per the layout notes near the top of the
	# script) is read by the narrator. Guarded so a short page does not extend
	# the vector past the number of paragraphs.
	if (length(politics_talk_reader) >= 4) {
	  politics_talk_reader[4] <- 'narrator'
	}

	politics_talk_df <- data.frame( short_names = politics_talk_reader, stringsAsFactors = FALSE)
	print(politics_talk_df)

	#######################################
	# assign voices to people
	#######################################
	# voices passed to `say -v`
	mac_names <- c(
	  'Alex', #en_US # Most people recognize me by my voice.
	  'Daniel', #en_GB # Hello, my name is Daniel. I am a British-English voice.
	  'Fiona', #en-scotland # Hello, my name is Fiona. I am a Scottish-English voice.
	  'Fred', #en_US # I sure like being inside this fancy computer
	  'Karen', #en_AU # Hello, my name is Karen. I am an Australian-English voice.
	  'Moira', #en_IE # Hello, my name is Moira. I am an Irish-English voice.
	  'Rishi', #en_IN # Hello, my name is Rishi. I am an Indian-English voice.
	  'Samantha', #en_US # Hello, my name is Samantha. I am an American-English voice.
	  'Tessa', #en_ZA # Hello, my name is Tessa. I am a South African-English voice.
	  'Veena', #en_IN # Hello, my name is Veena. I am an Indian-English voice.
	  'Victoria' #en_US # Isn't it nice to have a computer that will talk to you?
	)

	# create data frame: one row per speaker plus the narrator.
	# names(short_names) holds the text in front of "):" so the closing
	# parenthesis is put back here.
	name_df <- data.frame(
	  full_names = c(sprintf("%s)", names(short_names)), 'narrator'),
	  short_names = c(unname(short_names), 'narrator'),
	  stringsAsFactors = FALSE)

	# A handle must appear only once, otherwise the join below would emit
	# several rows per paragraph and the rows would no longer line up with
	# politics_talk.
	name_df <- name_df[!duplicated(name_df$short_names), , drop = FALSE]
	rownames(name_df) <- NULL

	# Give every speaker a distinct voice while there are enough of them;
	# voices are reused only when speakers outnumber the voices, where sampling
	# without replacement would stop with an error.
	name_df$mac_names <- sample(
	  mac_names,
	  size = NROW(name_df),
	  replace = NROW(name_df) > length(mac_names)
	)

	# record to aiff with play
	# attach a voice to every paragraph; paragraphs without a speaker get NA
	reader_df <- left_join( politics_talk_df, name_df, by="short_names")

	# Synthesize one aiff per spoken paragraph with `say`, then let ffmpeg
	# concatenate the pieces and convert the result to <episode>.mp3.

	# the scratch directory has to exist before anything is written into it
	dir.create('tmp', showWarnings = FALSE)

	n_rows <- NROW(reader_df)
	spoken <- logical(n_rows)   # TRUE where an aiff was actually produced

	for (i in seq_len(n_rows)) {
	  if (is.na(politics_talk[[i]]) || is.na(reader_df$mac_names[i])) {
	    next
	  }

	  txt_path <- sprintf('tmp/tmp%d.txt', i)
	  aiff_path <- sprintf('tmp/out_%d.aiff', i)

	  # write audio
	  writeLines(politics_talk[[i]], con = txt_path)
	  status <- system(sprintf(
	    "say -v %s -f %s -o %s",
	    shQuote(reader_df$mac_names[i]), shQuote(txt_path), shQuote(aiff_path)
	  ))
	  unlink(txt_path)

	  # Only list files that really exist: a missing entry makes the ffmpeg
	  # concat step fail for the whole episode.
	  if (status == 0 && file.exists(aiff_path)) {
	    spoken[i] <- TRUE
	  } else {
	    warning(sprintf("say failed for paragraph %d; skipping it", i), call. = FALSE)
	  }
	}

	success <- which(spoken)
	if (length(success) == 0) {
	  stop("no paragraph was synthesized, nothing to concatenate", call. = FALSE)
	}

	# entries are relative to the list file, which lives in tmp/ itself
	writeLines(sprintf("file out_%d.aiff", success), con = 'tmp/all_files.txt')

	# -y: overwrite a leftover output.aiff instead of waiting for an answer
	system( "ffmpeg -y -f concat -i tmp/all_files.txt -c copy output.aiff")
	system( sprintf("ffmpeg -i output.aiff %s -y", shQuote(sprintf("%s.mp3", episode))))
	unlink('output.aiff')