Francisco Lima monogenea

Data scientist and blogger

monogenea / 1-poissonGoT.R

Last active October 7, 2019 15:52

	#!/Library/Frameworks/R.framework/Resources/Rscript
	# Mon Apr 15 18:41:47 2019 ------------------------------
	library(rtweet)

	# Twitter API credentials.
	# Replace every "INSERT_HERE" placeholder with the values of your own
	# Twitter app before running this script.
	create_token(
	  app             = "INSERT_HERE",
	  consumer_key    = "INSERT_HERE",
	  consumer_secret = "INSERT_HERE",
	  access_token    = "INSERT_HERE",
	  access_secret   = "INSERT_HERE"
	)

monogenea / 2-poissonGoT.R

Created October 7, 2019 15:48

	# Google Maps API key; see
	# https://developers.google.com/maps/documentation/javascript/get-api-key
	# Replace the "INSERT_HERE" placeholder with your own key. The value is
	# handed to lookup_coords() (apikey argument) when the tweet search is
	# restricted geographically.
	apiKey <- "INSERT_HERE"

monogenea / 3-poissonGoT.R

Created October 7, 2019 15:54

	# Read GOT tweets from US: search for English-language tweets matching
	# "game of thrones", excluding retweets, limited to the area that
	# lookup_coords() returns for "usa".
	newTweets <- search_tweets(
	  q = "game of thrones",
	  n = 1e5, # 1st day 3e5, to go back ~1 week
	  include_rts = FALSE,
	  lang = "en",
	  geocode = lookup_coords("usa", apikey = apiKey),
	  # Spelled out as TRUE: `T` is an ordinary variable that can be reassigned
	  retryonratelimit = TRUE
	)

	# Specify dir (replace INSERT_PATH with the folder used for storage)
	dirPath <- "~/Documents/INSERT_PATH"

	# Create dir for storage

monogenea / 4-poissonGoT.R

Created October 7, 2019 15:55

	# Wed May 8 21:22:45 2019 ------------------------------
	# Use status_id to identify and exclude duplicates
	library(rtweet)

	# List all files in the "tweets" directory, as paths relative to the
	# working directory (e.g. "tweets/<file>").
	# full.names = TRUE builds the paths directly; unlike
	# paste0("tweets/", character(0)), it yields character(0) rather than the
	# bogus entry "tweets/" when the directory is empty or missing.
	allFiles <- list.files("tweets", full.names = TRUE)

	# Write function to merge tweets
	mergeTweets <- function(recipient, donor){
	idx <- !donor$status_id %in% recipient$status_id

monogenea / 5-poissonGoT.R

Created October 7, 2019 15:57

monogenea / 6-poissonGoT.R

Created October 7, 2019 15:58

	# Interpret created_at as UTC, then express the same instants in the
	# "America/New_York" time zone
	allTweets <- dplyr::mutate(
	  allTweets,
	  created_at = with_tz(
	    as_datetime(created_at, tz = "UTC"),
	    tzone = "America/New_York"
	  )
	)

	# Derive lat / lng coordinate columns for each tweet
	allTweets <- lat_lng(allTweets)

	# Plot: set all four figure margins to 12 lines, then draw the
	# state map with thin borders
	par(mar = c(12, 12, 12, 12))
	map("state", lwd = 0.25)
	# plot lat and lng points onto state map

monogenea / 7-poissonGoT.R

Created October 7, 2019 16:00

monogenea / 8-poissonGoT.R

Created October 7, 2019 16:01

	# Identify tweets containing any of the characters names (0/1).
	# One column per entry of gotChars, one row per element of tkn; a cell is 1
	# when at least one token of that element matches the character name.
	# vapply() (instead of sapply()) guarantees exactly one logical per element,
	# so the result type cannot silently change with the shape of the input.
	popularity <- as.data.frame(lapply(gotChars, function(charName) {
	  as.integer(vapply(tkn, function(tweetTokens) {
	    any(tweetTokens %in% charName)
	  }, logical(1), USE.NAMES = FALSE))
	}))

	# Write colnames
	colnames(popularity) <- gotChars

	# Add column with the corresponding tweet timestamp
	# NOTE(review): assumes rows of allTweets line up one-to-one with tkn - confirm
	popularity$created_at <- allTweets$created_at

monogenea / 9-poissonGoT.R

Created October 7, 2019 16:48

	# Sat Oct 5 10:06:01 2019 ------------------------------
	# Bonus - rm bots, time-dependent wordclouds & sentiment analysis
	# Per-account summary: split allTweets by screen_name and compute, for each
	# account, its number of tweets, mean follower count and median retweet
	# count; the per-account one-row data frames are then stacked with rbind.
	rtStats <- do.call(
	  "rbind",
	  by(
	    allTweets,
	    INDICES = allTweets$screen_name,
	    function(userTweets) {
	      data.frame(
	        num_tweets = nrow(userTweets),
	        mean_followers = mean(userTweets$followers_count),
	        median_rt = median(userTweets$retweet_count)
	      )
	    }
	  )
	)

	# Plot log10(num_tweets) vs. log10(median_rt)
	with(log10(rtStats+1), plot(num_tweets, median_rt,

monogenea / 10-poissonGoT.R

Created October 7, 2019 16:51

	# Wordcloud
	# Remove potential bots w/ > 100 tweets in the dataset
	# NOTE(review): assumes the row names of rtStats are screen names - confirm
	bots <- rownames(rtStats)[which(rtStats$num_tweets > 100)]
	reducedTweet <- allTweets[!allTweets$screen_name %in% bots, ]

	# Clean the tweet text:
	#  - as.character() replaces quanteda's texts(), which is deprecated/removed
	#    in current quanteda releases
	#  - iconv(..., sub = "") drops everything that cannot be represented in ASCII
	#  - gsub() replaces <...> tags made of capitals, digits and "+" with a space
	#    (argument spelled out as `replacement`, not the partial match `repl`)
	reducedTweet$text <- as.character(reducedTweet$text) %>%
	  iconv(from = "UTF-8", to = "ASCII", sub = "") %>%
	  gsub(pattern = "<[A-Z+0-9]+>", replacement = " ")

	# Tokenize words
	tkn <- tokens(reducedTweet$text,