## Source article: http://polygraph.cool/films/
## Data repository: https://github.com/matthewfdaniels/scripts
##
## Downloads the per-character and per-film tables, blanks out implausible
## ages, merges the two tables, writes the result to
## "characters_with_film.csv", and plots the age density by gender.

## Attach the plotting dependency first so that a missing package stops the
## script before anything is downloaded or written to disk.
library(ggplot2)

x <- "https://raw.githubusercontent.com/matthewfdaniels/scripts/master/data/character_list5.csv"
characters <- read.csv(
  x,
  na.strings = c("NULL", "?"),
  fileEncoding = "ISO-8859-1",
  stringsAsFactors = FALSE
)

## some ages are clearly (negative) birth years ... oops
## Ages above 105 are discarded as well. which() ignores the NA results of
## the comparisons, so rows that are already missing are left untouched.
characters$age[which(characters$age < 0 | characters$age > 105)] <- NA

y <- "https://raw.githubusercontent.com/matthewfdaniels/scripts/master/data/meta_data7.csv"
films <- read.csv(
  y,
  fileEncoding = "ISO-8859-1",
  stringsAsFactors = FALSE,
  ## "NULL" (the string) is the documented colClasses value for skipping a
  ## column; unnamed columns keep their default type detection.
  colClasses = c(lines_data = "NULL")
)

## setequal(characters$script_id, films$script_id)
## wow, a pleasant surprise

## Natural join: merge() matches on every column name the two tables share
## (script_id is one of them, per the check above).
df <- merge(characters, films)

## write.csv() doubles embedded double quotes (qmethod = "double"), so the
## file reads back cleanly with read.csv(); write.table()'s default would
## backslash-escape them instead.
write.csv(df, "characters_with_film.csv", row.names = FALSE)

## Age distribution per gender, ignoring characters with unknown gender.
ggplot(subset(df, !is.na(gender)), aes(x = age, colour = gender)) +
  geom_density()