#### prepare workspace
rm(list = ls(all.names = TRUE))
gc()
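# optional convenience step (not part of the original workflow): install
# the packages used below if they are not already present
pkgs <- c("plyr", "stringr", "reshape2", "ggplot2")
install.packages(setdiff(pkgs, rownames(installed.packages())))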
#### get data into the R session
# set R's working directory
setwd("C:/Users/Marwick/Downloads/JSTOR") # change this to where you downloaded the data!
# Get zip file of CSVs from JSTOR and unzip
# this may take a few minutes...
unzip("2013.4.20.FxFmBVYd.zip")
# set working directory to newly created folder
# (within working directory) with lots of CSV files
setwd(paste0(getwd(),"/wordcounts"))
#### get list of data: the CSV files of wordcounts in the wordcounts folder
myfiles <- dir(pattern = "\\.(csv|CSV)$", full.names = TRUE)
# read the CSV files into an R data object (a list of data frames)
library(plyr)
system.time(aawc <- llply(myfiles, read.csv, .progress = "text", .inform = FALSE))
# assign file names to each dataframe in the list
names(aawc) <- myfiles
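# optional sanity check: each element should be a data frame with a
# WORDCOUNTS column (the word) and a WEIGHT column (its count in that article)
str(aawc[[1]])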
#### reshape data
# `untable' each data frame into a flat vector of word tokens: each word
# is repeated as many times as its weight, giving a list with one vector
# of tokens per file
system.time(aawc1 <- lapply(aawc, function(x) rep(x$WORDCOUNTS, times = x$WEIGHT)))
names(aawc1) <- myfiles
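# to see how the expansion works, a toy example with made-up words: each
# word is repeated as many times as its weight
rep(c("axe", "flint"), times = c(2, 1)) # "axe" "axe" "flint"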
#### bring in citations file with biblio data for each paper
setwd("C:/Users/marwick/Desktop/") # change this to the location of the citations.csv file!
cit <- read.csv("citations.CSV")
# replace the forward slash with an underscore so the IDs match the
# filenames, and drop the stray \t that was added during import
library(stringr)
cit$id <- str_extract(chartr('/', '_', cit$id), ".*[^\t]")
# limit list of citations to full length articles only
# note that citation type is not in the correct column
# and that we need \t in there also
citfla <- cit[cit$publisher == 'fla\t',]
# subset from the wordcount data only the full length articles:
# strip the 'wordcounts_' prefix and the file extension from the file
# names so they match the citation IDs
names(aawc1) <- str_replace_all(basename(names(aawc1)), "^wordcounts_|\\.(csv|CSV)$", "")
# subset items in the list of wordcount data whose names are in
# the list of fla citation IDs
aawc2 <- aawc1[which(names(aawc1) %in% citfla$id)]
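# optional check: how many full length articles matched the wordcount files?
length(aawc2)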
# put citation IDs in order with wordcount data names
citfla1 <- merge(names(aawc2), citfla, by.x = 1, by.y = "id")
# create a variable that holds the year of publication for
# each article
citfla1$year <- str_extract(citfla1$issue, "[[:digit:]]{4}")
# now we have a table of citations with a unique ID for each article
# that is linked to the year of publication. We can come back to this
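# optional: check the span of publication years (useful later when
# choosing the x-axis limits for the plots)
range(as.numeric(citfla1$year), na.rm = TRUE)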
#### investigate change in use of certain words of interest over time
# FIRST get the total word count of each article to standardise for
# varying article lengths
w1 <- 'lithic'
leng <- sapply(aawc2, length) # use all words
# now count the word of interest (always lower case)
word <- sapply(aawc2, function(x) sum(x %in% w1))
# calculate ratio
word_ratio <- word/leng
# get years for each article
word_by_year <- data.frame(word_ratio, year = as.numeric(as.character(citfla1$year)))
# visualise
# first just one word
# note: articles where the word never occurs are dropped, since log(0) is -Inf
library(ggplot2)
ggplot(word_by_year, aes(year, log(word_ratio))) +
  geom_point(data = subset(word_by_year, word_ratio > 0)) +
  geom_smooth(aes(group = 1), method = "loess", span = 0.4,
              data = subset(word_by_year, word_ratio > 0)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ylab(paste0("log of frequency of the word '", w1, "'")) +
  # inspect citfla1$year to see min and max year to set axis limits
  scale_x_continuous(limits = c(1935, 2009), breaks = seq(1930, 2010, 2))
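# optional: save the most recent plot to disk; the filename here is just
# an example, change it to suit
ggsave("word_over_time.png", width = 8, height = 6)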
# SECOND, try comparing two words
# now the words of interest (always lower case)
w1 <- 'binford'
w2 <- 'wylie'
word1 <- sapply(aawc2, function(x) sum(x %in% w1))
word2 <- sapply(aawc2, function(x) sum(x %in% w2))
leng <- sapply(aawc2, length) # use all words
# calculate ratios
word1_ratio <- word1/leng
word2_ratio <- word2/leng
# get years for each article and make data frame
twowords_by_year <- data.frame(word1_ratio, word2_ratio, year = as.numeric(as.character(citfla1$year)))
# reshape into a long table to make it easier to work with in ggplot
library(reshape2)
twowords_by_year_melt <- melt(twowords_by_year, id.vars = "year")
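# optional: melt() stacks the two ratio columns into a 'variable' column
# (which word) and a 'value' column (the ratio), one row per article-word pair
head(twowords_by_year_melt)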
# visualise
library(ggplot2)
ggplot(subset(twowords_by_year_melt, value > 0), aes(year, log(value))) +
  geom_point(aes(colour = variable)) +
  geom_smooth(aes(colour = variable), method = "loess", span = 0.4) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ylab("log of word frequency") +
  scale_x_continuous(limits = c(1935, 2009), breaks = seq(1930, 2010, 2)) +
  scale_colour_discrete(labels = c(w1, w2))
# THIRD, try comparing vectors of related words
# now the words of interest (always lower case)
wv1 <- c('he', 'him', 'his')
wv2 <- c('she', 'her', 'hers')
wordv1 <- sapply(aawc2, function(x) sum(x %in% wv1))
wordv2 <- sapply(aawc2, function(x) sum(x %in% wv2))
leng <- sapply(aawc2, length) # use all words
# calculate ratios
wordv1_ratio <- wordv1/leng
wordv2_ratio <- wordv2/leng
# get years for each article and make data frame
twowordvs_by_year <- data.frame(wordv1_ratio, wordv2_ratio, year = as.numeric(as.character(citfla1$year)))
# reshape into a long table to make it easier to work with in ggplot
library(reshape2)
twowordvs_by_year_melt <- melt(twowordvs_by_year, id.vars = "year")
# visualise
library(ggplot2)
ggplot(subset(twowordvs_by_year_melt, value > 0), aes(year, log(value))) +
  geom_point(aes(colour = variable)) +
  geom_smooth(aes(colour = variable), method = "loess", span = 0.4) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  scale_x_continuous(limits = c(1935, 2009), breaks = seq(1930, 2010, 2)) +
  ylab("log of frequency of words") +
  scale_colour_discrete(labels = c(paste(wv1, collapse = ", "), paste(wv2, collapse = ", ")))
## FOURTH investigate correlations between words over time
cw1 <- 'gender'
cw2 <- 'she'
cword1 <- sapply(aawc2, function(x) sum(x %in% cw1))
cword2 <- sapply(aawc2, function(x) sum(x %in% cw2))
leng <- sapply(aawc2, length) # use all words
# calculate ratios
cword1_ratio <- cword1/leng
cword2_ratio <- cword2/leng
# get years for each article and make data frame
ctwowords_by_year <- data.frame(ww1 = cword1_ratio, ww2 = cword2_ratio, year = as.numeric(as.character(citfla1$year)))
# calculate correlations of the two words per year (and p-values)
library(plyr)
corrp <- ddply(ctwowords_by_year, .(year), summarize, "corr" = cor.test(ww1, ww2)$estimate, "pval" = cor.test(ww1, ww2)$p.value)
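# note: cor.test() errors when a year has fewer than 3 articles, or when a
# word's ratio never varies within a year. A guarded variant, a sketch
# assuming you would rather get NA than an error for such years:
corrp <- ddply(ctwowords_by_year, .(year), function(df) {
  if (nrow(df) < 3 || sd(df$ww1) == 0 || sd(df$ww2) == 0)
    return(data.frame(corr = NA, pval = NA))
  ct <- cor.test(df$ww1, df$ww2)
  data.frame(corr = ct$estimate, pval = ct$p.value)
})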
# visualise
library(ggplot2)
ggplot(corrp, aes(year, corr)) +
  geom_point(aes(size = -pval)) + # negate p so that smaller p-values plot larger
  geom_smooth(method = "loess", span = 0.4) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  geom_hline(yintercept = 0, colour = "red") +
  ylab(paste0("correlation between '", cw1, "' and '", cw2, "'")) +
  scale_x_continuous(limits = c(1935, 2009), breaks = seq(1930, 2010, 2)) +
  scale_size_continuous("p-values", breaks = c(-0.75, -0.25, -0.05, -0.001),
                        labels = c(0.75, 0.25, 0.05, 0.001))