#### prepare workspace
rm(list = ls(all.names = TRUE))
gc()
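# optional convenience step (not part of the original workflow): install
# the packages used below if they are not already present
pkgs <- c("plyr", "stringr", "reshape2", "ggplot2")
install.packages(setdiff(pkgs, rownames(installed.packages())))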
#### get data into the R session
# set R's working directory
setwd("C:/Users/Marwick/Downloads/JSTOR") # change this to where you downloaded the data!
# Get zip file of CSVs from JSTOR and unzip
# this may take a few minutes...
unzip("2013.4.20.FxFmBVYd.zip")
# set working directory to newly created folder
# (within working directory) with lots of CSV files
setwd(paste0(getwd(),"/wordcounts"))
#### get list of data: the CSV files of wordcounts in the wordcounts folder
myfiles <- dir(pattern = "\\.(csv|CSV)$", full.names = TRUE)
# read the CSV files into an R data object (a list of data frames)
library(plyr)
system.time(aawc <- llply(myfiles, read.csv, .progress = "text", .inform = FALSE))
# assign file names to each dataframe in the list
names(aawc) <- myfiles
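# optional sanity check: each element should be a data frame with a
# WORDCOUNTS column (the word) and a WEIGHT column (its count in that article)
str(aawc[[1]])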
#### reshape data
# `untable' each data frame into a flat vector of word tokens: each word
# is repeated as many times as its weight, giving a list with one vector
# of tokens per file
system.time(aawc1 <- lapply(aawc, function(x) rep(x$WORDCOUNTS, times = x$WEIGHT)))
names(aawc1) <- myfiles
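# to see how the expansion works, a toy example with made-up words: each
# word is repeated as many times as its weight
rep(c("axe", "flint"), times = c(2, 1)) # "axe" "axe" "flint"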
#### bring in citations file with biblio data for each paper
setwd("C:/Users/marwick/Desktop/") # change this to the location of the citations.csv file!
cit <- read.csv("citations.CSV")
# replace the forward slash with an underscore so the IDs match the
# filenames, and drop the stray \t that was added during import
library(stringr)
cit$id <- str_extract(chartr('/', '_', cit$id), ".*[^\t]")
# limit list of citations to full length articles only
# note that citation type is not in the correct column
# and that we need \t in there also
citfla <- cit[cit$publisher == 'fla\t',]
# subset from the wordcount data only the full length articles:
# strip the 'wordcounts_' prefix and the file extension from the file
# names so they match the citation IDs
names(aawc1) <- str_replace_all(basename(names(aawc1)), "^wordcounts_|\\.(csv|CSV)$", "")
# subset items in the list of wordcount data whose names are in
# the list of fla citation IDs
aawc2 <- aawc1[which(names(aawc1) %in% citfla$id)]
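# optional check: how many full length articles matched the wordcount files?
length(aawc2)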
# put citation IDs in order with wordcount data names
citfla1 <- merge(names(aawc2), citfla, by.x = 1, by.y = "id")
# create a variable that holds the year of publication for
# each article
citfla1$year <- str_extract(citfla1$issue, "[[:digit:]]{4}")
# now we have a table of citations with a unique ID for each article
# that is linked to the year of publication. We can come back to this
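# optional: check the span of publication years (useful later when
# choosing the x-axis limits for the plots)
range(as.numeric(citfla1$year), na.rm = TRUE)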
#### investigate change in use of certain words of interest over time
# FIRST get the total word count of each article to standardise for
# varying article lengths
w1 <- 'lithic'
leng <- sapply(aawc2, length) # use all words
# now count the word of interest (always lower case)
word <- sapply(aawc2, function(x) sum(x %in% w1))
# calculate ratio
word_ratio <- word/leng
# get years for each article
word_by_year <- data.frame(word_ratio, year = as.numeric(as.character(citfla1$year)))
# visualise
# first just one word
# note: articles where the word never occurs are dropped, since log(0) is -Inf
library(ggplot2)
ggplot(word_by_year, aes(year, log(word_ratio))) +
  geom_point(data = subset(word_by_year, word_ratio > 0)) +
  geom_smooth(aes(group = 1), method = "loess", span = 0.4,
              data = subset(word_by_year, word_ratio > 0)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ylab(paste0("log of frequency of the word '", w1, "'")) +
  # inspect citfla1$year to see min and max year to set axis limits
  scale_x_continuous(limits = c(1935, 2009), breaks = seq(1930, 2010, 2))
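# optional: save the most recent plot to disk; the filename here is just
# an example, change it to suit
ggsave("word_over_time.png", width = 8, height = 6)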
# SECOND, try comparing two words
# now the words of interest (always lower case)
w1 <- 'binford'
w2 <- 'wylie'
word1 <- sapply(aawc2, function(x) sum(x %in% w1))
word2 <- sapply(aawc2, function(x) sum(x %in% w2))
leng <- sapply(aawc2, length) # use all words
# calculate ratios
word1_ratio <- word1/leng
word2_ratio <- word2/leng
# get years for each article and make data frame
twowords_by_year <- data.frame(word1_ratio, word2_ratio, year = as.numeric(as.character(citfla1$year)))
# reshape into a long table to make it easier to work with in ggplot
library(reshape2)
twowords_by_year_melt <- melt(twowords_by_year, id.vars = "year")
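# optional: melt() stacks the two ratio columns into a 'variable' column
# (which word) and a 'value' column (the ratio), one row per article-word pair
head(twowords_by_year_melt)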
# visualise
library(ggplot2)
ggplot(subset(twowords_by_year_melt, value > 0), aes(year, log(value))) +
  geom_point(aes(colour = variable)) +
  geom_smooth(aes(colour = variable), method = "loess", span = 0.4) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ylab("log of word frequency") +
  scale_x_continuous(limits = c(1935, 2009), breaks = seq(1930, 2010, 2)) +
  scale_colour_discrete(labels = c(w1, w2))
# THIRD, try comparing vectors of related words
# now the words of interest (always lower case)
wv1 <- c('he', 'him', 'his')
wv2 <- c('she', 'her', 'hers')
wordv1 <- sapply(aawc2, function(x) sum(x %in% wv1))
wordv2 <- sapply(aawc2, function(x) sum(x %in% wv2))
leng <- sapply(aawc2, length) # use all words
# calculate ratios
wordv1_ratio <- wordv1/leng
wordv2_ratio <- wordv2/leng
# get years for each article and make data frame
twowordvs_by_year <- data.frame(wordv1_ratio, wordv2_ratio, year = as.numeric(as.character(citfla1$year)))
# reshape into a long table to make it easier to work with in ggplot
library(reshape2)
twowordvs_by_year_melt <- melt(twowordvs_by_year, id.vars = "year")
# visualise
library(ggplot2)
ggplot(subset(twowordvs_by_year_melt, value > 0), aes(year, log(value))) +
  geom_point(aes(colour = variable)) +
  geom_smooth(aes(colour = variable), method = "loess", span = 0.4) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  scale_x_continuous(limits = c(1935, 2009), breaks = seq(1930, 2010, 2)) +
  ylab("log of frequency of words") +
  scale_colour_discrete(labels = c(paste(wv1, collapse = ", "), paste(wv2, collapse = ", ")))
## FOURTH investigate correlations between words over time
cw1 <- 'gender'
cw2 <- 'she'
cword1 <- sapply(aawc2, function(x) sum(x %in% cw1))
cword2 <- sapply(aawc2, function(x) sum(x %in% cw2))
leng <- sapply(aawc2, length) # use all words
# calculate ratios
cword1_ratio <- cword1/leng
cword2_ratio <- cword2/leng
# get years for each article and make data frame
ctwowords_by_year <- data.frame(ww1 = cword1_ratio, ww2 = cword2_ratio, year = as.numeric(as.character(citfla1$year)))
# calculate correlations of the two words per year (and p-values)
library(plyr)
corrp <- ddply(ctwowords_by_year, .(year), summarize, "corr" = cor.test(ww1, ww2)$estimate, "pval" = cor.test(ww1, ww2)$p.value)
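# note: cor.test() errors when a year has fewer than 3 articles, or when a
# word's ratio never varies within a year. A guarded variant, a sketch
# assuming you would rather get NA than an error for such years:
corrp <- ddply(ctwowords_by_year, .(year), function(df) {
  if (nrow(df) < 3 || sd(df$ww1) == 0 || sd(df$ww2) == 0)
    return(data.frame(corr = NA, pval = NA))
  ct <- cor.test(df$ww1, df$ww2)
  data.frame(corr = ct$estimate, pval = ct$p.value)
})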
# visualise
library(ggplot2)
ggplot(corrp, aes(year, corr)) +
  geom_point(aes(size = -pval)) + # negate p so that smaller p-values plot larger
  geom_smooth(method = "loess", span = 0.4) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  geom_hline(yintercept = 0, colour = "red") +
  ylab(paste0("correlation between '", cw1, "' and '", cw2, "'")) +
  scale_x_continuous(limits = c(1935, 2009), breaks = seq(1930, 2010, 2)) +
  scale_size_continuous("p-values", breaks = c(-0.75, -0.25, -0.05, -0.001),
                        labels = c(0.75, 0.25, 0.05, 0.001))