#### prepare workspace
rm(list = ls(all.names = TRUE))
gc()
#### get data into the R session
# set R's working directory
setwd("C:/Users/Marwick/Downloads/JSTOR") # change this to where you downloaded the data!
# Get the zip file of CSVs from JSTOR and unzip it
# this may take a few minutes...
unzip("2013.4.20.FxFmBVYd.zip")
# set the working directory to the newly created folder
# (within the working directory) with lots of CSV files
setwd(paste0(getwd(), "/wordcounts"))
#### get a list of the data: the CSV files of wordcounts in the working directory
myfiles <- dir(pattern = "\\.(csv|CSV)$", full.names = TRUE)
# read the CSV files into an R list object
library(plyr)
system.time(aawc <<- llply(myfiles, read.csv, .progress = "text", .inform = FALSE))
# assign file names to each dataframe in the list
names(aawc) <- myfiles
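# Each wordcount CSV from JSTOR's Data for Research is expected to
# have two columns: WORDCOUNTS (the word itself) and WEIGHT (how many
# times it occurs in that article). A quick look at the first file
# confirms the structure before reshaping:
str(aawc[[1]])
head(aawc[[1]])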
#### reshape data
# `untable' each CSV file: repeat each word as many times as its count,
# giving one flat vector of words per article (lapply rather than
# sapply, so the result is always a list)
system.time(aawc1 <<- lapply(seq_along(aawc), function(x) rep(aawc[[x]]$WORDCOUNTS, times = aawc[[x]]$WEIGHT)))
names(aawc1) <- myfiles
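# A minimal toy example of what this `untable' step does (made-up
# data, not from JSTOR): a two-row wordcount table becomes a flat
# vector with one element per token
toy <- data.frame(WORDCOUNTS = c("stone", "flake"), WEIGHT = c(3, 1))
rep(toy$WORDCOUNTS, times = toy$WEIGHT) # three copies of "stone", one of "flake"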
#### bring in the citations file with biblio data for each paper
setwd("C:/Users/marwick/Desktop/") # change this to the location of the citations.csv file!
cit <- read.csv("citations.CSV")
# replace the forward slash with an underscore so the ids match the
# file names, and drop the stray \t that was added during import
library(stringr)
cit$id <- str_extract(chartr('/', '_', cit$id), ".*[^\t]")
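# For example, a hypothetical id "10.2307/12345\t" becomes
# "10.2307_12345", which matches the wordcount file naming:
str_extract(chartr('/', '_', "10.2307/12345\t"), ".*[^\t]")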
# limit the list of citations to full-length articles only
# note that the citation type is not in the correct column
# (it is in the publisher column) and that we need the \t in there as well
citfla <- cit[cit$publisher == 'fla\t', ]
# subset from the wordcount data only the full-length articles:
# first strip the 'wordcounts_' prefix and '.CSV' extension from the
# file names so they match the citation ids
names(aawc1) <- gsub("^wordcounts_|\\.csv$", "", basename(names(aawc1)), ignore.case = TRUE)
# then keep only the items in the list of wordcount data whose names
# are in the list of fla citation ids
aawc2 <- aawc1[which(names(aawc1) %in% citfla$id)]
# put the citation ids in the same order as the wordcount data names
citfla1 <- merge(names(aawc2), citfla, by.x = 1, by.y = "id")
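# sanity check (an assumption worth verifying on your own download):
# after the merge there should be one citation row per wordcount file
stopifnot(nrow(citfla1) == length(aawc2))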
# create a variable that holds the year of publication for each
# article (the first run of four consecutive digits in the issue field)
citfla1$year <- str_extract(citfla1$issue, "[[:digit:]]{4}")
# now we have a table of citations with a unique id for each article
# that is linked to the year of publication. We can come back to this
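# for example, with a hypothetical issue string the pattern pulls out
# the four-digit year rather than the volume or issue number:
str_extract("Vol. 40, No. 2 (Apr., 1975)", "[[:digit:]]{4}") # "1975"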
#### investigate change in use of certain words of interest over time
# FIRST get the total word count of each article, to standardise for
# varying article lengths
leng <- sapply(seq_along(aawc2), function(i) length(aawc2[[i]])) # use all words
# now count the word of interest (always lower case)
w1 <- 'lithic'
word <- sapply(seq_along(aawc2), function(i) sum(aawc2[[i]] %in% w1))
# calculate the ratio of the word's count to the article's length
word_ratio <- word / leng
# get the year for each article
word_by_year <- data.frame(word_ratio, year = as.numeric(as.character(citfla1$year)))
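# quick check of the result: one row per article, with the word's
# relative frequency and the year of publication
head(word_by_year)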
# visualise
# first just one word
library(ggplot2)
ggplot(word_by_year, aes(year, log(word_ratio))) +
  geom_point(data = subset(word_by_year, word_ratio > 0)) +
  geom_smooth(aes(group = 1), method = "loess", span = 0.4,
              data = subset(word_by_year, word_ratio > 0)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ylab(paste0("log of frequency of the word '", w1, "'")) +
  # inspect citfla1$year to see the min and max year to set the axis limits
  scale_x_continuous(limits = c(1935, 2009), breaks = seq(1930, 2010, 2))
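# to keep a copy of the plot, ggsave() writes the last plot to disk
# (the file name here is just an example):
# ggsave("word_trend.png", width = 8, height = 5)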
# SECOND, try comparing two words
# the words of interest (always lower case)
w1 <- 'binford'
w2 <- 'wylie'
word1 <- sapply(seq_along(aawc2), function(i) sum(aawc2[[i]] %in% w1))
word2 <- sapply(seq_along(aawc2), function(i) sum(aawc2[[i]] %in% w2))
leng <- sapply(seq_along(aawc2), function(i) length(aawc2[[i]])) # use all words
# calculate ratios
word1_ratio <- word1 / leng
word2_ratio <- word2 / leng
# get the year for each article and make a data frame
twowords_by_year <- data.frame(word1_ratio, word2_ratio, year = as.numeric(as.character(citfla1$year)))
# reshape into a long table to make it easier to work with in ggplot
library(reshape2)
twowords_by_year_melt <- melt(twowords_by_year, id.vars = "year")
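# after melting, there is one row per article-and-word combination,
# with columns year, variable (which word) and value (its ratio):
head(twowords_by_year_melt)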
# visualise
library(ggplot2)
ggplot(twowords_by_year_melt, aes(year, log(value))) +
  geom_point(data = subset(twowords_by_year_melt, value > 0), aes(colour = variable)) +
  geom_smooth(data = subset(twowords_by_year_melt, value > 0),
              aes(colour = variable), method = "loess", span = 0.4) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ylab("log of word frequency") +
  scale_x_continuous(limits = c(1935, 2009), breaks = seq(1930, 2010, 2)) +
  scale_colour_discrete(labels = c(w1, w2))
# THIRD, try comparing vectors of related words
# the words of interest (always lower case)
wv1 <- c('he', 'him', 'his')
wv2 <- c('she', 'her', 'hers')
wordv1 <- sapply(seq_along(aawc2), function(i) sum(aawc2[[i]] %in% wv1))
wordv2 <- sapply(seq_along(aawc2), function(i) sum(aawc2[[i]] %in% wv2))
leng <- sapply(seq_along(aawc2), function(i) length(aawc2[[i]])) # use all words
# calculate ratios
wordv1_ratio <- wordv1 / leng
wordv2_ratio <- wordv2 / leng
# get the year for each article and make a data frame
twowordvs_by_year <- data.frame(wordv1_ratio, wordv2_ratio, year = as.numeric(as.character(citfla1$year)))
# reshape into a long table to make it easier to work with in ggplot
library(reshape2)
twowordvs_by_year_melt <- melt(twowordvs_by_year, id.vars = "year")
# visualise
library(ggplot2)
ggplot(twowordvs_by_year_melt, aes(year, log(value))) +
  geom_point(data = subset(twowordvs_by_year_melt, value > 0), aes(colour = variable)) +
  geom_smooth(data = subset(twowordvs_by_year_melt, value > 0),
              aes(colour = variable), method = "loess", span = 0.4) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  scale_x_continuous(limits = c(1935, 2009), breaks = seq(1930, 2010, 2)) +
  ylab("log of frequency of words") +
  scale_colour_discrete(labels = c(paste(wv1, collapse = ", "), paste(wv2, collapse = ", ")))
# FOURTH, investigate correlations between two words over time
cw1 <- 'gender'
cw2 <- 'she'
cword1 <- sapply(seq_along(aawc2), function(i) sum(aawc2[[i]] %in% cw1))
cword2 <- sapply(seq_along(aawc2), function(i) sum(aawc2[[i]] %in% cw2))
leng <- sapply(seq_along(aawc2), function(i) length(aawc2[[i]])) # use all words
# calculate ratios
cword1_ratio <- cword1 / leng
cword2_ratio <- cword2 / leng
# get the year for each article and make a data frame
ctwowords_by_year <- data.frame(ww1 = cword1_ratio, ww2 = cword2_ratio, year = as.numeric(as.character(citfla1$year)))
# calculate the correlation of the two words across the articles
# within each year (and the p-value of each correlation)
library(plyr)
corrp <- ddply(ctwowords_by_year, .(year), summarize, corr = cor.test(ww1, ww2)$estimate, pval = cor.test(ww1, ww2)$p.value)
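# Note: cor.test() stops with an error for years with too few
# articles (fewer than 3 complete pairs). If the call above fails,
# a guarded variant (a sketch, not part of the original analysis)
# returns NA for those years instead:
safe_cor <- function(d) {
  ct <- tryCatch(cor.test(d$ww1, d$ww2), error = function(e) NULL)
  if (is.null(ct)) data.frame(corr = NA, pval = NA)
  else data.frame(corr = unname(ct$estimate), pval = ct$p.value)
}
corrp <- ddply(ctwowords_by_year, .(year), safe_cor)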
# visualise
library(ggplot2)
ggplot(corrp, aes(year, corr)) +
  geom_point(aes(size = -pval)) + # larger points = smaller p-values
  geom_smooth(method = "loess", span = 0.4) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  geom_hline(yintercept = 0, colour = "red") +
  ylab(paste0("correlation between '", cw1, "' and '", cw2, "'")) +
  scale_x_continuous(limits = c(1935, 2009), breaks = seq(1930, 2010, 2)) +
  scale_size_continuous("p-values", breaks = c(-0.75, -0.25, -0.05, -0.001),
                        labels = c(0.75, 0.25, 0.05, 0.001))