### This exercise requires installing a bunch of packages---
### Unfortunately, the precise sequence and rules for installing may vary
### depending upon your computer and configuration.
## ***Taken from Chapter 5 in Kosuke Imai's "Quantitative Social Science"
## Transcribed by Alexis Diamond, all errors my own...
##########################################################################
## First install "tm" and "Snowball", and load the libraries
install.packages("tm") ### this will prompt you for "Yes, no, cancel"
### try "Yes:. If that fails, then retry with "no".
library(tm)
install.packages("SnowballC")
library(SnowballC)
# once you have the above installed, you can proceed to the steps below:
#install.packages("devtools") # if you have not installed devtools already
#devtools::install_github("kosukeimai/qss-package", build_vignettes = TRUE)
#install.packages("qss")
#library(qss)
###
####### STEP 1
# load the text (we call a bunch of text a corpus)
# federalist_dir <- system.file("extdata", "federalist", package = "qss")
# see all the files
#dir(federalist_dir)
#corpus.raw <- Corpus(DirSource(federalist_dir))
corpus.raw <- Corpus(DirSource(directory = "federalist", pattern = "fp"))
# this corpus comes with many different text files built in
# to see text, use "content()" and specify which doc (e.g., the 1st one)
content(corpus.raw[[1]])
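## (optional) quick sanity check: there should be one document per essay
## file in the "federalist" folder (85 Federalist Papers in total)
length(corpus.raw)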
####### GET THE DATA IN SHAPE
# make lower case
corpus.prep <- tm_map(corpus.raw, content_transformer(tolower))
# remove white space
corpus.prep <- tm_map(corpus.prep, stripWhitespace)
# remove punctuation
corpus.prep <- tm_map(corpus.prep, removePunctuation)
# remove numbers
corpus.prep <- tm_map(corpus.prep, removeNumbers)
# we will want to remove the most commonly used words such as 'a' and 'the'
# let's get a list of those words, in English...
head(stopwords("english"))
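## (optional) how long is the full English stopword list?
length(stopwords("english"))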
# and now let's remove those 'stopwords'
corpus.prep <- tm_map(corpus.prep, removeWords, stopwords("english"))
# finally, stem the remaining words
corpus.stemmed <- tm_map(corpus.prep, stemDocument)
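## (optional) to see what stemming does to individual words, you can call
## SnowballC's wordStem() directly on a character vector, e.g.:
wordStem(c("revenues", "commercial", "peaceful"))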
############################################################
## the 'content()' function prints the actual text of a doc
## we can select specific essays using double-square brackets...
## for example, to extract essay 10:
content(corpus.stemmed[[10]])
# compare this preprocessed doc with the corresponding original text:
content(corpus.raw[[10]])
####### STEP 2 ----- word frequencies!!!
# DOCUMENT-TERM MATRIX
# how often are words (word-stems) used across all the docs
dtm <- DocumentTermMatrix(corpus.stemmed)
inspect(dtm[1:5, 1:8])
dtm.mat <- as.matrix(dtm)
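## (optional) dimensions of the matrix: one row per essay, one column per word stem
dim(dtm.mat)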
####### STEP 3 ----- visualizing the high-frequency words
library(wordcloud)
wordcloud(colnames(dtm.mat), dtm.mat[12, ], max.words = 20) #essay no. 12
wordcloud(colnames(dtm.mat), dtm.mat[24, ], max.words = 20) #essay no. 24
# If we forget what a word stem refers to, we can find out
stemCompletion(c("revenu", "commerc", "peac", "army"), corpus.prep)
## Here's a way of figuring out how important a word is, in a particular doc
dtm.tfidf <- weightTfIdf(dtm) # tf-idf calculation (an importance measure)
dtm.tfidf.mat <- as.matrix(dtm.tfidf) #convert to matrix
## 10 most important words for paper no. 12
head(sort(dtm.tfidf.mat[12, ], decreasing = TRUE), n = 10)
## 10 most important words for paper no. 24
head(sort(dtm.tfidf.mat[24, ], decreasing = TRUE), n = 10)
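## (rough sketch, not part of the original exercise) tf-idf weights a term's
## frequency in a document by how rare that term is across documents; assuming
## tm's defaults (normalized term frequency, log2 inverse document frequency),
## a hand-rolled version would look roughly like this:
# tf <- dtm.mat / rowSums(dtm.mat)
# idf <- log2(nrow(dtm.mat) / colSums(dtm.mat > 0))
# tfidf.manual <- sweep(tf, 2, idf, `*`)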
######################## STEP 4: Authorship Prediction
## Authorship prediction
## authorship of some Federalist Papers is unknown
## We use the 66 essays attributed to either Hamilton or Madison to
## predict the authorship of the 11 disputed papers.
## Since each paper deals with a different topic, we focus on usage of articles,
## prepositions, and conjunctions. We analyze the frequency of the following
## 10 words: although, always, commonly, consequently, considerable, enough, there, upon, while,
## and whilst.
## We selected these words based upon an academic paper that inspired this exercise.
## As a result, we must use the unstemmed corpus, corpus.prep.
## We first compute the term frequency (per 1000 words) separately for each term
## and document and then subset the resulting term-frequency matrix to
## contain only these words.
dtm1 <- as.matrix(DocumentTermMatrix(corpus.prep))
tfrequency <- dtm1 / rowSums(dtm1) # term frequency
tfm <- tfrequency * 1000 # term frequency times 1000 (i.e., per 1000 words)
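## (optional check) before subsetting, every row of tfm should sum to 1000,
## since the entries are term frequencies per 1000 words
summary(rowSums(tfm))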
## words of interest
words <- c("although", "always", "commonly", "consequently",
"considerable", "enough", "there", "upon", "while",
"whilst")
## select only these words
#tfm <- tfm[, colnames(tfm) == words]
tfm <- tfm[, which(is.element(colnames(tfm), words))]
## We can then calculate the average term frequency separately for
## Hamilton and Madison across each author's entire body of documents.
hamilton <- c(1, 6:9, 11:13, 15:17, 21:36, 59:61, 65:85)
madison <- c(10, 14, 37:48, 58)
## average among Hamilton/Madison essays
tfm.ave <- rbind(colSums(tfm[hamilton, ]) / length(hamilton),
                 colSums(tfm[madison, ]) / length(madison))
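## (optional) label the rows so the printed comparison is easier to read
rownames(tfm.ave) <- c("Hamilton", "Madison")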
tfm.ave
########
######################## STEP 4b: Running regressions -- Authorship Prediction
## We're going to be running regressions...
## If a predicted value is positive, we're going to say it's a prediction for hamilton authorship.
## If a predicted value is negative, we're going to say it's a prediction for madison authorship.
author <- rep(NA, nrow(dtm1)) # a vector of missing values, one per essay
author[hamilton] <- 1 # 1 if Hamilton
author[madison] <- -1 # -1 if Madison
## data frame for regression
author.data <- data.frame(author = author[c(hamilton, madison)],
                          tfm[c(hamilton, madison), ])
head(author.data)
## To predict authorship, we use the term frequencies of words selected based
## on our preliminary analysis ('upon', 'there', 'consequently', and 'whilst'
## were flagged as especially informative). The data frame object we created
## above contains the term frequencies of the selected words; the regression
## below uses eight of them. We estimate the coefficients using the 66 essays
## with known authorship.
hm.fit <- lm(author ~ enough + upon + although + whilst + always + commonly +
               consequently + considerable, data = author.data)
hm.fit
confint(hm.fit)
hm.fitted <- fitted(hm.fit) # fitted values
sd(hm.fitted) # standard deviation of the fitted values
cat("\nHamilton misclassified: ", hm.fitted[author.data$author == 1] < 0, "\n")
cat("\nMadison misclassified: ", hm.fitted[author.data$author == -1] > 0, "\n")
############# PART 4c: TA-DA! The disputed essays!
disputed <- c(49, 50:57, 62, 63) # 11 essays with disputed authorship
## remember -- we have the term frequencies across ALL texts (including these)
## let's grab the tfm data in these disputed texts, & put it all in a dataframe
tf.disputed <- as.data.frame(tfm[disputed,])
# now we can use this data with the regression model we fit above to obtain
# predicted values, i.e., to predict the authors of the disputed texts.
pred <- predict(hm.fit, newdata = tf.disputed)
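## following the rule above, positive predicted values point to Hamilton,
## negative predicted values to Madison
pred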
## proportion of correctly classified essays by Hamilton
mean(hm.fitted[author.data$author == 1] > 0)
## proportion of correctly classified essays by Madison
mean(hm.fitted[author.data$author == -1] < 0)
## leave-one-out cross-validation: refit the model repeatedly, each time
## leaving out one essay and predicting its authorship from the rest
n <- nrow(author.data)
hm.classify <- rep(NA, n) # a container vector with missing values
for (i in 1:n) {
  ## fit the model to the data after removing the ith observation
  sub.fit <- lm(author ~ ., data = author.data[-i, ]) # exclude ith row
  ## predict the authorship for the ith observation
  hm.classify[i] <- predict(sub.fit, newdata = author.data[i, ])
}
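## (optional) leave-one-out classification accuracy, using the same sign rule
## (positive -> Hamilton, negative -> Madison)
mean((hm.classify > 0) == (author.data$author > 0))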
## fitted values for essays authored by Hamilton; red squares
plot(hamilton, hm.fitted[author.data$author == 1], pch = 15,
     xlim = c(1, 85), ylim = c(-2, 2), col = "red",
     xlab = "Federalist Papers", ylab = "Predicted values")
abline(h = 0, lty = "dashed")
## essays authored by Madison; blue circles
points(madison, hm.fitted[author.data$author == -1],
       pch = 16, col = "blue")
## disputed authorship; black triangles
points(disputed, pred, pch = 17)