### This exercise requires installing a bunch of packages.
### Unfortunately, the precise sequence and rules for installing may vary
### depending upon your computer and configuration.
## ***Taken from Chapter 5 in Kosuke Imai's "Quantitative Social Science"
## Transcribed by Alexis Diamond, all errors my own...
##########################################################################
## First install "tm" and "SnowballC", and load the libraries
install.packages("tm") ### this will prompt you for "Yes, no, cancel"
### try "Yes". If that fails, then retry with "no".
library(tm)
install.packages("SnowballC")
library(SnowballC)
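## The wordcloud package is also used below (Step 3). If it is not already
## installed on your machine (it may well be -- this is just a precaution),
## install it the same way:
# install.packages("wordcloud")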
# once you have the above installed, you can proceed to the steps below:
#install.packages("devtools") # if you have not installed devtools already
#devtools::install_github("kosukeimai/qss-package", build_vignettes = TRUE)
#install.packages("qss")
#library(qss)
###
####### STEP 1
# load the text (a collection of texts is called a corpus)
# federalist_dir <- system.file("extdata", "federalist", package = "qss")
# see all the files
#dir(federalist_dir)
#corpus.raw <- Corpus(DirSource(federalist_dir))
corpus.raw <- Corpus(DirSource(directory = "federalist", pattern = "fp"))
# the corpus contains one document per text file in the directory
# to see the text, use "content()" and specify which doc (e.g., the 1st one)
content(corpus.raw[[1]])
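## A quick sanity check (optional, not part of the original exercise): the
## corpus should contain one document per file read from the "federalist"
## directory, and meta() shows a document's metadata (e.g., its file name).
length(corpus.raw)    # number of documents loaded
meta(corpus.raw[[1]]) # metadata for the first document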
####### GET THE DATA IN SHAPE
# make lower case
corpus.prep <- tm_map(corpus.raw, content_transformer(tolower))
# collapse extra white space
corpus.prep <- tm_map(corpus.prep, stripWhitespace)
# remove punctuation
corpus.prep <- tm_map(corpus.prep, removePunctuation)
# remove numbers
corpus.prep <- tm_map(corpus.prep, removeNumbers)
# we will want to remove the most commonly used words such as 'a' and 'the'
# let's get a list of those words, in English...
head(stopwords("english"))
# and now let's remove those 'stopwords'
corpus.prep <- tm_map(corpus.prep, removeWords, stopwords("english"))
# finally, stem the remaining words
corpus.stemmed <- tm_map(corpus.prep, stemDocument)
############################################################
## the 'content()' function prints the actual text of a doc
## we can select specific essays using double-square brackets...
## for example, to extract essay 10:
content(corpus.stemmed[[10]])
# compare this preprocessed doc with the original text of the same essay:
content(corpus.raw[[10]])
####### STEP 2 ----- word frequencies!!!
# DOCUMENT-TERM MATRIX
# how often are words (word stems) used across all the docs?
dtm <- DocumentTermMatrix(corpus.stemmed)
inspect(dtm[1:5, 1:8])
dtm.mat <- as.matrix(dtm)
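## An optional overall look (not part of the original exercise): column sums
## of the matrix give total counts per word stem across all essays, and tm's
## findFreqTerms() lists stems above a frequency threshold (200 here is an
## arbitrary cutoff chosen just for illustration).
head(sort(colSums(dtm.mat), decreasing = TRUE), n = 10) # 10 most common stems
findFreqTerms(dtm, lowfreq = 200)                       # stems appearing at least 200 times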
####### STEP 3 ----- visualizing the high-frequency words
library(wordcloud)
wordcloud(colnames(dtm.mat), dtm.mat[12, ], max.words = 20) # essay no. 12
wordcloud(colnames(dtm.mat), dtm.mat[24, ], max.words = 20) # essay no. 24
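## Note: wordcloud() places words at random, so the layout changes from run
## to run. Setting a seed beforehand (1234 is an arbitrary choice) makes the
## picture reproducible, e.g.:
set.seed(1234)
wordcloud(colnames(dtm.mat), dtm.mat[12, ], max.words = 20) # essay no. 12, fixed layout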
# If we forget what a word stem refers to, then we can find out
stemCompletion(c("revenu", "commerc",
                 "peac", "army"), corpus.prep)
## Here's a way of figuring out how important a word is in a particular doc
dtm.tfidf <- weightTfIdf(dtm) # tf-idf calculation (an importance measure)
dtm.tfidf.mat <- as.matrix(dtm.tfidf) # convert to matrix
## 10 most important words for paper no. 12
head(sort(dtm.tfidf.mat[12, ], decreasing = TRUE), n = 10)
## 10 most important words for paper no. 24
head(sort(dtm.tfidf.mat[24, ], decreasing = TRUE), n = 10)
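## Another optional exploration: tm's findAssocs() reports terms whose
## per-document counts are highly correlated with a given term. The term
## "commerc" and the 0.85 correlation cutoff are arbitrary choices here.
findAssocs(dtm, "commerc", corlimit = 0.85)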
######################## STEP 4: Authorship Prediction
## authorship of some Federalist Papers is unknown
## We use the 66 essays attributed to either Hamilton or Madison to
## predict the authorship of the 11 disputed papers.
## Since each paper deals with a different topic, we focus on usage of articles,
## prepositions, and conjunctions. We analyze the frequency of the following
## 10 words: although, always, commonly, consequently, considerable, enough,
## there, upon, while, and whilst.
## We selected these words based upon an academic paper that inspired this exercise.
## As a result, we must use the unstemmed corpus, corpus.prep.
## (Note: English stopwords were removed from corpus.prep above, so any of
## these words that are also stopwords -- e.g., 'there' and 'while' -- will
## have dropped out; the analysis below uses the words that remain.)
## We first compute the term frequency (per 1000 words) separately for each term
## and document and then subset the resulting term-frequency matrix to
## contain only these words.
dtm1 <- as.matrix(DocumentTermMatrix(corpus.prep))
tfrequency <- dtm1 / rowSums(dtm1) # term frequency
tfm <- tfrequency * 1000 # term frequency times 1000 (i.e., per 1000 words)
## words of interest
words <- c("although", "always", "commonly", "consequently",
           "considerable", "enough", "there", "upon", "while",
           "whilst")
## select only these words
#tfm <- tfm[, colnames(tfm) == words] # (incorrect: '==' recycles 'words' element-wise)
tfm <- tfm[, which(is.element(colnames(tfm), words))]
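## A quick check (optional): because stopwords were removed earlier, some of
## the 10 words of interest may no longer appear as columns; setdiff() lists
## any that are missing, and dim() shows how many essays and words remain.
setdiff(words, colnames(tfm)) # words of interest that did not survive preprocessing
dim(tfm)                      # essays x surviving words of interest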
## We can then calculate the average term frequency separately for
## Hamilton and Madison across each author's entire body of documents.
hamilton <- c(1, 6:9, 11:13, 15:17, 21:36, 59:61, 65:85)
madison <- c(10, 14, 37:48, 58)
## average among Hamilton/Madison essays
tfm.ave <- rbind(colSums(tfm[hamilton, ]) / length(hamilton),
                 colSums(tfm[madison, ]) / length(madison))
tfm.ave
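## An optional visual comparison of the two authors' average rates (per 1000
## words). The grouped barplot and its labels are just one way to display
## tfm.ave, not part of the original exercise.
barplot(tfm.ave, beside = TRUE, las = 2,
        legend.text = c("Hamilton", "Madison"),
        ylab = "Average frequency per 1000 words")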
########
######################## STEP 4b: Running regressions -- Authorship Prediction
## We're going to be running regressions...
## If a predicted value is positive, we're going to say it's a prediction for Hamilton authorship.
## If a predicted value is negative, we're going to say it's a prediction for Madison authorship.
author <- rep(NA, nrow(dtm1)) # a vector of missing values (one per essay)
author[hamilton] <- 1 # 1 if Hamilton
author[madison] <- -1 # -1 if Madison
## data frame for regression
author.data <- data.frame(author = author[c(hamilton, madison)],
                          tfm[c(hamilton, madison), ])
head(author.data)
## To predict the authorship, we regress the author variable on the term
## frequencies of the selected words that survived the preprocessing above
## (the data frame created above contains all of them). We estimate the
## coefficients using the 66 essays with known authorship.
hm.fit <- lm(author ~ enough + upon + although + whilst + always + commonly
             + consequently + considerable, data = author.data)
hm.fit
confint(hm.fit)
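## An optional closer look at the fit: summary() adds standard errors,
## t-statistics, and the R-squared to the point estimates above.
summary(hm.fit)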
hm.fitted <- fitted(hm.fit) # fitted values
sd(hm.fitted)
cat("\nHamilton misclassified: ", hm.fitted[author.data$author == 1] < 0, "\n")
cat("\nMadison misclassified: ", hm.fitted[author.data$author == -1] > 0, "\n")
############# STEP 4c: TA-DA! The disputed essays!
disputed <- c(49, 50:57, 62, 63) # 11 essays with disputed authorship
## remember -- we have the term frequencies across ALL texts (including these)
## let's grab the tfm data for these disputed texts, & put it all in a data frame
tf.disputed <- as.data.frame(tfm[disputed, ])
# now we can feed this data to the fitted regression above to obtain
# predicted values, and thereby predict the authors of the disputed texts.
pred <- predict(hm.fit, newdata = tf.disputed)
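## An optional way to read off the implied classifications: pair each
## disputed essay number with the sign of its predicted value (positive =
## Hamilton, negative = Madison, following the coding above).
data.frame(essay = disputed,
           predicted.author = ifelse(pred > 0, "Hamilton", "Madison"))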
## proportion of Hamilton's essays correctly classified (in sample)
mean(hm.fitted[author.data$author == 1] > 0)
## proportion of Madison's essays correctly classified (in sample)
mean(hm.fitted[author.data$author == -1] < 0)
## leave-one-out cross-validation: refit the model n times, each time holding
## out one essay and predicting its authorship from the remaining essays
## (note: 'author ~ .' regresses on every word-frequency column in author.data)
n <- nrow(author.data)
hm.classify <- rep(NA, n) # a container vector with missing values
for (i in 1:n) {
  ## fit the model to the data after removing the ith observation
  sub.fit <- lm(author ~ ., data = author.data[-i, ]) # exclude ith row
  ## predict the authorship for the ith observation
  hm.classify[i] <- predict(sub.fit, newdata = author.data[i, ])
}
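## An optional summary of the cross-validated predictions, mirroring the
## in-sample proportions above: the share of each author's essays that the
## held-out fits classify correctly.
mean(hm.classify[author.data$author == 1] > 0)  # Hamilton essays
mean(hm.classify[author.data$author == -1] < 0) # Madison essays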
## fitted values for essays authored by Hamilton; red squares
plot(hamilton, hm.fitted[author.data$author == 1], pch = 15,
     xlim = c(1, 85), ylim = c(-2, 2), col = "red",
     xlab = "Federalist Papers", ylab = "Predicted values")
abline(h = 0, lty = "dashed")
## essays authored by Madison; blue circles
points(madison, hm.fitted[author.data$author == -1],
       pch = 16, col = "blue")
## disputed authorship; black triangles
points(disputed, pred, pch = 17)
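## An optional legend to make the plot self-explanatory ("topright" is an
## arbitrary placement choice).
legend("topright", legend = c("Hamilton", "Madison", "Disputed"),
       pch = c(15, 16, 17), col = c("red", "blue", "black"), bty = "n")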