### This exercise requires installing a bunch of packages.
### Unfortunately, the precise sequence and rules for installing may vary
### depending upon your computer and configuration.
## ***Taken from Chapter 5 in Kosuke Imai's "Quantitative Social Science"
## Transcribed by Alexis Diamond, all errors my own...
##########################################################################
## First install "tm" and "SnowballC", and load the libraries
install.packages("tm") ### this will prompt you for "Yes, no, cancel"
### try "Yes". If that fails, then retry with "no".
library(tm)
install.packages("SnowballC")
library(SnowballC)
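## The wordcloud package is also used below (Step 3). If it is not already
## installed on your machine (it may well be -- this is just a precaution),
## install it the same way:
# install.packages("wordcloud")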
# once you have the above installed, you can proceed to the steps below:
#install.packages("devtools") # if you have not installed devtools already
#devtools::install_github("kosukeimai/qss-package", build_vignettes = TRUE)
#install.packages("qss")
#library(qss)
###
####### STEP 1
# load the text (a collection of texts is called a corpus)
# federalist_dir <- system.file("extdata", "federalist", package = "qss")
# see all the files
#dir(federalist_dir)
#corpus.raw <- Corpus(DirSource(federalist_dir))
corpus.raw <- Corpus(DirSource(directory = "federalist", pattern = "fp"))
# the corpus contains one document per text file in the directory
# to see the text, use "content()" and specify which doc (e.g., the 1st one)
content(corpus.raw[[1]])
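## A quick sanity check (optional, not part of the original exercise): the
## corpus should contain one document per file read from the "federalist"
## directory, and meta() shows a document's metadata (e.g., its file name).
length(corpus.raw)    # number of documents loaded
meta(corpus.raw[[1]]) # metadata for the first document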
####### GET THE DATA IN SHAPE
# make lower case
corpus.prep <- tm_map(corpus.raw, content_transformer(tolower))
# collapse extra white space
corpus.prep <- tm_map(corpus.prep, stripWhitespace)
# remove punctuation
corpus.prep <- tm_map(corpus.prep, removePunctuation)
# remove numbers
corpus.prep <- tm_map(corpus.prep, removeNumbers)
# we will want to remove the most commonly used words such as 'a' and 'the'
# let's get a list of those words, in English...
head(stopwords("english"))
# and now let's remove those 'stopwords'
corpus.prep <- tm_map(corpus.prep, removeWords, stopwords("english"))
# finally, stem the remaining words
corpus.stemmed <- tm_map(corpus.prep, stemDocument)
############################################################
## the 'content()' function prints the actual text of a doc
## we can select specific essays using double-square brackets...
## for example, to extract essay 10:
content(corpus.stemmed[[10]])
# compare this preprocessed doc with the original text of the same essay:
content(corpus.raw[[10]])
####### STEP 2 ----- word frequencies!!!
# DOCUMENT-TERM MATRIX
# how often are words (word stems) used across all the docs?
dtm <- DocumentTermMatrix(corpus.stemmed)
inspect(dtm[1:5, 1:8])
dtm.mat <- as.matrix(dtm)
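## An optional overall look (not part of the original exercise): column sums
## of the matrix give total counts per word stem across all essays, and tm's
## findFreqTerms() lists stems above a frequency threshold (200 here is an
## arbitrary cutoff chosen just for illustration).
head(sort(colSums(dtm.mat), decreasing = TRUE), n = 10) # 10 most common stems
findFreqTerms(dtm, lowfreq = 200)                       # stems appearing at least 200 times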
####### STEP 3 ----- visualizing the high-frequency words
library(wordcloud)
wordcloud(colnames(dtm.mat), dtm.mat[12, ], max.words = 20) # essay no. 12
wordcloud(colnames(dtm.mat), dtm.mat[24, ], max.words = 20) # essay no. 24
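## Note: wordcloud() places words at random, so the layout changes from run
## to run. Setting a seed beforehand (1234 is an arbitrary choice) makes the
## picture reproducible, e.g.:
set.seed(1234)
wordcloud(colnames(dtm.mat), dtm.mat[12, ], max.words = 20) # essay no. 12, fixed layout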
# If we forget what a word stem refers to, then we can find out
stemCompletion(c("revenu", "commerc",
                 "peac", "army"), corpus.prep)
## Here's a way of figuring out how important a word is in a particular doc
dtm.tfidf <- weightTfIdf(dtm) # tf-idf calculation (an importance measure)
dtm.tfidf.mat <- as.matrix(dtm.tfidf) # convert to matrix
## 10 most important words for paper no. 12
head(sort(dtm.tfidf.mat[12, ], decreasing = TRUE), n = 10)
## 10 most important words for paper no. 24
head(sort(dtm.tfidf.mat[24, ], decreasing = TRUE), n = 10)
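## Another optional exploration: tm's findAssocs() reports terms whose
## per-document counts are highly correlated with a given term. The term
## "commerc" and the 0.85 correlation cutoff are arbitrary choices here.
findAssocs(dtm, "commerc", corlimit = 0.85)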
######################## STEP 4: Authorship Prediction
## authorship of some Federalist Papers is unknown
## We use the 66 essays attributed to either Hamilton or Madison to
## predict the authorship of the 11 disputed papers.
## Since each paper deals with a different topic, we focus on usage of articles,
## prepositions, and conjunctions. We analyze the frequency of the following
## 10 words: although, always, commonly, consequently, considerable, enough,
## there, upon, while, and whilst.
## We selected these words based upon an academic paper that inspired this exercise.
## As a result, we must use the unstemmed corpus, corpus.prep.
## (Note: English stopwords were removed from corpus.prep above, so any of
## these words that are also stopwords -- e.g., 'there' and 'while' -- will
## have dropped out; the analysis below uses the words that remain.)
## We first compute the term frequency (per 1000 words) separately for each term
## and document and then subset the resulting term-frequency matrix to
## contain only these words.
dtm1 <- as.matrix(DocumentTermMatrix(corpus.prep))
tfrequency <- dtm1 / rowSums(dtm1) # term frequency
tfm <- tfrequency * 1000 # term frequency times 1000 (i.e., per 1000 words)
## words of interest
words <- c("although", "always", "commonly", "consequently",
           "considerable", "enough", "there", "upon", "while",
           "whilst")
## select only these words
#tfm <- tfm[, colnames(tfm) == words] # (incorrect: '==' recycles 'words' element-wise)
tfm <- tfm[, which(is.element(colnames(tfm), words))]
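## A quick check (optional): because stopwords were removed earlier, some of
## the 10 words of interest may no longer appear as columns; setdiff() lists
## any that are missing, and dim() shows how many essays and words remain.
setdiff(words, colnames(tfm)) # words of interest that did not survive preprocessing
dim(tfm)                      # essays x surviving words of interest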
## We can then calculate the average term frequency separately for
## Hamilton and Madison across each author's entire body of documents.
hamilton <- c(1, 6:9, 11:13, 15:17, 21:36, 59:61, 65:85)
madison <- c(10, 14, 37:48, 58)
## average among Hamilton/Madison essays
tfm.ave <- rbind(colSums(tfm[hamilton, ]) / length(hamilton),
                 colSums(tfm[madison, ]) / length(madison))
tfm.ave
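## An optional visual comparison of the two authors' average rates (per 1000
## words). The grouped barplot and its labels are just one way to display
## tfm.ave, not part of the original exercise.
barplot(tfm.ave, beside = TRUE, las = 2,
        legend.text = c("Hamilton", "Madison"),
        ylab = "Average frequency per 1000 words")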
########
######################## STEP 4b: Running regressions -- Authorship Prediction
## We're going to be running regressions...
## If a predicted value is positive, we're going to say it's a prediction for Hamilton authorship.
## If a predicted value is negative, we're going to say it's a prediction for Madison authorship.
author <- rep(NA, nrow(dtm1)) # a vector of missing values (one per essay)
author[hamilton] <- 1 # 1 if Hamilton
author[madison] <- -1 # -1 if Madison
## data frame for regression
author.data <- data.frame(author = author[c(hamilton, madison)],
                          tfm[c(hamilton, madison), ])
head(author.data)
## To predict the authorship, we regress the author variable on the term
## frequencies of the selected words that survived the preprocessing above
## (the data frame created above contains all of them). We estimate the
## coefficients using the 66 essays with known authorship.
hm.fit <- lm(author ~ enough + upon + although + whilst + always + commonly
             + consequently + considerable, data = author.data)
hm.fit
confint(hm.fit)
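## An optional closer look at the fit: summary() adds standard errors,
## t-statistics, and the R-squared to the point estimates above.
summary(hm.fit)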
hm.fitted <- fitted(hm.fit) # fitted values
sd(hm.fitted)
cat("\nHamilton misclassified: ", hm.fitted[author.data$author == 1] < 0, "\n")
cat("\nMadison misclassified: ", hm.fitted[author.data$author == -1] > 0, "\n")
############# STEP 4c: TA-DA! The disputed essays!
disputed <- c(49, 50:57, 62, 63) # 11 essays with disputed authorship
## remember -- we have the term frequencies across ALL texts (including these)
## let's grab the tfm data for these disputed texts, & put it all in a data frame
tf.disputed <- as.data.frame(tfm[disputed, ])
# now we can feed this data to the fitted regression above to obtain
# predicted values, and thereby predict the authors of the disputed texts.
pred <- predict(hm.fit, newdata = tf.disputed)
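## An optional way to read off the implied classifications: pair each
## disputed essay number with the sign of its predicted value (positive =
## Hamilton, negative = Madison, following the coding above).
data.frame(essay = disputed,
           predicted.author = ifelse(pred > 0, "Hamilton", "Madison"))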
## proportion of Hamilton's essays correctly classified (in sample)
mean(hm.fitted[author.data$author == 1] > 0)
## proportion of Madison's essays correctly classified (in sample)
mean(hm.fitted[author.data$author == -1] < 0)
## leave-one-out cross-validation: refit the model n times, each time holding
## out one essay and predicting its authorship from the remaining essays
## (note: 'author ~ .' regresses on every word-frequency column in author.data)
n <- nrow(author.data)
hm.classify <- rep(NA, n) # a container vector with missing values
for (i in 1:n) {
  ## fit the model to the data after removing the ith observation
  sub.fit <- lm(author ~ ., data = author.data[-i, ]) # exclude ith row
  ## predict the authorship for the ith observation
  hm.classify[i] <- predict(sub.fit, newdata = author.data[i, ])
}
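## An optional summary of the cross-validated predictions, mirroring the
## in-sample proportions above: the share of each author's essays that the
## held-out fits classify correctly.
mean(hm.classify[author.data$author == 1] > 0)  # Hamilton essays
mean(hm.classify[author.data$author == -1] < 0) # Madison essays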
## fitted values for essays authored by Hamilton; red squares
plot(hamilton, hm.fitted[author.data$author == 1], pch = 15,
     xlim = c(1, 85), ylim = c(-2, 2), col = "red",
     xlab = "Federalist Papers", ylab = "Predicted values")
abline(h = 0, lty = "dashed")
## essays authored by Madison; blue circles
points(madison, hm.fitted[author.data$author == -1],
       pch = 16, col = "blue")
## disputed authorship; black triangles
points(disputed, pred, pch = 17)
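## An optional legend to make the plot self-explanatory ("topright" is an
## arbitrary placement choice).
legend("topright", legend = c("Hamilton", "Madison", "Disputed"),
       pch = c(15, 16, 17), col = c("red", "blue", "black"), bty = "n")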