diamonaj · January 30, 2023 20:08
diff --git a/step3.R b/step3.R
 ## Authorship prediction
 ## authorship of some Federalist Papers is unknown
 ## We use the 66 essays attributed to either Hamilton or Madison to 
 ## predict the authorship of the 11 disputed papers.

 ## Since each paper deals with a different topic, we focus on usage of articles, 
 ## prepositions, and conjunctions. We analyze the frequency of the following
 ## 10 words: although, always, commonly, consequently, considerable, enough, there, upon, while, 
 ## and whilst.

 ## We selected these words based upon an academic paper that inspired this exercise.

 ## As a result, we must use the unstemmed corpus, corpus.prep.

 ## We first compute the term frequency (per 1000 words) separately for each term
 ## and document and then subset the resulting term-frequency matrix to
 ## contain only these words.

 ## Document-term matrix as a plain matrix: one row per document (essay),
 ## one column per term.
 dtm1 <- as.matrix(DocumentTermMatrix(corpus.prep))
 ## Divide each row by its total so entries become within-document shares;
 ## rowSums(dtm1) has one entry per row, so the division is applied row by row.
 tfrequency <- dtm1 / rowSums(dtm1) # term frequency
 tfm <- tfrequency * 1000 # term frequency times 1000 (i.e., per 1000 words)

 ## words of interest
 words <- c("although", "always", "commonly", "consequently",
            "considerable", "enough", "there", "upon", "while",
            "whilst")

 ## Select only these words. Membership is tested with %in% because
 ## `colnames(tfm) == words` would compare position by position (recycling
 ## `words`) instead of testing set membership. drop = FALSE keeps the result
 ## a matrix even if only one of the words occurs in the corpus.
 tfm <- tfm[, colnames(tfm) %in% words, drop = FALSE]

 ## We can then calculate the average term frequency separately for
 ## Hamilton and Madison across each author's entire body of documents.
 ## The vectors below index rows of tfm by position, so they assume the rows
 ## are in essay order (1 to 85) -- confirm against how corpus.prep was built.
 hamilton <- c(1, 6:9, 11:13, 15:17, 21:36, 59:61, 65:85)
 madison <- c(10, 14, 37:48, 58)

 ## Average among Hamilton/Madison essays: row 1 is Hamilton, row 2 is Madison.
 ## drop = FALSE keeps each subset a matrix so colMeans() still works when
 ## tfm has a single column.
 tfm.ave <- rbind(colMeans(tfm[hamilton, , drop = FALSE]),
                  colMeans(tfm[madison, , drop = FALSE]))

 tfm.ave
	## Authorship prediction
	## authorship of some Federalist Papers is unknown
	## We use the 66 essays attributed to either Hamilton or Madison to
	## predict the authorship of the 11 disputed papers.

	## Since each paper deals with a different topic, we focus on usage of articles,
	## prepositions, and conjunctions. We analyze the frequency of the following
	## 10 words: although, always, commonly, consequently, considerable, enough, there, upon, while,
	## and whilst.

	## We selected these words based upon an academic paper that inspired this exercise.

	## As a result, we must use the unstemmed corpus, corpus.prep.

	## We first compute the term frequency (per 1000 words) separately for each term
	## and document and then subset the resulting term-frequency matrix to
	## contain only these words.

	## Document-term matrix as a plain matrix: one row per document (essay),
	## one column per term.
	dtm1 <- as.matrix(DocumentTermMatrix(corpus.prep))
	## Divide each row by its total so entries become within-document shares;
	## rowSums(dtm1) has one entry per row, so the division is applied row by row.
	tfrequency <- dtm1 / rowSums(dtm1) # term frequency
	tfm <- tfrequency * 1000 # term frequency times 1000 (i.e., per 1000 words)

	## words of interest
	words <- c("although", "always", "commonly", "consequently",
	           "considerable", "enough", "there", "upon", "while",
	           "whilst")

	## Select only these words. Membership is tested with %in% because
	## `colnames(tfm) == words` would compare position by position (recycling
	## `words`) instead of testing set membership. drop = FALSE keeps the result
	## a matrix even if only one of the words occurs in the corpus.
	tfm <- tfm[, colnames(tfm) %in% words, drop = FALSE]

	## We can then calculate the average term frequency separately for
	## Hamilton and Madison across each author's entire body of documents.
	## The vectors below index rows of tfm by position, so they assume the rows
	## are in essay order (1 to 85) -- confirm against how corpus.prep was built.
	hamilton <- c(1, 6:9, 11:13, 15:17, 21:36, 59:61, 65:85)
	madison <- c(10, 14, 37:48, 58)

	## Average among Hamilton/Madison essays: row 1 is Hamilton, row 2 is Madison.
	## drop = FALSE keeps each subset a matrix so colMeans() still works when
	## tfm has a single column.
	tfm.ave <- rbind(colMeans(tfm[hamilton, , drop = FALSE]),
	                 colMeans(tfm[madison, , drop = FALSE]))

	tfm.ave
No results found