Skip to content

Instantly share code, notes, and snippets.

## -----------------------------------------------------------------------------------------------------------
# Load the maps package and its us.cities data set, preview the data,
# draw the "usa" map, and keep only the rows flagged as state capitals.
library(maps)
data(us.cities)
head(us.cities)
map("usa")
# capital == 2 marks a state capital; which() drops any NA comparisons,
# matching what subset() does.
capitals <- us.cities[which(us.cities$capital == 2), , drop = FALSE]
# Read progresa.csv and show its dimensions before and after removing
# every row that contains a missing value.
dd <- read.csv(file = "progresa.csv")
dim(dd)
dd <- stats::na.omit(dd)
dim(dd)
### QUESTION 1
# An example of what I'm looking for
# Lay out subsequent plots in a grid of 2 rows and 1 column.
par(mfrow = c(2, 1))
# Install "tree" only when it is not already available, so re-running the
# script does not download and reinstall the package every time.
if (!requireNamespace("tree", quietly = TRUE)) {
  install.packages("tree")
}
library(MASS)
library(tree)
# Preview the Pima.tr data.
head(Pima.tr)
#############
# Fix the random seed so results are reproducible across runs.
set.seed(1234)
# enter your code here
# fit a tree in the training set
### This exercise requires installing a bunch of packages---
### Unfortunately, the precise sequence and rules for installing may vary
### depending upon your computer and configuration.
## ***Taken from Chapter 5 in Kosuke Imai's "Quantitative Social Science"
## Transcribed by Alexis Diamond, all errors my own...
##########################################################################
# assuming you have downloaded the data (Data1.csv) correctly,
# as discussed here: https://piazza.com/class/l7oq25mqbrz1nd/post/110
# you may need to change the file location in quotes below, to suit where your file is
# Read the data, keeping text columns as character vectors (not factors).
# TRUE/FALSE are spelled out because T/F are ordinary variables that can
# be reassigned.
apple <- read.csv(
  "~/Documents/Data1.csv",
  stringsAsFactors = FALSE,
  encoding = "UTF-8"
)
str(apple)
# Install "tm" only when it is missing, to avoid reinstalling on every run.
if (!requireNamespace("tm", quietly = TRUE)) {
  install.packages("tm")
}
# assuming you have downloaded the data (Data1.csv) correctly,
# as discussed here: https://piazza.com/class/l7oq25mqbrz1nd/post/110
# you may need to change the file location in quotes below, to suit where your file is
# Read the data, keeping text columns as character vectors (not factors).
# TRUE/FALSE are spelled out because T/F are ordinary variables that can
# be reassigned.
apple <- read.csv(
  "~/Documents/Data1.csv",
  stringsAsFactors = FALSE,
  encoding = "UTF-8"
)
str(apple)
library(tm)
## We're going to be running regressions...
## If a predicted value is positive, we're going to say it's a prediction for Hamilton authorship.
## If a predicted value is negative, we're going to say it's a prediction for Madison authorship.
# Outcome vector for the regression, one entry per row of dtm1.
# Entries start as NA and stay NA unless indexed by hamilton or madison.
author <- rep(NA, times = nrow(dtm1))
author <- replace(author, hamilton, 1)  # Hamilton is coded as 1
author <- replace(author, madison, -1)  # Madison is coded as -1
## data frame for regression
author.data <- data.frame(author = author[c(hamilton, madison)],
## Authorship prediction
## authorship of some Federalist Papers is unknown
## We use the 66 essays attributed to either Hamilton or Madison to
## predict the authorship of the 11 disputed papers.
## Since each paper deals with a different topic, we focus on usage of articles,
## prepositions, and conjunctions. We analyze the frequency of the following
## 10 words: although, always, commonly, consequently, considerable, enough, there, upon, while,
## and whilst.
# Build the document-term matrix: how often each word stem is used in
# each document of the stemmed corpus.
dtm <- DocumentTermMatrix(corpus.stemmed)
# Look at the first 5 documents and the first 8 words (alphabetical order).
inspect(dtm[seq_len(5), seq_len(8)])
# Convert the document-term matrix into an ordinary matrix.
dtm.mat <- as.matrix(dtm)
####### STEP 3 ----- visualizing the high-frequency words
@diamonaj
diamonaj / step1.R
Last active January 30, 2023 18:43
### Expects a directory named "federalist", containing .txt files, inside
### the R working directory.
# Build the corpus from the files in that directory matching "fp".
corpus.raw <- Corpus(
  DirSource(directory = "federalist", pattern = "fp")
)
# content() shows the text of one document; [[1]] selects the first one.
content(corpus.raw[[1]])
####### GET THE DATA IN SHAPE
# make lower case