Last active
May 2, 2016 17:54
-
-
Save primaryobjects/c4c8df13c887004be5b44e5cc984d749 to your computer and use it in GitHub Desktop.
Predicting presidential election winners by polling data and logistic regression.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| library('mice') | |
| polling <- read.csv('PollingData.csv') | |
| table(polling$Year) | |
| # Note missing values in Rasmussen and SurveyUSA. | |
| summary(polling) | |
| simple <- polling[c('Rasmussen', 'SurveyUSA', 'PropR', 'DiffCount')] | |
| set.seed(144) | |
| # Fill in missing values by simulating similar values. | |
| imputed <- complete(mice(simple)) | |
| polling$Rasmussen <- imputed$Rasmussen | |
| polling$SurveyUSA <- imputed$SurveyUSA | |
| train <- subset(polling, Year == 2004 | Year == 2008) | |
| test <- subset(polling, Year == 2012) | |
| # 0 1 | |
| # 47 53 | |
| # Shows results for 53 republication wins. A baseline prediction can assume a republication always wins or 53%. | |
| table(train$Republican) | |
| # Baseline accuracy: 0.53 | |
| 53 / (47 + 53) | |
| # Smart baseline will pick winner as higher Rasmussen variable. If repub higher, pick repub. | |
| table(sign(train$Rasmussen)) | |
| # -1 0 1 | |
| # 42 3 55 | |
| # So this shows the smart baseline predicts in 55 cases the republican would win, 42 the democrat, and 3 cases inconclusive. | |
| # See how smart baseline compares against actual result of win. | |
| table(train$Republican, sign(train$Rasmussen)) | |
| # -1 0 1 | |
| # 0 42 2 3 | |
| # 1 0 1 52 | |
| # So this shows 42 observations where Rasmussen predicts a democrat would win and the democrat actually did win. | |
| # There were 52 observations where Rasmussen predicts a republication would win and the republication actually did win. | |
| # There were 3 times where the smart baseline predicted a republican would win, but a democrat won. | |
| # Smart baseline accuracy: 0.94 | |
| (52 + 42) / (52 + 42 + 3 + 3) | |
| # Check correlations to find most highly correlated feature with Republican. | |
| # Looking at the Republican row, we see PropR is highly correlated at 0.948, so it should do a good job at predicting Republican. | |
| cor(train[c('Rasmussen', 'SurveyUSA', 'PropR', 'DiffCount', 'Republican')]) | |
| mod1 <- glm(Republican ~ PropR, data=train, family=binomial) | |
| pred1 <- predict(mod1, type='response') | |
| table(train$Republican, pred1 >= 0.5) | |
| # FALSE TRUE | |
| # 0 45 2 | |
| # 1 2 51 | |
| # We see 45 true negatives and 51 true positives. | |
| # Accuracy: 0.96 | |
| (45 + 51) / (45 + 2 + 2 + 51) | |
| # Next, find least correlated variables and try those if they help predict winner. | |
| # Note model is missing statistical significance, only a '.' on DiffCount. | |
| mod2 <- glm(Republican ~ SurveyUSA + DiffCount, data=train, family=binomial) | |
| pred2 <- predict(mod2, type='response') | |
| table(train$Republican, pred2 >= 0.5) | |
| # FALSE TRUE | |
| # 0 45 2 | |
| # 1 1 52 | |
| # We see 45 true negatives and 51 true positives. | |
| # Accuracy: 0.97 | |
| (45 + 52) / (45 + 1 + 2 + 52) | |
| # Try smart baseline on the test set (ie. just predict winner by Rasmussen value). | |
| table(test$Republican, sign(test$Rasmussen)) | |
| # -1 0 1 | |
| # 0 18 2 4 | |
| # 1 0 0 21 | |
| # Accuracy: 0.867 | |
| (18 + 21) / (18 + 2 + 4 + 21) | |
| testpred <- predict(mod2, newdata=test, type='response') | |
| table(test$Republican, testpred >= 0.5) | |
| # FALSE TRUE | |
| # 0 23 1 | |
| # 1 0 21 | |
| # Accuracy: 0.978 | |
| (23 + 21) / (23 + 1 + 21) | |
| # Find record where our 1 case failed. | |
| subset(test, testpred >= 0.5 & Republican == 0) | |
| # State Year Rasmussen SurveyUSA DiffCount PropR Republican | |
| # 24 Florida 2012 2 0 6 0.6666667 0 | |
| # Rasmussen predicted republican. | |
| # SurveyUSA called a tie. | |
| # DiffCount said 6 more for republican. | |
| # Two thirds (0.66) predicted a republican would win. | |
| # But a republican did not win (Republican = 0). | |
| # Thus, reasonable for mistake on Florida record. |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| State | Year | Rasmussen | SurveyUSA | DiffCount | PropR | Republican | |
|---|---|---|---|---|---|---|---|
| Alabama | 2004 | 11 | 18 | 5 | 1 | 1 | |
| Alabama | 2008 | 21 | 25 | 5 | 1 | 1 | |
| Alaska | 2004 | 1 | 1 | 1 | |||
| Alaska | 2008 | 16 | 6 | 1 | 1 | ||
| Arizona | 2004 | 5 | 15 | 8 | 1 | 1 | |
| Arizona | 2008 | 5 | 9 | 1 | 1 | ||
| Arizona | 2012 | 8 | 4 | 0.833333333 | 1 | ||
| Arkansas | 2004 | 7 | 5 | 8 | 1 | 1 | |
| Arkansas | 2008 | 10 | 5 | 1 | 1 | ||
| Arkansas | 2012 | 2 | 1 | 1 | |||
| California | 2004 | -11 | -11 | -8 | 0 | 0 | |
| California | 2008 | -27 | -24 | -5 | 0 | 0 | |
| California | 2012 | -14 | -6 | 0 | 0 | ||
| Colorado | 2004 | 5 | 3 | 9 | 1 | 1 | |
| Colorado | 2008 | -4 | -15 | 0 | 0 | ||
| Colorado | 2012 | 3 | -2 | -5 | 0.307692308 | 0 | |
| Connecticut | 2004 | -3 | 0 | 0 | |||
| Connecticut | 2008 | -17 | -16 | -4 | 0 | 0 | |
| Connecticut | 2012 | -7 | -13 | -8 | 0 | 0 | |
| Delaware | 2004 | -2 | 0 | 0 | |||
| Delaware | 2008 | -15 | -30 | -4 | 0 | 0 | |
| Florida | 2004 | 3 | 1 | 0 | 0.5 | 1 | |
| Florida | 2008 | 1 | -3 | -13 | 0.157894737 | 0 | |
| Florida | 2012 | 2 | 0 | 6 | 0.666666667 | 0 | |
| Georgia | 2004 | 12 | 4 | 1 | 1 | ||
| Georgia | 2008 | 5 | 7 | 9 | 1 | 1 | |
| Georgia | 2012 | 8 | 4 | 1 | 1 | ||
| Hawaii | 2004 | 2 | 0.75 | 0 | |||
| Hawaii | 2008 | -41 | -1 | 0 | 0 | ||
| Hawaii | 2012 | -2 | 0 | 0 | |||
| Idaho | 2004 | 1 | 1 | 1 | |||
| Idaho | 2008 | 39 | 1 | 1 | 1 | ||
| Idaho | 2012 | 1 | 1 | 1 | |||
| Illinois | 2004 | -11 | -12 | -5 | 0 | 0 | |
| Illinois | 2008 | -22 | -5 | 0 | 0 | ||
| Illinois | 2012 | -5 | 0 | 0 | |||
| Indiana | 2004 | 19 | 3 | 1 | 1 | ||
| Indiana | 2008 | 3 | 0 | 2 | 0.625 | 0 | |
| Indiana | 2012 | 9 | 3 | 1 | 1 | ||
| Iowa | 2004 | 2 | -3 | 5 | 0.666666667 | 1 | |
| Iowa | 2008 | -8 | -15 | -8 | 0 | 0 | |
| Iowa | 2012 | 1 | -2 | 0.4 | 0 | ||
| Kansas | 2004 | 23 | 3 | 1 | 1 | ||
| Kansas | 2008 | 13 | 21 | 2 | 1 | 1 | |
| Kansas | 2012 | 9 | 1 | 1 | 1 | ||
| Kentucky | 2004 | 21 | 3 | 1 | 1 | ||
| Kentucky | 2008 | 12 | 16 | 5 | 1 | 1 | |
| Kentucky | 2012 | 14 | 1 | 1 | 1 | ||
| Louisiana | 2004 | 5 | 1 | 1 | |||
| Louisiana | 2008 | 16 | 2 | 1 | 1 | ||
| Louisiana | 2012 | 2 | 1 | 1 | |||
| Maine | 2004 | -8 | -6 | 0 | 0 | ||
| Maine | 2008 | -13 | -15 | -6 | 0 | 0 | |
| Maine | 2012 | -12 | -6 | 0 | 0 | ||
| Maryland | 2004 | -3 | -11 | -6 | 0 | 0 | |
| Maryland | 2008 | -23 | -1 | 0 | 0 | ||
| Maryland | 2012 | -4 | 0 | 0 | |||
| Massachusetts | 2004 | -2 | 0 | 0 | |||
| Massachusetts | 2008 | -28 | -17 | -4 | 0 | 0 | |
| Massachusetts | 2012 | -19 | -8 | 0 | 0 | ||
| Michigan | 2004 | -2 | 0 | 0 | |||
| Michigan | 2008 | -10 | -11 | 0.133333333 | 0 | ||
| Michigan | 2012 | -5 | -10 | 0.083333333 | 0 | ||
| Minnesota | 2004 | -1 | -7 | 0.181818182 | 0 | ||
| Minnesota | 2008 | -12 | -3 | -14 | 0 | 0 | |
| Minnesota | 2012 | -5 | -11 | -5 | 0.142857143 | 0 | |
| Mississippi | 2004 | 1 | 1 | 1 | |||
| Mississippi | 2008 | 8 | 4 | 1 | 1 | ||
| Mississippi | 2012 | 1 | 1 | 1 | |||
| Missouri | 2004 | 5 | 5 | 8 | 1 | 1 | |
| Missouri | 2008 | 0 | 0 | 4 | 0.833333333 | 1 | |
| Missouri | 2012 | 11 | 7 | 8 | 1 | 1 | |
| Montana | 2004 | 3 | 1 | 1 | |||
| Montana | 2008 | 4 | 4 | 0.75 | 1 | ||
| Montana | 2012 | 10 | 5 | 1 | 1 | ||
| Nebraska | 2004 | 2 | 1 | 1 | |||
| Nebraska | 2008 | 19 | 1 | 1 | 1 | ||
| Nebraska | 2012 | 2 | 1 | 1 | |||
| Nevada | 2004 | 2 | 8 | 9 | 1 | 1 | |
| Nevada | 2008 | -4 | -9 | 0.090909091 | 0 | ||
| Nevada | 2012 | -2 | -4 | -10 | 0 | 0 | |
| New Hampshire | 2004 | -2 | -5 | 0.222222222 | 0 | ||
| New Hampshire | 2008 | -7 | -11 | -14 | 0 | 0 | |
| New Hampshire | 2012 | -2 | -8 | 0 | 0 | ||
| New Jersey | 2004 | -12 | -12 | -8 | 0 | 0 | |
| New Jersey | 2008 | -15 | -10 | -9 | 0 | 0 | |
| New Jersey | 2012 | -14 | -9 | 0 | 0 | ||
| New Mexico | 2004 | 4 | 2 | 0.625 | 1 | ||
| New Mexico | 2008 | -10 | -7 | -6 | 0 | 0 | |
| New Mexico | 2012 | -11 | -5 | 0 | 0 | ||
| New York | 2004 | -5 | -18 | -6 | 0 | 0 | |
| New York | 2008 | -20 | -33 | -5 | 0 | 0 | |
| New York | 2012 | -29 | -5 | 0 | 0 | ||
| North Carolina | 2004 | 12 | 8 | 7 | 1 | 1 | |
| North Carolina | 2008 | 1 | 1 | -5 | 0.333333333 | 0 | |
| North Carolina | 2012 | 6 | 5 | 3 | 0.666666667 | 1 | |
| North Dakota | 2004 | 2 | 1 | 1 | |||
| North Dakota | 2008 | 14 | 0 | 0.5 | 1 | ||
| North Dakota | 2012 | 14 | 4 | 1 | 1 | ||
| Ohio | 2004 | 4 | 2 | 3 | 0.6 | 1 | |
| Ohio | 2008 | 0 | -2 | -16 | 0.1 | 0 | |
| Ohio | 2012 | 0 | -5 | -16 | 0 | 0 | |
| Oklahoma | 2004 | 34 | 30 | 4 | 1 | 1 | |
| Oklahoma | 2008 | 31 | 24 | 2 | 1 | 1 | |
| Oklahoma | 2012 | 1 | 1 | 1 | |||
| Oregon | 2004 | -8 | -3 | -8 | 0.1 | 0 | |
| Oregon | 2008 | -12 | -19 | -9 | 0 | 0 | |
| Oregon | 2012 | -7 | -4 | 0 | 0 | ||
| Pennsylvania | 2004 | -2 | -1 | -12 | 0.125 | 0 | |
| Pennsylvania | 2008 | -6 | -9 | -19 | 0 | 0 | |
| Pennsylvania | 2012 | -5 | 0 | -13 | 0 | 0 | |
| Rhode Island | 2004 | -13 | -2 | 0 | 0 | ||
| Rhode Island | 2008 | -19 | -1 | 0 | 0 | ||
| Rhode Island | 2012 | -2 | 0 | 0 | |||
| South Carolina | 2004 | 18 | 4 | 1 | 1 | ||
| South Carolina | 2008 | 11 | 8 | 5 | 1 | 1 | |
| South Carolina | 2012 | 1 | 1 | 1 | |||
| South Dakota | 2004 | 10 | 4 | 1 | 1 | ||
| South Dakota | 2008 | 9 | 4 | 1 | 1 | ||
| South Dakota | 2012 | 1 | 1 | 1 | |||
| Tennessee | 2004 | 6 | 18 | 7 | 1 | 1 | |
| Tennessee | 2008 | 12 | 5 | 1 | 1 | ||
| Tennessee | 2012 | 1 | 1 | 1 | |||
| Texas | 2004 | 22 | 2 | 1 | 1 | ||
| Texas | 2008 | 10 | 5 | 1 | 1 | ||
| Texas | 2012 | 4 | 1 | 1 | |||
| Utah | 2004 | 3 | 1 | 1 | |||
| Utah | 2008 | 32 | 3 | 1 | 1 | ||
| Utah | 2012 | 1 | 1 | 1 | |||
| Vermont | 2004 | -2 | 0 | 0 | |||
| Vermont | 2008 | -24 | -2 | 0 | 0 | ||
| Virginia | 2004 | 6 | 4 | 5 | 1 | 1 | |
| Virginia | 2008 | -4 | -4 | -18 | 0 | 0 | |
| Virginia | 2012 | 2 | -4 | 0.333333333 | 0 | ||
| Washington | 2004 | -8 | -4 | -10 | 0 | 0 | |
| Washington | 2008 | -11 | -16 | -6 | 0 | 0 | |
| Washington | 2012 | -13 | -14 | -8 | 0 | 0 | |
| West Virginia | 2004 | 6 | 6 | 1 | 1 | ||
| West Virginia | 2008 | 9 | 11 | 1 | 1 | ||
| West Virginia | 2012 | 1 | 1 | 1 | |||
| Wisconsin | 2004 | -1 | 1 | 0.533333333 | 0 | ||
| Wisconsin | 2008 | -7 | -16 | -12 | 0 | 0 | |
| Wisconsin | 2012 | 0 | -8 | 0 | 0 | ||
| Wyoming | 2004 | 1 | 1 | 1 | |||
| Wyoming | 2008 | 19 | 21 | 3 | 1 | 1 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment