Skip to content

Instantly share code, notes, and snippets.

View alexhanna's full-sized avatar

Alex Hanna alexhanna

View GitHub Profile
@alexhanna
alexhanna / rupaul.R
Last active December 14, 2015 18:48
RuPaul's drag race
library(RCurl)
library(ggplot2)
library(survival)
## plot Kaplan-Meier curve
plotKM <- function(df, metric) {
## make age categorical
df$Age[df$Age < 25] <- 1
df$Age[df$Age >= 25 & df$Age < 31] <- 2
df$Age[df$Age >= 31] <- 3
@alexhanna
alexhanna / sweaveWeb.Rnw
Created March 14, 2013 15:26
When Brett dared me to Sweave Adam's website.
\documentclass{article}
\usepackage{graphicx}
\title{Can I Sweave Adam's Website?}
\author{Alexander Hanna}
\begin{document}
\maketitle
% put R code here
@alexhanna
alexhanna / polClassify.R
Created March 21, 2013 00:43
Political classifier, largely adapted from Machine Learning for Hackers.
# File-Name: polClassify.R
# Edited: 2013-03-20
# Orig.Author: Drew Conway ([email protected])
#
# Packages Used: tm, ggplot2
#
# All source code is copyright (c) 2012, under the Simplified BSD License.
# For more information on the Simplified BSD (FreeBSD) License see: http://www.opensource.org/licenses/bsd-license.php
@alexhanna
alexhanna / rupaulModelFit.R
Last active December 15, 2015 09:19
Model fit with residuals
## Fit t.cox2_ph: a Cox model on t.surv with the listed main effects plus
## interactions of Wins, Highs, Lows and Lipsyncs with CompLeft. In an R formula
## `a*b` expands to a + b + a:b, so the `*` terms only add the a:b interactions
## here because the main effects are already listed.
## t.surv and df are not created in this excerpt and must already exist.
## NOTE(review): cluster(ID) presumably groups rows by ID for robust variance
## estimation -- confirm against the survival package documentation.
t.cox2_ph <- coxph(t.surv ~ (Age + PlusSize + PuertoRico + Wins + Highs + Lows + Lipsyncs + CompLeft +
Wins*CompLeft + Highs*CompLeft + Lows*CompLeft + Lipsyncs*CompLeft) + cluster(ID), df)
## Fit t.cox3s: main effects only, with LipsyncWithoutOut in place of Lipsyncs.
t.cox3s <- coxph(t.surv ~ (Age + PlusSize + PuertoRico + Wins + Highs + Lows + LipsyncWithoutOut + CompLeft) + cluster(ID), df)
## Empty data frame with typed ID, Residuals and Model columns
## (presumably filled in by the loop that follows -- the loop body is not visible here).
model.df <- data.frame(ID = integer(0), Residuals = double(0), Model = character(0))
## Named list of four fitted models; the loop below reads each name via names()
## and each model via [[i]]. t.cox2 and t.cox3 are not created in this excerpt
## and must already exist.
model.list <- list(c2 = t.cox2, c2ph = t.cox2_ph, c3 = t.cox3, c3s = t.cox3s)
for (i in 1:length(model.list)) {
name <- names(model.list[i])
cMod <- model.list[[i]]
@alexhanna
alexhanna / schema.sql
Last active August 29, 2017 07:02
Creating Twitter Hive schema.
-- Session-level settings issued before the table definition that follows.
-- Turn on compression of query output.
SET hive.exec.compress.output=true;
-- Cap on input split size (presumably in bytes, i.e. about 256 MB -- confirm).
SET mapred.max.split.size=256000000;
-- Output compression type BLOCK with the Snappy codec class.
SET mapred.output.compression.type=BLOCK;
SET mapred.output.compression.codec=org.apache.hadoop.io.compress.SnappyCodec;
-- Enable dynamic partitioning in nonstrict mode.
-- NOTE(review): nonstrict presumably allows every partition column to be
-- dynamic -- confirm against the Hive configuration documentation.
SET hive.exec.dynamic.partition.mode=nonstrict;
SET hive.exec.dynamic.partition=true;
CREATE EXTERNAL TABLE gh_raw (
id BIGINT,
created_at STRING,
@alexhanna
alexhanna / sentimentTweet.py
Created April 22, 2014 13:03
Gist for generating sentiment scores for political tweets from the gardenhose and a focused sample
from __future__ import division
import csv, logging, math, os.path
import pickle, random, re, string
import time
import numpy as np
import pandas as pd
import nltk.data
from nltk.tokenize.regexp import WordPunctTokenizer
@alexhanna
alexhanna / sentiment.R
Created April 22, 2014 13:04
Plot sentiment for candidates
#!/usr/bin/env Rscript
library(ggplot2)
library(grid)
library(lubridate)
library(scales)
# datetimeToEasternDate <- function(x) {
# ## create as UTC
# x <- as.POSIXct(x, format="%Y-%m-%d %H:%M", tz="UTC")
Spec GradCount JobCount
Sex and Gender 108 31
Education 83 9
Medical Sociology 83 15
Race, Class, and Gender 79 15
Racial and Ethnic Relations 75 43
Cultural Sociology 68 30
Crime/Delinquency 66 89
Environmental Sociology 65 34
Social Psychology 65 17
library(ggplot2)
library(grid)
## Source of the figures:
## http://www.asanet.org/documents/research/pdfs/2013_ASA_Job_Bank_Analysis.pdf
df <- read.csv(file = "../data/asa2013report.csv", header = TRUE)
## Keep only rows with a positive GradCount, i.e. drop categories that may have
## many job postings but no grads.
df <- df[df$GradCount > 0, ]
## Tile plot of df.p: Margin on the x axis, one row per `variable` on the y axis
## (labelled "Feature"), tile fill by Class and tile transparency by value.
## df.p and wes.palette() are not defined in this excerpt and must already be
## available when this runs.
p <- ggplot(df.p, aes(x = Margin, y = factor(variable), fill = Class, alpha = value)) +
  theme_bw() +
  geom_tile(color = NA, width = 0.005) +
  scale_fill_manual(
    values = wes.palette(2, "Royal1"),
    labels = c("False Positives", "True Positives")
  ) +
  ## theme() must come after theme_bw() so these overrides are not reset.
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.text.y = element_text(size = 7)
  ) +
  ylab("Feature")
## Name both arguments explicitly: the previous call relied on `file` partially
## matching `filename` and on the plot falling into `plot` by position.
ggsave(
  filename = "../img/linearsvc_no-fs_top100_fp-v-tp_20140916.png",
  plot = p,
  width = 16,
  height = 9
)