Skip to content

Instantly share code, notes, and snippets.

@mwfrost
mwfrost / site_fetch.sh
Created May 26, 2011 20:56
wget fetch an entire site
# Mirror idlewords.com for offline browsing: recurse through links without
# climbing above the start URL, pull in the assets each page needs, save pages
# with an .html suffix, rewrite links to point at the local copies, and leave
# files that already exist untouched.
~/Downloads/wget/wget \
  --recursive --no-parent \
  --page-requisites \
  --html-extension --convert-links \
  --no-clobber \
  idlewords.com
@mwfrost
mwfrost / hit_share.r
Created July 26, 2011 13:30
Graph runs per player with share of team's hits
library(ggplot2)
library(plyr)
library(reshape)
# Source data: Batting.csv, extracted from
# http://baseball1.com/files/database/lahman58-csv.zip
bb <- read.csv("Batting.csv")
# Sort rows by year, then team, then runs (all ascending).
bb <- bb[with(bb, order(yearID, teamID, R)), ]
@mwfrost
mwfrost / reorder_factor_levels.r
Created August 31, 2011 01:31
reorder factor levels
x = factor(x,levels(x)[c(...reordered levels...)])
# Go to the folder where you’re keeping source code:
C:\Program Files\R\src\
# Open the git client and execute
git clone --recursive "http://github.com/hadley/stringr.git"
# In R, execute the install function with repos = NULL and using the entire path to the source directory:
pctchange <- function(v) {
  # Incremental percent change: each element's change relative to the element
  # before it, returned as a fraction (not multiplied by 100).
  # The first element is compared against itself, so the first result is 0
  # whenever v[1] is finite and non-zero.
  previous <- c(v[1], v[-length(v)])
  (v - previous) / previous
}
# weighted harmonic mean
whmean <- function(v, w){ #, na.rm=FALSE){
#v <- ifelse(na.rm==TRUE, v[!is.na(v)],v )
#w <- ifelse(na.rm==TRUE, w[!is.na(w)],w )
@mwfrost
mwfrost / cinderella.txt
Created September 10, 2011 09:17
Cinderella install log
curl -L https://github.com/atmos/cinderella/raw/master/bootstrap.sh -o - | sh % Total % Received % Xferd Average Speed Time Time Time Current
Dload Upload Total Spent Left Speed
102 1233 102 1233 0 0 1393 0 --:--:-- --:--:-- --:--:-- 1393
Ensuring we have the latest version of cinderella installed
A first time install takes about 45 minutes on a modern machine
You need to upgrade rubygems to 1.7.2
Cinderella installed successfully
sh: line 25: /usr/bin/cinderella: No such file or directory
Run started Fri Sep 9 22:09:17 EDT 2011
/Users/mfrost/.rvm/rubies/ruby-1.9.2-p290/bin/ruby
@mwfrost
mwfrost / MSHA.r
Created September 22, 2011 00:24
Parse the massive (by data civilian standards) dataset here: http://www.data.gov/details/4055
# Read the pipe-delimited mines table. Columns 1-59 are kept as read (no
# factor conversion), quoting is disabled, and short rows are padded.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
mdat <- read.table("Mines.TXT", header = TRUE, sep = "|", fill = TRUE,
                   as.is = 1:59, quote = "")
# Keep only rows with STATE "WV" and COAL_METAL_IND "C"
# (presumably West Virginia coal mines -- confirm against the data dictionary).
mdat_wv <- subset(mdat, STATE == "WV" & COAL_METAL_IND == "C")
# Example of epic data munging to pull the desired records out of a file that's too big to read all at once
# The first batch should include the header row
skip_count <- 250000
start_row <- skip_count + 1
# First batch: the header plus the first skip_count data rows. comment.char is
# disabled so a "#" inside a field does not truncate the line.
vdat <- read.table("Violations.TXT", nrows = skip_count, header = TRUE,
                   sep = "|", fill = TRUE, as.is = 1:55, quote = "",
                   comment.char = "")
@mwfrost
mwfrost / box_scores.r
Created October 24, 2011 11:33
Box Scores from MLB.com
# References:
# http://www.r-bloggers.com/mlb-baseball-pitching-matchups-downloading-pitch-fx-data-using-the-xml-package-in-r%C2%A0updatedx6/
# http://blogisticreflections.wordpress.com/2009/10/04/using-r-to-analyze-baseball-games-in-real-time/
# http://gd2.mlb.com/components/game/mlb/year_2010/month_10/day_12/miniscoreboard.xml
################################################################################
# Program Name: xml-mlb-gameday.R
@mwfrost
mwfrost / plotmatrix2.R
Created November 29, 2011 18:23 — forked from alaiacano/plotmatrix2.R
plotmatrix with aesthetics
plotmatrix2 <- function (data, mapping = aes())
{
grid <- expand.grid(x = 1:ncol(data), y = 1:ncol(data))
grid <- subset(grid, x != y)
all <- do.call("rbind", lapply(1:nrow(grid), function(i) {
xcol <- grid[i, "x"]
ycol <- grid[i, "y"]
data.frame(xvar = names(data)[ycol], yvar = names(data)[xcol],
x = data[, xcol], y = data[, ycol], data)
}))
@mwfrost
mwfrost / log_parser.r
Created May 24, 2012 12:31
Apache logs in R
require(plyr)
require(lubridate)
# Load the raw access log with read.table's default settings.
log <- read.table("httpd.combine.20120509")
# In the file I used, a space separated the timestamp from its time zone, so
# the timestamp arrives as two fields; the second is temporarily named 'V5'.
names(log) <- c(
  'host', 'identity', 'user', 'time', 'V5',
  'request', 'status', 'bytes', 'referer', 'agent'
)
# Rejoin the two timestamp pieces (paste separates them with a single space by default).
log$time <- paste(log$time, log$V5)
# Drop the now-redundant time zone column.
log[["V5"]] <- NULL