Skip to content

Instantly share code, notes, and snippets.

@mwfrost
Created September 22, 2011 00:24
Show Gist options
  • Select an option

  • Save mwfrost/1233722 to your computer and use it in GitHub Desktop.

Select an option

Save mwfrost/1233722 to your computer and use it in GitHub Desktop.
Parse the massive (by data civilian standards) dataset here: http://www.data.gov/details/4055
# Load the pipe-delimited mines table.
#   fill = TRUE    pads rows that have fewer fields than the widest row
#   as.is = 1:59   keeps the first 59 columns from being converted to factors
#   quote = ""     turns quote handling off, so quote characters inside a
#                  field are read as ordinary text
mdat <- read.table("Mines.TXT", header = TRUE, sep = "|", fill = TRUE,
                   as.is = c(1:59), quote = "")

# Keep the rows with STATE "WV" and COAL_METAL_IND "C" (presumably the West
# Virginia coal mines -- confirm the code values against the data dictionary).
mdat_wv <- subset(mdat, STATE == "WV" & COAL_METAL_IND == "C")
# Example of epic data munging to pull the desired records out of a file
# that's too big to read all at once.
#
# Violations.TXT (about 1.5 million rows) is read in batches of `skip_count`
# rows through a single open connection, so every batch resumes where the
# previous one stopped instead of re-scanning the file from the top, and the
# loop ends at end-of-file rather than at a guessed row count. Each batch is
# merged with the mines selected in `mdat_wv`, which keeps only the
# violations of those mines and attaches CURRENT_MINE_TYPE to them.
skip_count <- 250000
start_row <- 1
mine_keys <- mdat_wv[c("MINE_ID", "CURRENT_MINE_TYPE")]

# Read the next `n` rows from the open connection `con`.
# With `col_names = NULL` the batch is expected to start with the header row;
# otherwise `col_names` supplies the column names for a header-less batch.
# Returns a data frame with at most `n` rows (fewer at the end of the file).
read_violations_batch <- function(con, n, col_names = NULL) {
  if (is.null(col_names)) {
    read.table(con, nrows = n, header = TRUE, sep = "|", fill = TRUE,
               as.is = c(1:55), quote = "", comment.char = "")
  } else {
    # Supplying col.names also fixes the column count, so a batch is not
    # mis-sized when its first few lines happen to be short.
    read.table(con, nrows = n, header = FALSE, sep = "|", fill = TRUE,
               as.is = c(1:55), quote = "", comment.char = "",
               col.names = col_names)
  }
}

vcon <- file("Violations.TXT", open = "rt")
tryCatch({
  # The first batch includes the header row, which names the columns of
  # every later batch.
  vdat_temp <- read_violations_batch(vcon, skip_count)
  vnames <- names(vdat_temp)
  batches <- list(merge(mine_keys, vdat_temp))
  rows_collected <- nrow(batches[[1]])

  # A batch shorter than `skip_count` means the end of the file was reached.
  while (nrow(vdat_temp) == skip_count) {
    start_row <- start_row + skip_count
    print(paste("About to scan records ", start_row, " through ",
                start_row + skip_count))
    vdat_temp <- read_violations_batch(vcon, skip_count, vnames)
    matched <- merge(mine_keys, vdat_temp)
    # Collect the batches in a list and bind them once after the loop
    # instead of growing the data frame on every pass.
    batches[[length(batches) + 1]] <- matched
    rows_collected <- rows_collected + nrow(matched)
    print(paste("Rows collected: ", rows_collected))
    # Report the first and last event of the rows this batch contributed;
    # skipped when the batch held no matching mines.
    if (nrow(matched) > 0) {
      print(paste("Events ", matched[1, c("EVENT_NO")], " through ",
                  matched[nrow(matched), c("EVENT_NO")]))
    }
  }
}, finally = close(vcon))  # release the file handle even if a read fails

vdat <- do.call(rbind, batches)

# NOTE(review): write.table() produces a plain-text table even though the
# file name ends in .rda, so read it back with read.table(), not load().
write.table(vdat, "wv_vdat.rda")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment