Created
September 22, 2011 00:24
-
-
Save mwfrost/1233722 to your computer and use it in GitHub Desktop.
Parse the massive (by data civilian standards) dataset here: http://www.data.gov/details/4055
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Extract the violations issued to West Virginia coal mines from the data set
# at http://www.data.gov/details/4055.
#
# Both input files are pipe-delimited text with a header row. Quote handling
# is switched off (and, for the violations file, comment handling too) so that
# quote and "#" characters inside fields are read literally.

mdat <- read.table(
  "Mines.TXT",
  header = TRUE, sep = "|", fill = TRUE, as.is = 1:59, quote = ""
)
mdat_wv <- subset(mdat, STATE == "WV" & COAL_METAL_IND == "C")

# Example of epic data munging to pull the desired records out of a file
# that's too big to read all at once: Violations.TXT has about 1.5 million
# rows, so it is read in batches of this many rows.
skip_count <- 250000

# Read the pipe-delimited file at `path` in batches of `batch_size` rows and
# keep only the rows that merge() matches against `keys`.
#
# path        Path of the violations file; its first line is the header.
# keys        Data frame of key columns; each batch is joined to it with
#             merge(keys, batch), i.e. on the column names they share.
# batch_size  Maximum number of data rows read per batch.
#
# Returns a single data frame holding the merged rows of every batch.
read_wv_violations <- function(path, keys, batch_size) {
  # One open connection is read sequentially, so every batch continues where
  # the previous one stopped instead of re-scanning the file from the top.
  con <- file(path, open = "r")
  on.exit(close(con), add = TRUE)

  read_batch <- function(...) {
    read.table(
      con,
      nrows = batch_size, sep = "|", fill = TRUE, as.is = 1:55,
      quote = "", comment.char = "", ...
    )
  }

  # Only the first batch sees the header row; later batches reuse its names.
  batch <- read_batch(header = TRUE)
  vnames <- names(batch)

  kept <- list()
  rows_read <- 0
  rows_kept <- 0

  repeat {
    matched <- merge(keys, batch)
    kept[[length(kept) + 1L]] <- matched
    rows_read <- rows_read + nrow(batch)
    rows_kept <- rows_kept + nrow(matched)

    print(paste("Rows collected: ", rows_kept))
    if (nrow(matched) > 0) {
      events <- matched[["EVENT_NO"]]
      print(paste("Events ", events[1], " through ", events[length(events)]))
    }

    # A short (or empty) batch means the end of the file has been reached.
    if (nrow(batch) < batch_size) {
      break
    }

    print(sprintf(
      "About to scan records %.0f through %.0f",
      rows_read + 1, rows_read + batch_size
    ))
    # Supplying col.names makes read.table() return a zero-row data frame,
    # rather than stop with "no lines available in input", when the file
    # length is an exact multiple of batch_size.
    batch <- read_batch(header = FALSE, col.names = vnames)
  }

  # Bind once at the end instead of growing the result inside the loop.
  do.call(rbind, kept)
}

vdat <- read_wv_violations(
  "Violations.TXT",
  mdat_wv[c("MINE_ID", "CURRENT_MINE_TYPE")],
  skip_count
)

# NOTE: despite the .rda extension this is the plain-text output of
# write.table(), not a save() image; read it back with read.table(). The file
# name is kept unchanged for anything that already consumes it.
write.table(vdat, "wv_vdat.rda")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment