Created
February 3, 2012 14:42
-
-
Save spara/1730479 to your computer and use it in GitHub Desktop.
shell foo for data munging
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # | |
| # Look at the first 5 lines of a file | |
| # | |
| $ head -n 2 campus_dropout.csv | |
| "CAMP_SPEN","CAMP_SPER","CAMP_SPED","CAMP_OVRN","CAMP_OVRR","CAMP_OVRD","CAMP_MIGN","CAMP_MIGR","CAMP_MIGD","CAMP_LEPN","CAMP_LEPR","CAMP_LEPD","CAMP_IMMN","CAMP_IMMR","CAMP_IMMD","CAMP_GFTN","CAMP_GFTR","CAMP_GFTD","CAMP_ECNN","CAMP_ECNR","CAMP_ECND","CAMP_CTEN","CAMP_CTER","CAMP_CTED","CAMP_TTLN","CAMP_TTLR","CAMP_TTLD","CAMP_BEN","CAMP_BER","CAMP_BED","CAMP_ATRN","CAMP_ATRR","CAMP_ATRD","CAMP_MUN","CAMP_MUR","CAMP_MUD","CAMP_PIN","CAMP_PIR","CAMP_PID","CAMP_WHN","CAMP_WHR","CAMP_WHD","CAMP_NAN","CAMP_NAR","CAMP_NAD","CAMP_HSN","CAMP_HSR","CAMP_HSD","CAMP_ASN","CAMP_ASR","CAMP_ASD","CAMP_AAN","CAMP_AAR","CAMP_AAD","CAMP_FEMN","CAMP_FEMR","CAMP_FEMD","CAMP_MALN","CAMP_MALR","CAMP_MALD","CAMP_ALLN","CAMP_ALLR","CAMP_ALLD","DISTRICT","Gradespan","campus","cntyname","regnname","campname","distname","COUNTY","REGION" | |
| "0","0.0","28","-","3.7","<100",".",".",".",".",".",".",".",".",".","0","0.0","24","-","1.5","<100","0","0.0","45","-","1.4","<300",".",".",".","-","4.4","<100","0","0.0","10",".",".",".","-","1.8","<200",".",".",".","0","0.0","9","0","0.0","1","0","0.0","29","-","2.3","<100","-","0.8","<200","-","1.4","<300","001902","712","001902001","Anderson County","Kilgore","Cayuga H S","Cayuga ISD","001","07" | |
| # | |
| # look at the last line of the file | |
| # | |
| $ tail -n 1 campus_dropout.csv | |
| "0","0.0","31","-","12.5","<100","0","0.0","39","0","0.0","12",".",".",".","0","0.0","20","-","1.3","<200","-","1.1","<100","-","1.5","<300","0","0.0","11","-","1.5","<200","0","0.0","1",".",".",".","0","0.0","12",".",".",".","-","2.1","<200",".",".",".",".",".",".","-","2.9","<200","-","1.0","<200","-","1.9","<300","254902","912","254902001","Zavala County","San Antonio","La Pryor H S","La Pryor ISD","254","20" | |
| # | |
| # replace the "." values with nulls using sed and write to | |
| # another file | |
| # | |
| sed 's/\"\.\"//g' campus_dropout.csv > campus_dropout_clean.csv | |
| # | |
| # check the output of the new file | |
| # | |
| $ head -n 2 campus_dropout_clean.csv | |
| "CAMP_SPEN","CAMP_SPER","CAMP_SPED","CAMP_OVRN","CAMP_OVRR","CAMP_OVRD","CAMP_MIGN","CAMP_MIGR","CAMP_MIGD","CAMP_LEPN","CAMP_LEPR","CAMP_LEPD","CAMP_IMMN","CAMP_IMMR","CAMP_IMMD","CAMP_GFTN","CAMP_GFTR","CAMP_GFTD","CAMP_ECNN","CAMP_ECNR","CAMP_ECND","CAMP_CTEN","CAMP_CTER","CAMP_CTED","CAMP_TTLN","CAMP_TTLR","CAMP_TTLD","CAMP_BEN","CAMP_BER","CAMP_BED","CAMP_ATRN","CAMP_ATRR","CAMP_ATRD","CAMP_MUN","CAMP_MUR","CAMP_MUD","CAMP_PIN","CAMP_PIR","CAMP_PID","CAMP_WHN","CAMP_WHR","CAMP_WHD","CAMP_NAN","CAMP_NAR","CAMP_NAD","CAMP_HSN","CAMP_HSR","CAMP_HSD","CAMP_ASN","CAMP_ASR","CAMP_ASD","CAMP_AAN","CAMP_AAR","CAMP_AAD","CAMP_FEMN","CAMP_FEMR","CAMP_FEMD","CAMP_MALN","CAMP_MALR","CAMP_MALD","CAMP_ALLN","CAMP_ALLR","CAMP_ALLD","DISTRICT","Gradespan","campus","cntyname","regnname","campname","distname","COUNTY","REGION" | |
| "0","0.0","28","-","3.7","<100",,,,,,,,,,"0","0.0","24","-","1.5","<100","0","0.0","45","-","1.4","<300",,,,"-","4.4","<100","0","0.0","10",,,,"-","1.8","<200",,,,"0","0.0","9","0","0.0","1","0","0.0","29","-","2.3","<100","-","0.8","<200","-","1.4","<300","001902","712","001902001","Anderson County","Kilgore","Cayuga H S","Cayuga ISD","001","07" | |
| # | |
| # create a sql script to create a table in postgres | |
| # First step is extract the header | |
| # | |
| $ head -n 1 campus_dropout.csv > campus_header | |
| # | |
| # look at header | |
| # | |
| $ cat campus_header | |
| # | |
| # put each column name on a separate line and add sql type | |
| # definition using sed | |
| # | |
| $ sed 's/,/ varchar,\ | |
| /g' campus_header > campus_dropout.sql | |
| # | |
| # look at output | |
| # | |
| $cat campus_dropout.sql | |
| # | |
| # Finish the script by adding a create table statement | |
| # using your favorite editor or on the command line. | |
| # Note that this version of sed does not have inplace | |
| # editing, so a tmp file is used | |
| # | |
| $ sed '1s/^/CREATE TABLE campust_dropout_raw( /' campus_dropout.sql > tmp | |
| $ sed '$s/$/ varchar);/' tmp > campus_dropout.sql | |
| # | |
| # Make a sql script to create a table with only the dropout rate | |
| # column using sed | |
| # | |
| $ tr -s ',' '\n' < campus_header | sed -n '2~3p' > campus_dropout_rate.sql | |
| # grab the first n items in a file | |
| $ wc -l campus_dropout_rate.sql | |
| $ head -n 21 campus_dropout_rate.sql > first | |
| # grab the last n items in a file | |
| $ tr -s ',' '\n' < campus_header | tail -n 9 > last | |
| # merge the two files | |
| cat first last > campus_dropout_rate.sql | |
| # add CREATE TABLE and closing parentheses | |
| $ sed '1s/^/CREATE TABLE campust_dropout_rate( /' campus_dropout_rate.sql > tmp | |
| $ sed '$s/$/ varchar);/' tmp > campus_dropout_rate.sql | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment