Skip to content

Instantly share code, notes, and snippets.

@spara
Created February 3, 2012 14:42
Show Gist options
  • Select an option

  • Save spara/1730479 to your computer and use it in GitHub Desktop.

Select an option

Save spara/1730479 to your computer and use it in GitHub Desktop.
shell foo for data munging
#
# Look at the first 5 lines of a file
#
$ head -n 2 campus_dropout.csv
"CAMP_SPEN","CAMP_SPER","CAMP_SPED","CAMP_OVRN","CAMP_OVRR","CAMP_OVRD","CAMP_MIGN","CAMP_MIGR","CAMP_MIGD","CAMP_LEPN","CAMP_LEPR","CAMP_LEPD","CAMP_IMMN","CAMP_IMMR","CAMP_IMMD","CAMP_GFTN","CAMP_GFTR","CAMP_GFTD","CAMP_ECNN","CAMP_ECNR","CAMP_ECND","CAMP_CTEN","CAMP_CTER","CAMP_CTED","CAMP_TTLN","CAMP_TTLR","CAMP_TTLD","CAMP_BEN","CAMP_BER","CAMP_BED","CAMP_ATRN","CAMP_ATRR","CAMP_ATRD","CAMP_MUN","CAMP_MUR","CAMP_MUD","CAMP_PIN","CAMP_PIR","CAMP_PID","CAMP_WHN","CAMP_WHR","CAMP_WHD","CAMP_NAN","CAMP_NAR","CAMP_NAD","CAMP_HSN","CAMP_HSR","CAMP_HSD","CAMP_ASN","CAMP_ASR","CAMP_ASD","CAMP_AAN","CAMP_AAR","CAMP_AAD","CAMP_FEMN","CAMP_FEMR","CAMP_FEMD","CAMP_MALN","CAMP_MALR","CAMP_MALD","CAMP_ALLN","CAMP_ALLR","CAMP_ALLD","DISTRICT","Gradespan","campus","cntyname","regnname","campname","distname","COUNTY","REGION"
"0","0.0","28","-","3.7","<100",".",".",".",".",".",".",".",".",".","0","0.0","24","-","1.5","<100","0","0.0","45","-","1.4","<300",".",".",".","-","4.4","<100","0","0.0","10",".",".",".","-","1.8","<200",".",".",".","0","0.0","9","0","0.0","1","0","0.0","29","-","2.3","<100","-","0.8","<200","-","1.4","<300","001902","712","001902001","Anderson County","Kilgore","Cayuga H S","Cayuga ISD","001","07"
#
# look at the last line of the file
#
$ tail -n 1 campus_dropout.csv
"0","0.0","31","-","12.5","<100","0","0.0","39","0","0.0","12",".",".",".","0","0.0","20","-","1.3","<200","-","1.1","<100","-","1.5","<300","0","0.0","11","-","1.5","<200","0","0.0","1",".",".",".","0","0.0","12",".",".",".","-","2.1","<200",".",".",".",".",".",".","-","2.9","<200","-","1.0","<200","-","1.9","<300","254902","912","254902001","Zavala County","San Antonio","La Pryor H S","La Pryor ISD","254","20"
#
# replace the "." values with nulls using sed and write to
# another file
#
sed 's/\"\.\"//g' campus_dropout.csv > campus_dropout_clean.csv
#
# check the output of the new file
#
$ head -n 2 campus_dropout_clean.csv
"CAMP_SPEN","CAMP_SPER","CAMP_SPED","CAMP_OVRN","CAMP_OVRR","CAMP_OVRD","CAMP_MIGN","CAMP_MIGR","CAMP_MIGD","CAMP_LEPN","CAMP_LEPR","CAMP_LEPD","CAMP_IMMN","CAMP_IMMR","CAMP_IMMD","CAMP_GFTN","CAMP_GFTR","CAMP_GFTD","CAMP_ECNN","CAMP_ECNR","CAMP_ECND","CAMP_CTEN","CAMP_CTER","CAMP_CTED","CAMP_TTLN","CAMP_TTLR","CAMP_TTLD","CAMP_BEN","CAMP_BER","CAMP_BED","CAMP_ATRN","CAMP_ATRR","CAMP_ATRD","CAMP_MUN","CAMP_MUR","CAMP_MUD","CAMP_PIN","CAMP_PIR","CAMP_PID","CAMP_WHN","CAMP_WHR","CAMP_WHD","CAMP_NAN","CAMP_NAR","CAMP_NAD","CAMP_HSN","CAMP_HSR","CAMP_HSD","CAMP_ASN","CAMP_ASR","CAMP_ASD","CAMP_AAN","CAMP_AAR","CAMP_AAD","CAMP_FEMN","CAMP_FEMR","CAMP_FEMD","CAMP_MALN","CAMP_MALR","CAMP_MALD","CAMP_ALLN","CAMP_ALLR","CAMP_ALLD","DISTRICT","Gradespan","campus","cntyname","regnname","campname","distname","COUNTY","REGION"
"0","0.0","28","-","3.7","<100",,,,,,,,,,"0","0.0","24","-","1.5","<100","0","0.0","45","-","1.4","<300",,,,"-","4.4","<100","0","0.0","10",,,,"-","1.8","<200",,,,"0","0.0","9","0","0.0","1","0","0.0","29","-","2.3","<100","-","0.8","<200","-","1.4","<300","001902","712","001902001","Anderson County","Kilgore","Cayuga H S","Cayuga ISD","001","07"
#
# create a sql script to create a table in postgres
# First step is extract the header
#
$ head -n 1 campus_dropout.csv > campus_header
#
# look at header
#
$ cat campus_header
#
# put each column name on a separate line and add sql type
# definition using sed
#
$ sed 's/,/ varchar,\
/g' campus_header > campus_dropout.sql
#
# look at output
#
$cat campus_dropout.sql
#
# Finish the script by adding a create table statement
# using your favorite editor or on the command line.
# Note that this version of sed does not have inplace
# editing, so a tmp file is used
#
$ sed '1s/^/CREATE TABLE campust_dropout_raw( /' campus_dropout.sql > tmp
$ sed '$s/$/ varchar);/' tmp > campus_dropout.sql
#
# Make a sql script to create a table with only the dropout rate
# column using sed
#
$ tr -s ',' '\n' < campus_header | sed -n '2~3p' > campus_dropout_rate.sql
# grab the first n items in a file
$ wc -l campus_dropout_rate.sql
$ head -n 21 campus_dropout_rate.sql > first
# grab the last n items in a file
$ tr -s ',' '\n' < campus_header | tail -n 9 > last
# merge the two files
cat first last > campus_dropout_rate.sql
# add CREATE TABLE and closing parentheses
$ sed '1s/^/CREATE TABLE campust_dropout_rate( /' campus_dropout_rate.sql > tmp
$ sed '$s/$/ varchar);/' tmp > campus_dropout_rate.sql
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment