spara · February 3, 2012 14:42
diff --git a/clean_data.sh b/clean_data.sh
 #
 # Look at the first 5 lines of a file
 #
 $ head -n 2 campus_dropout.csv 
 "CAMP_SPEN","CAMP_SPER","CAMP_SPED","CAMP_OVRN","CAMP_OVRR","CAMP_OVRD","CAMP_MIGN","CAMP_MIGR","CAMP_MIGD","CAMP_LEPN","CAMP_LEPR","CAMP_LEPD","CAMP_IMMN","CAMP_IMMR","CAMP_IMMD","CAMP_GFTN","CAMP_GFTR","CAMP_GFTD","CAMP_ECNN","CAMP_ECNR","CAMP_ECND","CAMP_CTEN","CAMP_CTER","CAMP_CTED","CAMP_TTLN","CAMP_TTLR","CAMP_TTLD","CAMP_BEN","CAMP_BER","CAMP_BED","CAMP_ATRN","CAMP_ATRR","CAMP_ATRD","CAMP_MUN","CAMP_MUR","CAMP_MUD","CAMP_PIN","CAMP_PIR","CAMP_PID","CAMP_WHN","CAMP_WHR","CAMP_WHD","CAMP_NAN","CAMP_NAR","CAMP_NAD","CAMP_HSN","CAMP_HSR","CAMP_HSD","CAMP_ASN","CAMP_ASR","CAMP_ASD","CAMP_AAN","CAMP_AAR","CAMP_AAD","CAMP_FEMN","CAMP_FEMR","CAMP_FEMD","CAMP_MALN","CAMP_MALR","CAMP_MALD","CAMP_ALLN","CAMP_ALLR","CAMP_ALLD","DISTRICT","Gradespan","campus","cntyname","regnname","campname","distname","COUNTY","REGION"
 "0","0.0","28","-","3.7","<100",".",".",".",".",".",".",".",".",".","0","0.0","24","-","1.5","<100","0","0.0","45","-","1.4","<300",".",".",".","-","4.4","<100","0","0.0","10",".",".",".","-","1.8","<200",".",".",".","0","0.0","9","0","0.0","1","0","0.0","29","-","2.3","<100","-","0.8","<200","-","1.4","<300","001902","712","001902001","Anderson County","Kilgore","Cayuga H S","Cayuga ISD","001","07"

 #
 # look at the last line of the file
 #
 $ tail -n 1 campus_dropout.csv 
 "0","0.0","31","-","12.5","<100","0","0.0","39","0","0.0","12",".",".",".","0","0.0","20","-","1.3","<200","-","1.1","<100","-","1.5","<300","0","0.0","11","-","1.5","<200","0","0.0","1",".",".",".","0","0.0","12",".",".",".","-","2.1","<200",".",".",".",".",".",".","-","2.9","<200","-","1.0","<200","-","1.9","<300","254902","912","254902001","Zavala County","San Antonio","La Pryor H S","La Pryor ISD","254","20"


 #
 # replace the "." values with nulls using sed and write to
 # another file
 #
 sed 's/\"\.\"//g' campus_dropout.csv > campus_dropout_clean.csv

 #
 # check the output of the new file
 #
 $ head -n 2 campus_dropout_clean.csv 
 "CAMP_SPEN","CAMP_SPER","CAMP_SPED","CAMP_OVRN","CAMP_OVRR","CAMP_OVRD","CAMP_MIGN","CAMP_MIGR","CAMP_MIGD","CAMP_LEPN","CAMP_LEPR","CAMP_LEPD","CAMP_IMMN","CAMP_IMMR","CAMP_IMMD","CAMP_GFTN","CAMP_GFTR","CAMP_GFTD","CAMP_ECNN","CAMP_ECNR","CAMP_ECND","CAMP_CTEN","CAMP_CTER","CAMP_CTED","CAMP_TTLN","CAMP_TTLR","CAMP_TTLD","CAMP_BEN","CAMP_BER","CAMP_BED","CAMP_ATRN","CAMP_ATRR","CAMP_ATRD","CAMP_MUN","CAMP_MUR","CAMP_MUD","CAMP_PIN","CAMP_PIR","CAMP_PID","CAMP_WHN","CAMP_WHR","CAMP_WHD","CAMP_NAN","CAMP_NAR","CAMP_NAD","CAMP_HSN","CAMP_HSR","CAMP_HSD","CAMP_ASN","CAMP_ASR","CAMP_ASD","CAMP_AAN","CAMP_AAR","CAMP_AAD","CAMP_FEMN","CAMP_FEMR","CAMP_FEMD","CAMP_MALN","CAMP_MALR","CAMP_MALD","CAMP_ALLN","CAMP_ALLR","CAMP_ALLD","DISTRICT","Gradespan","campus","cntyname","regnname","campname","distname","COUNTY","REGION"
 "0","0.0","28","-","3.7","<100",,,,,,,,,,"0","0.0","24","-","1.5","<100","0","0.0","45","-","1.4","<300",,,,"-","4.4","<100","0","0.0","10",,,,"-","1.8","<200",,,,"0","0.0","9","0","0.0","1","0","0.0","29","-","2.3","<100","-","0.8","<200","-","1.4","<300","001902","712","001902001","Anderson County","Kilgore","Cayuga H S","Cayuga ISD","001","07"

 #
 # create a sql script to create a table in postgres
 # First step is extract the header
 #
 $ head -n 1 campus_dropout.csv > campus_header

 #
 # look at header
 #
 $ cat campus_header

 # 
 # put each column name on a separate line and add sql type
 # definition using sed
 #
 $ sed 's/,/ varchar,\
 /g' campus_header > campus_dropout.sql

 #
 # look at output
 #
 $cat campus_dropout.sql

 #
 # Finish the script by adding a create table statement
 # using your favorite editor or on the command line.
 # Note that this version of sed does not have inplace 
 # editing, so a tmp file is used
 #
 $ sed '1s/^/CREATE TABLE campust_dropout_raw( /' campus_dropout.sql > tmp
 $ sed '$s/$/ varchar);/' tmp > campus_dropout.sql

 #
 # Make a sql script to create a table with only the dropout rate 
 # column using sed
 #
 $  tr -s ',' '\n' < campus_header | sed -n '2~3p' > campus_dropout_rate.sql

 # grab the first n items in a file
 $ wc -l campus_dropout_rate.sql
 $ head -n 21 campus_dropout_rate.sql > first

 # grab the last n items in a file
 $  tr -s ',' '\n' < campus_header | tail -n 9 > last

 # merge the two files
 cat first last > campus_dropout_rate.sql

 # add CREATE TABLE and closing parentheses
 $ sed '1s/^/CREATE TABLE campust_dropout_rate( /' campus_dropout_rate.sql > tmp
 $ sed '$s/$/ varchar);/' tmp > campus_dropout_rate.sql
	#
	# Look at the first 5 lines of a file
	#
	$ head -n 2 campus_dropout.csv
	"CAMP_SPEN","CAMP_SPER","CAMP_SPED","CAMP_OVRN","CAMP_OVRR","CAMP_OVRD","CAMP_MIGN","CAMP_MIGR","CAMP_MIGD","CAMP_LEPN","CAMP_LEPR","CAMP_LEPD","CAMP_IMMN","CAMP_IMMR","CAMP_IMMD","CAMP_GFTN","CAMP_GFTR","CAMP_GFTD","CAMP_ECNN","CAMP_ECNR","CAMP_ECND","CAMP_CTEN","CAMP_CTER","CAMP_CTED","CAMP_TTLN","CAMP_TTLR","CAMP_TTLD","CAMP_BEN","CAMP_BER","CAMP_BED","CAMP_ATRN","CAMP_ATRR","CAMP_ATRD","CAMP_MUN","CAMP_MUR","CAMP_MUD","CAMP_PIN","CAMP_PIR","CAMP_PID","CAMP_WHN","CAMP_WHR","CAMP_WHD","CAMP_NAN","CAMP_NAR","CAMP_NAD","CAMP_HSN","CAMP_HSR","CAMP_HSD","CAMP_ASN","CAMP_ASR","CAMP_ASD","CAMP_AAN","CAMP_AAR","CAMP_AAD","CAMP_FEMN","CAMP_FEMR","CAMP_FEMD","CAMP_MALN","CAMP_MALR","CAMP_MALD","CAMP_ALLN","CAMP_ALLR","CAMP_ALLD","DISTRICT","Gradespan","campus","cntyname","regnname","campname","distname","COUNTY","REGION"
	"0","0.0","28","-","3.7","<100",".",".",".",".",".",".",".",".",".","0","0.0","24","-","1.5","<100","0","0.0","45","-","1.4","<300",".",".",".","-","4.4","<100","0","0.0","10",".",".",".","-","1.8","<200",".",".",".","0","0.0","9","0","0.0","1","0","0.0","29","-","2.3","<100","-","0.8","<200","-","1.4","<300","001902","712","001902001","Anderson County","Kilgore","Cayuga H S","Cayuga ISD","001","07"

	#
	# look at the last line of the file
	#
	$ tail -n 1 campus_dropout.csv
	"0","0.0","31","-","12.5","<100","0","0.0","39","0","0.0","12",".",".",".","0","0.0","20","-","1.3","<200","-","1.1","<100","-","1.5","<300","0","0.0","11","-","1.5","<200","0","0.0","1",".",".",".","0","0.0","12",".",".",".","-","2.1","<200",".",".",".",".",".",".","-","2.9","<200","-","1.0","<200","-","1.9","<300","254902","912","254902001","Zavala County","San Antonio","La Pryor H S","La Pryor ISD","254","20"


	#
	# replace the "." values with nulls using sed and write to
	# another file
	#
	sed 's/\"\.\"//g' campus_dropout.csv > campus_dropout_clean.csv

	#
	# check the output of the new file
	#
	$ head -n 2 campus_dropout_clean.csv
	"CAMP_SPEN","CAMP_SPER","CAMP_SPED","CAMP_OVRN","CAMP_OVRR","CAMP_OVRD","CAMP_MIGN","CAMP_MIGR","CAMP_MIGD","CAMP_LEPN","CAMP_LEPR","CAMP_LEPD","CAMP_IMMN","CAMP_IMMR","CAMP_IMMD","CAMP_GFTN","CAMP_GFTR","CAMP_GFTD","CAMP_ECNN","CAMP_ECNR","CAMP_ECND","CAMP_CTEN","CAMP_CTER","CAMP_CTED","CAMP_TTLN","CAMP_TTLR","CAMP_TTLD","CAMP_BEN","CAMP_BER","CAMP_BED","CAMP_ATRN","CAMP_ATRR","CAMP_ATRD","CAMP_MUN","CAMP_MUR","CAMP_MUD","CAMP_PIN","CAMP_PIR","CAMP_PID","CAMP_WHN","CAMP_WHR","CAMP_WHD","CAMP_NAN","CAMP_NAR","CAMP_NAD","CAMP_HSN","CAMP_HSR","CAMP_HSD","CAMP_ASN","CAMP_ASR","CAMP_ASD","CAMP_AAN","CAMP_AAR","CAMP_AAD","CAMP_FEMN","CAMP_FEMR","CAMP_FEMD","CAMP_MALN","CAMP_MALR","CAMP_MALD","CAMP_ALLN","CAMP_ALLR","CAMP_ALLD","DISTRICT","Gradespan","campus","cntyname","regnname","campname","distname","COUNTY","REGION"
	"0","0.0","28","-","3.7","<100",,,,,,,,,,"0","0.0","24","-","1.5","<100","0","0.0","45","-","1.4","<300",,,,"-","4.4","<100","0","0.0","10",,,,"-","1.8","<200",,,,"0","0.0","9","0","0.0","1","0","0.0","29","-","2.3","<100","-","0.8","<200","-","1.4","<300","001902","712","001902001","Anderson County","Kilgore","Cayuga H S","Cayuga ISD","001","07"

	#
	# create a sql script to create a table in postgres
	# First step is extract the header
	#
	$ head -n 1 campus_dropout.csv > campus_header

	#
	# look at header
	#
	$ cat campus_header

	#
	# put each column name on a separate line and add sql type
	# definition using sed
	#
	$ sed 's/,/ varchar,\
	/g' campus_header > campus_dropout.sql

	#
	# look at output
	#
	$cat campus_dropout.sql

	#
	# Finish the script by adding a create table statement
	# using your favorite editor or on the command line.
	# Note that this version of sed does not have inplace
	# editing, so a tmp file is used
	#
	$ sed '1s/^/CREATE TABLE campust_dropout_raw( /' campus_dropout.sql > tmp
	$ sed '$s/$/ varchar);/' tmp > campus_dropout.sql

	#
	# Make a sql script to create a table with only the dropout rate
	# column using sed
	#
	$ tr -s ',' '\n' < campus_header \| sed -n '2~3p' > campus_dropout_rate.sql

	# grab the first n items in a file
	$ wc -l campus_dropout_rate.sql
	$ head -n 21 campus_dropout_rate.sql > first

	# grab the last n items in a file
	$ tr -s ',' '\n' < campus_header \| tail -n 9 > last

	# merge the two files
	cat first last > campus_dropout_rate.sql

	# add CREATE TABLE and closing parentheses
	$ sed '1s/^/CREATE TABLE campust_dropout_rate( /' campus_dropout_rate.sql > tmp
	$ sed '$s/$/ varchar);/' tmp > campus_dropout_rate.sql
No results found