WilliamQLiu · August 29, 2015 14:13
diff --git a/csvkit_tutorial b/csvkit_tutorial
 # From https://csvkit.readthedocs.org

 # Shows you stats about the file (e.g. col names, type, nulls, min, max, median, std deviation, unique values, most frequent)
 $csvstat myfile.csv

 # Peak at excel file to display in terminal
 $in2csv myfile.xlsx
 $in2csv myfile.xlsx > myfile.csv  # can write xlsx file to csv by chaining operations

 # Look at data, formats nicely with borders
 $csvlook myfile.csv

 # See column names
 $csvcut -n myfile.csv

 # Cut out specific columns
 $csvcut -c 2,5,6 myfile.csv  # Cut out columns to just 2,5,6
 $csvcut -c col1,col2,col3 myfile.csv  # can also specify by name (make sure not to leave spaces)

 # Chain commands to pass data along using the pipeline symbol |
 $csvcut -c RespondentID,CollectorID,StartDate,EndDate Sheet_1.csv | csvlook | head

 # Find cells matching a regular expression (e.g. pattern "ddd-123-dddd")
 $csvgrep -c phone_number -r "\d{3}-123-\d{4}" mydata.csv > matchrx.csv

 # Merge csv files together so you can do aggregate analysis (assuming they have the same headers)
 $csvstack firstfile.csv secondfile.csv > combinedfile.csv
 # You can add an additional '-g' flag on csvstack to add a 'grouping column' (e.g. it'll say firstfile, secondfile)

 # Execute a SQL query directly on a CSV file
 $csvsql --query "SELECT * FROM mydata WHERE AGE > 30" mydata.csv > query_results.csv

 # Sort data using csvsort
 $csvcut -c StartDate,EndDate myfile.csv | csvsort -c StartDate | csvlook | head  # use -r to reverse order (i.e. desc) for csvsort

 # Execute a SQL query by referencing two CSV files and doing a SQL Join
 $csvsql --query "select m.usda_id, avg(i.sepal_length) as mean_sepal_length from iris as i join irismeta as m on (i.species = m.species) group by m.species" examples/iris.csv examples/irismeta.csv

 # Turn data (with a unique id col) to be used as a JSON lookup table (or GeoJSON if coordinates available)
 #[myfile.csv]
 #slug,place,latitude,longitude
 #dcl,Downtown Coffee Lounge,32.35066,-95.30181
 #tyler-museum,Tyler Museum of Art,32.33396,-95.28174
 $csvjson --key slug --indent 4 myfile.csv  # Convert from CSV to JSON lookup
 $csvjson --lat latitude --lon longitude --key slug --crs EPSG:4269 --indent 4 myfile.csv > myfile.json  # Convert from CSV to GeoJSON

 # Generate a create table statement for your csv data
 $csvsql -i sqlite myfile.csv  # Can specify type of db with '-i' flag, other db's include mysql, mssql

 # Automatically create a SQL table and import a CSV into the database (postgresql)
 $createdb dbname
 $csvsql --db postgresql://username:password@localhost/dbname --insert mydata.csv
 $psql -q dbname -c "\d mydata"  # shows table, col names, data types
 $psql -q dbname -c "SELECT * FROM mydata"

 $ Extract a table from a SQL database into a CSV
 sql2csv --db postgresql://username:password@localhost/dbname --query "select * from mydata" > extract.csv
 sql2csv --db mssql://username:password@servername/dbname --query "select top 100 * from [dbname].[dbo].[tablename]" > extract.csv
	# From https://csvkit.readthedocs.org

	# Shows you stats about the file (e.g. col names, type, nulls, min, max, median, std deviation, unique values, most frequent)
	$csvstat myfile.csv

	# Peak at excel file to display in terminal
	$in2csv myfile.xlsx
	$in2csv myfile.xlsx > myfile.csv # can write xlsx file to csv by chaining operations

	# Look at data, formats nicely with borders
	$csvlook myfile.csv

	# See column names
	$csvcut -n myfile.csv

	# Cut out specific columns
	$csvcut -c 2,5,6 myfile.csv # Cut out columns to just 2,5,6
	$csvcut -c col1,col2,col3 myfile.csv # can also specify by name (make sure not to leave spaces)

	# Chain commands to pass data along using the pipeline symbol \|
	$csvcut -c RespondentID,CollectorID,StartDate,EndDate Sheet_1.csv \| csvlook \| head

	# Find cells matching a regular expression (e.g. pattern "ddd-123-dddd")
	$csvgrep -c phone_number -r "\d{3}-123-\d{4}" mydata.csv > matchrx.csv

	# Merge csv files together so you can do aggregate analysis (assuming they have the same headers)
	$csvstack firstfile.csv secondfile.csv > combinedfile.csv
	# You can add an additional '-g' flag on csvstack to add a 'grouping column' (e.g. it'll say firstfile, secondfile)

	# Execute a SQL query directly on a CSV file
	$csvsql --query "SELECT * FROM mydata WHERE AGE > 30" mydata.csv > query_results.csv

	# Sort data using csvsort
	$csvcut -c StartDate,EndDate myfile.csv \| csvsort -c StartDate \| csvlook \| head # use -r to reverse order (i.e. desc) for csvsort

	# Execute a SQL query by referencing two CSV files and doing a SQL Join
	$csvsql --query "select m.usda_id, avg(i.sepal_length) as mean_sepal_length from iris as i join irismeta as m on (i.species = m.species) group by m.species" examples/iris.csv examples/irismeta.csv

	# Turn data (with a unique id col) to be used as a JSON lookup table (or GeoJSON if coordinates available)
	#[myfile.csv]
	#slug,place,latitude,longitude
	#dcl,Downtown Coffee Lounge,32.35066,-95.30181
	#tyler-museum,Tyler Museum of Art,32.33396,-95.28174
	$csvjson --key slug --indent 4 myfile.csv # Convert from CSV to JSON lookup
	$csvjson --lat latitude --lon longitude --key slug --crs EPSG:4269 --indent 4 myfile.csv > myfile.json # Convert from CSV to GeoJSON

	# Generate a create table statement for your csv data
	$csvsql -i sqlite myfile.csv # Can specify type of db with '-i' flag, other db's include mysql, mssql

	# Automatically create a SQL table and import a CSV into the database (postgresql)
	$createdb dbname
	$csvsql --db postgresql://username:password@localhost/dbname --insert mydata.csv
	$psql -q dbname -c "\d mydata" # shows table, col names, data types
	$psql -q dbname -c "SELECT * FROM mydata"

	$ Extract a table from a SQL database into a CSV
	sql2csv --db postgresql://username:password@localhost/dbname --query "select * from mydata" > extract.csv
	sql2csv --db mssql://username:password@servername/dbname --query "select top 100 * from [dbname].[dbo].[tablename]" > extract.csv