Last active
August 29, 2015 14:13
-
-
Save WilliamQLiu/bc9e1076fa4b4680a42a to your computer and use it in GitHub Desktop.
Useful csvkit commands
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# From https://csvkit.readthedocs.org | |
# Shows you stats about the file (e.g. col names, type, nulls, min, max, median, std deviation, unique values, most frequent) | |
$csvstat myfile.csv | |
# Peak at excel file to display in terminal | |
$in2csv myfile.xlsx | |
$in2csv myfile.xlsx > myfile.csv # can write xlsx file to csv by chaining operations | |
# Look at data, formats nicely with borders | |
$csvlook myfile.csv | |
# See column names | |
$csvcut -n myfile.csv | |
# Cut out specific columns | |
$csvcut -c 2,5,6 myfile.csv # Cut out columns to just 2,5,6 | |
$csvcut -c col1,col2,col3 myfile.csv # can also specify by name (make sure not to leave spaces) | |
# Chain commands to pass data along using the pipeline symbol | | |
$csvcut -c RespondentID,CollectorID,StartDate,EndDate Sheet_1.csv | csvlook | head | |
# Find cells matching a regular expression (e.g. pattern "ddd-123-dddd") | |
$csvgrep -c phone_number -r "\d{3}-123-\d{4}" mydata.csv > matchrx.csv | |
# Merge csv files together so you can do aggregate analysis (assuming they have the same headers) | |
$csvstack firstfile.csv secondfile.csv > combinedfile.csv | |
# You can add an additional '-g' flag on csvstack to add a 'grouping column' (e.g. it'll say firstfile, secondfile) | |
# Execute a SQL query directly on a CSV file | |
$csvsql --query "SELECT * FROM mydata WHERE AGE > 30" mydata.csv > query_results.csv | |
# Sort data using csvsort | |
$csvcut -c StartDate,EndDate myfile.csv | csvsort -c StartDate | csvlook | head # use -r to reverse order (i.e. desc) for csvsort | |
# Execute a SQL query by referencing two CSV files and doing a SQL Join | |
$csvsql --query "select m.usda_id, avg(i.sepal_length) as mean_sepal_length from iris as i join irismeta as m on (i.species = m.species) group by m.species" examples/iris.csv examples/irismeta.csv | |
# Turn data (with a unique id col) to be used as a JSON lookup table (or GeoJSON if coordinates available) | |
#[myfile.csv] | |
#slug,place,latitude,longitude | |
#dcl,Downtown Coffee Lounge,32.35066,-95.30181 | |
#tyler-museum,Tyler Museum of Art,32.33396,-95.28174 | |
$csvjson --key slug --indent 4 myfile.csv # Convert from CSV to JSON lookup | |
$csvjson --lat latitude --lon longitude --key slug --crs EPSG:4269 --indent 4 myfile.csv > myfile.json # Convert from CSV to GeoJSON | |
# Generate a create table statement for your csv data | |
$csvsql -i sqlite myfile.csv # Can specify type of db with '-i' flag, other db's include mysql, mssql | |
# Automatically create a SQL table and import a CSV into the database (postgresql) | |
$createdb dbname | |
$csvsql --db postgresql://username:password@localhost/dbname --insert mydata.csv | |
$psql -q dbname -c "\d mydata" # shows table, col names, data types | |
$psql -q dbname -c "SELECT * FROM mydata" | |
$ Extract a table from a SQL database into a CSV | |
sql2csv --db postgresql://username:password@localhost/dbname --query "select * from mydata" > extract.csv | |
sql2csv --db mssql://username:password@servername/dbname --query "select top 100 * from [dbname].[dbo].[tablename]" > extract.csv |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment