-
-
Save nishad/a5a6dfdc46fc7e0b4289 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# Install csvkit with csvpys | |
# (csvpys hasn't been pulled back into the main csvkit repo that you can pip install) | |
git clone https://github.com/cypreess/csvkit.git | |
cd csvkit | |
python setup.py build | |
sudo python setup.py install | |
cd ~ | |
# Depending on how you're using Python, you might need to add the install path to your PATH | |
nano ~/.bash_profile | |
# add /usr/local/share/python or whatever directory the tools are living in | |
# not sure? try sudo find / -name csvcut | |
# Reload your PATH | |
source ~/.bash_profile | |
# Normalize your data, if necessary | |
csvclean data.csv | |
# Slice off any columns you won't be needing, if necessary | |
csvcut -c 2,3,4,5 data_out.csv > cut.csv | |
# Remove any extra header rows that can't handle the python string operations you're about to throw | |
# This example deletes row 2 | |
sed 2d cut.csv > sed.csv | |
# You can also do sed -i to do "in place" replacement, but sed is a dangerous tool | |
# so proceed carefully and don't mangle your data | |
# Do multiple cleanups at once | |
csvpys -s NEW-COL-NAME "c[2].strip()" -s SECOND-NEW-COL-NAME "c[3].strip()" -s THIRD-NEW-COL-NAME "int(c[4].strip())" -s FOURTH-NEW-COL-NAME "c[1].strip()" sed.csv > strip.csv | |
# NB: you can wrap a field in int() to make it an integer but not in strip() to remove whitespace | |
# Strip only works on objects that are already strings. | |
# Remove the old columns that csvpys keeps, and reorder any columns as necessary | |
csvcut -c 8,5,6,7 strip.csv > reorder.csv | |
# Not sure of your column IDs? Try csvcut -n strip.csv | |
# Integrity check: does number of rows match original number of rows? | |
wc -l reorder.csv | |
# should match | |
wc -l sed.csv | |
# Remove rows where a column (col 2 in this case) is empty | |
csvgrep -c 2 -r '^.' reorder.csv > blanks_removed.csv | |
# -r '^.' matches any character at the beginning of the field | |
# possible bug in csvkit: -m '' -i is not a valid way to match nonblanks | |
# Replace the header row with column names of your choosing | |
cat blanks_removed.csv | sed '1 s/.*/renamed_column,renamed_col2,third_renamed_column,col_no_four/' > headers.csv |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment