Skip to content

Instantly share code, notes, and snippets.

@dln
Last active August 25, 2022 10:07
Show Gist options
  • Save dln/67d8b2215479ddff6e0942a170f1eddc to your computer and use it in GitHub Desktop.
Save dln/67d8b2215479ddff6e0942a170f1eddc to your computer and use it in GitHub Desktop.
csv processing with duckdb and arrow
#!/usr/bin/env python
import duckdb
import pyarrow as pa
import pyarrow.csv as csv
# curl -L https://www.stats.govt.nz/assets/Uploads/Annual-enterprise-survey/Annual-enterprise-survey-2021-financial-year-provisional/Download-data/annual-enterprise-survey-2021-financial-year-provisional-csv.csv | gzip > /tmp/test-header.csv.gz
data = csv.read_csv('/tmp/test-header.csv.gz')
con = duckdb.connect()
res = con.execute('SELECT Industry_name_NZSIOC,Year, count(*) FROM data GROUP BY Industry_name_NZSIOC, Year')
print(res.arrow())
#!/usr/bin/env python
import duckdb
import pyarrow as pa
import pyarrow.csv as csv
# curl -L https://www.stats.govt.nz/assets/Uploads/Annual-enterprise-survey/Annual-enterprise-survey-2021-financial-year-provisional/Download-data/annual-enterprise-survey-2021-financial-year-provisional-csv.csv | tail -n+2 | gzip > /tmp/test-noheader.csv.gz
data = csv.read_csv('/tmp/test-noheader.csv.gz',
read_options=csv.ReadOptions(
column_names=[
'Year',
'Industry_aggregation_NZSIOC',
'Industry_code_NZSIOC',
'Industry_name_NZSIOC',
'Units',
'Variable_code',
'Variable_name',
'Variable_category',
'Value',
'Industry_code_ANZSIC06',
]
),
convert_options=csv.ConvertOptions(
column_types={
'Year': pa.string(),
'Industry_aggregation_NZSIOC': pa.string(),
'Industry_code_NZSIOC': pa.string(),
'Industry_name_NZSIOC': pa.string(),
'Units': pa.string(),
'Variable_code': pa.string(),
'Variable_name': pa.string(),
'Variable_category': pa.string(),
'Value': pa.string(),
'Industry_code_ANZSIC06': pa.string(),
}
))
con = duckdb.connect()
res = con.execute('SELECT Industry_name_NZSIOC,Year, count(*) FROM data GROUP BY Industry_name_NZSIOC, Year')
print(res.arrow())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment