Skip to content

Instantly share code, notes, and snippets.

@danabauer
Created May 18, 2013 06:30
Show Gist options
  • Save danabauer/5603478 to your computer and use it in GitHub Desktop.
Save danabauer/5603478 to your computer and use it in GitHub Desktop.
"""
!/usr/bin/env python
csv2pgsql.py
by Erik Osheim and Dana Bauer
June 2011
"""
import csv
import os
import psycopg2
import sys
# Print an error message to stdout and exit the process with status 1.
# (Modernized: the original used the Python 2 print statement, which is a
# syntax error on Python 3.)
def explode(error):
    print('ERROR: %s' % error)
    sys.exit(1)
# Loads a single CSV file into a newly-created PostgreSQL table, autodetecting
# the SQL type of each column from the first data row.
class Loader(object):
    def __init__(self, dbname, tablename, columntypes):
        """Connect to the database and remember the load configuration.

        dbname      -- PostgreSQL database name to connect to.
        tablename   -- name of the table to create and fill.
        columntypes -- dict of column name -> SQL type, overriding
                       autodetection (useful when a field only sometimes
                       looks like a number).
        """
        self.dbname = dbname
        self.conn = psycopg2.connect("dbname=" + dbname)
        self.tablename = tablename
        self.columntypes = columntypes

    def load(self, csvpath):
        """Import the CSV file at csvpath into a newly-created DB table.

        The first CSV line supplies the column names; types are detected
        from the second line (the first data row), subject to the
        self.columntypes overrides. Commits once at the end.
        """
        # make sure we can read the CSV file
        if not os.path.exists(csvpath):
            explode("csv file %r could not be found" % csvpath)
        # get a database cursor to do actual queries
        cur = self.conn.cursor()
        # BUG FIX: the original opened a hard-coded '39_redist_data.csv'
        # here instead of the csvpath argument, and never closed the file.
        with open(csvpath, 'r') as f:
            reader = csv.reader(f)
            # we expect the first line to be the column names
            names = next(reader)
            # read the second line (the first data row) to detect types
            row = next(reader)
            self.types = self.getrowtypes(names, row)
            # create table and load the first data row
            self.createtable(cur)
            self.loadrow(cur, row)
            # load all the other data rows, printing progress every 100
            i = 2
            for row in reader:
                self.loadrow(cur, row)
                i += 1
                if i % 100 == 0:
                    print(i)
        # close the cursor and save our work in the DB
        cur.close()
        self.conn.commit()

    def getrowtypes(self, names, row):
        """Combine column names with detected SQL types.

        Given a list of column names and one CSV data row, returns e.g.
        [("column1", "int"), ("column2", "text"), ...].
        """
        return [(name, self.getvaltype(name, val))
                for name, val in zip(names, row)]

    def getvaltype(self, name, val):
        """Return the SQL type for one column, given a sample value.

        An explicit entry in self.columntypes wins; otherwise try int,
        then float, and fall back to text.
        """
        if name in self.columntypes:
            return self.columntypes[name]
        # only ValueError means "not this type" -- a bare except here
        # (as in the original) would also hide real bugs
        try:
            int(val)
            return 'int'
        except ValueError:
            pass
        try:
            float(val)
            return 'float'
        except ValueError:
            pass
        return 'text'

    def createtable(self, cur):
        """Create the destination table and prepare the row INSERT SQL."""
        decls = [col.lower() + ' ' + typ for col, typ in self.types]
        sql = 'create table %s (%s);' % (self.tablename, ', '.join(decls))
        # uncomment to see the create table SQL
        #print(sql)
        cur.execute(sql)
        # build our parameterized insert (reused for every row)
        cols = [col for col, _ in self.types]
        placeholders = ', '.join(['%s'] * len(self.types))
        self.insert = 'insert into %s (%s) values (%s)' % (
            self.tablename, ', '.join(cols), placeholders)
        # uncomment to see the insert SQL
        #print(self.insert)

    def loadrow(self, cur, row):
        """Validate one CSV row against self.types and insert it.

        Empty numeric fields become NULL. BUG FIX: the original used
        isdigit(), which wrongly rejected floats ('1.5') and negative
        ints ('-5') that getvaltype had accepted; validate with the
        actual conversion for the detected type instead.
        """
        for i, (name, typ) in enumerate(self.types):
            if typ == 'text':
                continue
            if row[i] == '':
                row[i] = None
                continue
            try:
                if typ == 'int':
                    int(row[i])
                else:
                    float(row[i])
            except ValueError:
                explode("column %r not a number" % name)
        cur.execute(self.insert, row)
if __name__ == "__main__":
# change this to choose a different CSV file to use
csvpath = '39_redist_data.csv'
# change this to choose a different database to use
dbname = 'mdn'
# change this to choose a different table name to use
tablename = 'redist_data'
# change this to override the autodetected types.
# you might want this if a field only sometimes looks like a number.
columntypes = {
'GEOID10': 'text',
'SLDU': 'text',
'SLDL': 'text',
}
# instantiate the loader and load the data
l = Loader(dbname, tablename, columntypes)
l.load(csvpath)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment