Created
May 18, 2013 06:30
-
-
Save danabauer/5603478 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
!/usr/bin/env python | |
csv2pgsql.py | |
by Erik Osheim and Dana Bauer | |
June 2011 | |
""" | |
import csv | |
import os | |
import psycopg2 | |
import sys | |
# print an error message and exits | |
def explode(error): | |
print 'ERROR: %s' % error | |
sys.exit(1) | |
# this is the class that does our work | |
class Loader(object): | |
def __init__(self, dbname, tablename, columntypes): | |
self.dbname = dbname | |
self.conn = psycopg2.connect("dbname=" + dbname) | |
self.tablename = tablename | |
self.columntypes = columntypes | |
# this function imports a single CSV file into newly-created DB table | |
def load(self, csvpath): | |
# make sure we can read the CSV file | |
if not os.path.exists(csvpath): | |
explode("csv file %r could not be found" % csvpath) | |
# get a database cursor to do actual queries | |
cur = self.conn.cursor() | |
# open the csv file and being it | |
f = open('39_redist_data.csv', 'r') | |
reader = csv.reader(f) | |
# we expect the first line to be the column names | |
names = reader.next() | |
# read the second line (the first data row) to detect types | |
row = reader.next() | |
self.types = self.getrowtypes(names, row) | |
# create table | |
self.createtable(cur) | |
# load the first data row | |
self.loadrow(cur, row) | |
i = 2 | |
# load all the other data rows | |
for row in reader: | |
self.loadrow(cur, row) | |
i += 1 | |
if i % 100 == 0: | |
print i | |
# close the cursor and save our work in the DB | |
cur.close() | |
self.conn.commit() | |
# given a list of column names and a row of SQL, figures out what SQL | |
# data types to use for each and returns a list combining names and types. | |
# e.g. [("column1", "int"), ("column2", "text"), ...] | |
def getrowtypes(self, names, row): | |
types = [] | |
for name, val in zip(names, row): | |
typ = self.getvaltype(name, val) | |
types.append((name, typ)) | |
return types | |
# given a name and a value (from the row), figure out its type. | |
def getvaltype(self, name, val): | |
if name in self.columntypes: | |
return self.columntypes[name] | |
try: | |
int(val) | |
return 'int' | |
except: | |
pass | |
try: | |
float(val) | |
return 'float' | |
except: | |
pass | |
return 'text' | |
# build the table to store our CSV data in the database | |
def createtable(self, cur): | |
decls = [col.lower() + ' ' + val for col, val in self.types] | |
sql = 'create table ' | |
sql += self.tablename | |
sql += ' (' | |
sql += ', '.join(decls) | |
sql += ');' | |
# uncomment to see the create table SQL | |
#print sql | |
cur.execute(sql) | |
# build our insert (we'll use this for every row) | |
cols = [col for col, _ in self.types] | |
vals = ['%s' for t in self.types] | |
self.insert = 'insert into ' | |
self.insert += self.tablename | |
self.insert += ' (' | |
self.insert += ', '.join(cols) | |
self.insert += ') values (' | |
self.insert += ', '.join(vals) | |
self.insert += ')' | |
# uncomment to see the insert SQL | |
#print self.insert | |
def loadrow(self, cur, row): | |
for i in range(0, len(self.types)): | |
if self.types[i][1] == 'text': | |
continue | |
elif row[i] == '': | |
row[i] = None | |
elif not row[i].isdigit(): | |
explode("column %r not a number" % self.types[i][0]) | |
cur.execute(self.insert, row) | |
if __name__ == "__main__": | |
# change this to choose a different CSV file to use | |
csvpath = '39_redist_data.csv' | |
# change this to choose a different database to use | |
dbname = 'mdn' | |
# change this to choose a different table name to use | |
tablename = 'redist_data' | |
# change this to override the autodetected types. | |
# you might want this if a field only sometimes looks like a number. | |
columntypes = { | |
'GEOID10': 'text', | |
'SLDU': 'text', | |
'SLDL': 'text', | |
} | |
# instantiate the loader and load the data | |
l = Loader(dbname, tablename, columntypes) | |
l.load(csvpath) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment