Created
September 2, 2010 17:38
-
-
Save bycoffe/562606 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class FixedWidthParser(object):
    """A parser for fixed-width data files.

    Construct it with a list of field names and column ranges, then pass a
    data file to ``parse`` to get back a dictionary for each row.

    Useful for converting a fixed-width file to a CSV.
    See tests.py for a usage example.
    """

    def __init__(self, fields):
        """Remember the field layout used to slice every line.

        fields: a list of tuples in the form (fieldname, (startchar, endchar)).
            Both positions are 1-based and inclusive, so (1, 9) covers the
            first nine characters of a line.
        """
        self.fields = fields

    def parse(self, fh):
        """Yield one dictionary per line read from ``fh``.

        fh: a file-like object (anything that yields lines when iterated).

        Each dictionary maps every field name to the slice of the line
        covered by that field's column range. Padding inside a field is
        returned untouched. When a line is shorter than a field's range the
        value is truncated, or empty if the line ends before the field
        starts.
        """
        for line in fh:
            # Remove the line terminator before slicing; otherwise a field
            # whose range reaches past the end of a short line would carry
            # the "\n" / "\r\n" in its value. Pick the terminator type that
            # matches the line so byte-oriented files keep working.
            terminators = b'\r\n' if isinstance(line, bytes) else u'\r\n'
            record = line.rstrip(terminators)
            # Columns are 1-based and inclusive, hence ``start - 1`` for the
            # slice start while ``end`` is used as-is.
            yield dict(
                (fieldname, record[start - 1:end])
                for fieldname, (start, end) in self.fields
            )
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import unittest | |
from fixedwidthparser import FixedWidthParser | |
class TestFixedWidthParser(unittest.TestCase):
    """Check FixedWidthParser against a handful of sample fixed-width rows."""

    def test_parser(self):
        """Parse four sample records and verify the date and city columns."""
        try:
            from cStringIO import StringIO  # Python 2, C implementation
        except ImportError:
            try:
                from StringIO import StringIO  # Python 2, pure Python
            except ImportError:
                from io import StringIO  # Python 3

        fields = [
            ('filer_id', (1, 9)),
            ('amendment', (10, 10)),
            ('report_type', (11, 13)),
            ('primary_general', (14, 14)),
            ('microfilm', (15, 25)),
            ('transaction_type', (26, 28)),
            ('contributor_name', (29, 62)),
            ('city', (63, 80)),
            ('state', (81, 82)),
            ('zipcode', (83, 87)),
            ('occupation', (88, 122)),
            ('month', (123, 124)),
            ('day', (125, 126)),
            ('century', (127, 128)),
            ('year', (129, 130)),
            ('amount', (131, 137)),
            ('other_id', (138, 146)),
            ('fec_record', (147, 153)),
        ]

        # One tuple per record, values listed in the same order as ``fields``.
        # The rows are padded to the declared column widths below instead of
        # relying on exact whitespace inside one long string literal, which
        # is easy to corrupt when the file is copied or reformatted.
        records = [
            ('C00000042', 'N', 'YE', 'P', '28930176254', '24K',
             'Congress for Judy Biggert', 'Clarendon Hills', 'IL', '60514',
             '', '10', '22', '20', '07', '0002000', 'C00330241', '1367020'),
            ('C00000042', 'N', 'YE', 'P', '28930176254', '24K',
             'Friends of John Boehner', 'Hamilton', 'OH', '45011',
             '', '10', '22', '20', '07', '0005000', 'C00237198', '1367021'),
            ('C00000042', 'N', 'YE', 'P', '28930176254', '24K',
             'Capito for Congress', 'Charleston', 'WV', '25314',
             '', '10', '22', '20', '07', '0001000', 'C00347849', '1367022'),
            ('C00000042', 'N', 'YE', 'P', '28930176255', '24K',
             'Congressman for John Carter', 'Round Rock', 'TX', '78664',
             '', '10', '22', '20', '07', '0001000', 'C00371203', '1367023'),
        ]
        data = '\n'.join(
            ''.join(
                value.ljust(end - start + 1)
                for value, (_, (start, end)) in zip(record, fields)
            )
            for record in records
        )

        fh = StringIO(data)
        parser = FixedWidthParser(fields)
        rows = list(parser.parse(fh))

        # Guard against the per-row checks passing vacuously on no output.
        self.assertEqual(len(rows), len(records))
        expected_cities = ['Clarendon Hills', 'Hamilton', 'Charleston', 'Round Rock']
        for row in rows:
            self.assertEqual(row['century'], '20')
            self.assertEqual(row['year'], '07')
            self.assertTrue(row['city'].strip() in expected_cities)
# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment