Last active
July 28, 2016 13:32
-
-
Save ian-weisser/447b34bb9c399db365f5 to your computer and use it in GitHub Desktop.
Map GTFS table data into a python dict
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def map_gtfs_table_to_dict(gtfs_file, table_name):
    """
    Map one table of a GTFS feed into a dict of per-row dicts.

    Each data row of the table becomes a sub-dict mapping column name to
    the (whitespace-stripped) string value, stored in a container dict
    under a unique key.

    Many GTFS tables include a unique value (Example: trip_id in the
    trips.txt table) that this function automatically uses as the dict
    key; that column is then removed from the row's sub-dict. If a table
    has no unique key (Example: calendar_dates.txt), is not a table this
    function knows about, or does not contain its usual key column, the
    key is an incrementing integer row counter starting at 0.

    WARNING: This function may run out of memory on a _large_ table.
    Example: stop_times.txt from Chicago is routinely 30MB compressed,
    182MB uncompressed, and almost 1GB exploded into a dict in RAM by
    this function.

    For example, one classic way of mapping a GTFS table to a set of dicts:

        gtfs_file = zipfile.ZipFile('foo.gtfs', mode='r')
        with gtfs_file.open('routes.txt', mode='r') as infile:
            lines = infile.read().decode('utf-8').splitlines()
        gtfs_file.close()
        columns = len(lines[0].split(','))

        routes = {}
        for line in lines[1:]:
            if len(line.split(',')) < columns:
                continue
            route_id         = line.split(',')[0].strip()
            route_short_name = line.split(',')[1].strip()
            route_long_name  = line.split(',')[2]
            routes[route_id] = {'route_short_name': route_short_name,
                                'route_long_name': route_long_name}

    Is much simpler using this function:

        gtfs_file = zipfile.ZipFile('foo.gtfs', mode='r')
        routes_table = map_gtfs_table_to_dict(gtfs_file, 'routes.txt')
        gtfs_file.close()
        do_something_with(routes_table)

    Inputs:
        gtfs_file should be an open zipfile object, not a file path or
        raw data:
            gtfs_file = zipfile.ZipFile(gtfs_path, mode='r')
        table_name should match one within the zipfile. It should match
        one of:
            valid_names = gtfs_file.namelist()

    Output is a bunch of dicts (one dict per data line) nested within
    a container dict. Rows with fewer fields than the header (blank or
    truncated lines) are skipped. An empty table yields an empty dict.

    Raises KeyError (from zipfile) if table_name is not in the archive.

    Example GTFS data:
        service_id,monday,tuesday,wednesday,thursday,friday,saturday,
        sunday,start_date,end_date
        43301,1,1,1,1,1,0,0,20140515,20140518
        43302,1,1,1,1,0,0,0,20140515,20140518
        43303,0,1,1,1,1,0,0,20140515,20140518

    Example usage:
        import zipfile
        gtfs_path  = '20140515.cta.gtfs'
        gtfs_file  = zipfile.ZipFile(gtfs_path, mode='r')
        table_name = 'calendar.txt'
        map_gtfs_table_to_dict(gtfs_file, table_name)

        {'43301': {'monday':'1', 'tuesday':'1', ... },
         '43302': {'monday':'1', 'tuesday':'1', ... },
         '43303': {'monday':'0', 'tuesday':'1', ... }, }
    """
    # Imported here so the function stays self-contained when pasted
    # into another module.
    import csv
    import io

    # The key column is based on the table name. None means "no unique
    # column; generate a row-counter key".
    # NOTE(review): shape_id repeats once per shape point in shapes.txt,
    # so later rows overwrite earlier ones for that table. Left as-is to
    # keep the returned structure stable for existing callers.
    keys = {
        'agency.txt': 'agency_id',
        'calendar.txt': 'service_id',
        'calendar_dates.txt': None,
        'fare_attributes.txt': 'fare_id',
        'fare_rules.txt': None,
        'feed_info.txt': None,
        'frequencies.txt': None,
        'routes.txt': 'route_id',
        'shapes.txt': 'shape_id',
        'stops.txt': 'stop_id',
        'stop_times.txt': None,
        'transfers.txt': None,
        'trips.txt': 'trip_id',
    }
    # Tables missing from the map fall back to generated keys instead of
    # raising KeyError.
    key_column = keys.get(table_name)

    table_data = {}

    # 'utf-8-sig' drops a leading byte-order mark, which would otherwise
    # be glued to the first header name. newline='' lets the csv module
    # deal with \r\n, \n and newlines embedded in quoted fields.
    with io.TextIOWrapper(gtfs_file.open(table_name, mode='r'),
                          encoding='utf-8-sig', newline='') as infile:
        # csv handles quoted fields that contain commas; a plain
        # split(',') would shift every following column.
        reader = csv.reader(infile, skipinitialspace=True)

        header = next(reader, None)
        if header is None:  # Empty table file
            return table_data
        header = [field_name.strip() for field_name in header]

        # Map each column heading to its index. If a heading is
        # repeated, the first occurrence wins.
        columns = {}
        for index, field_name in enumerate(header):
            columns.setdefault(field_name, index)

        # Some feeds omit an optional id column (e.g. agency_id when
        # there is a single agency); generate keys for those as well.
        if key_column not in columns:
            key_column = None

        counter = 0
        for row in reader:
            if len(row) < len(header):  # Blank or truncated lines
                continue

            # Create the dict of this line's data
            line_dict = {field_name: row[index].strip()
                         for field_name, index in columns.items()}

            if key_column is not None:  # Has key
                key_value = line_dict.pop(key_column)
            else:  # Generate key
                key_value = counter
                counter += 1
            table_data[key_value] = line_dict

    return table_data
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This is a very nice and genuinely helpful piece of code. It helped me with parsing GTFS. Have you considered using the "csv" module to parse GTFS feeds?