Skip to content

Instantly share code, notes, and snippets.

@ian-weisser
Created June 13, 2014 21:46
Show Gist options
  • Save ian-weisser/10871a4b80554e19eb6e to your computer and use it in GitHub Desktop.
Save ian-weisser/10871a4b80554e19eb6e to your computer and use it in GitHub Desktop.
Load stop_times.txt into a Python dict
def _decode_lines(binary_lines):
    """Yield each bytes line of *binary_lines* decoded as UTF-8 text.

    Only the first line is decoded with ``utf-8-sig`` so that a byte-order
    mark at the start of the file does not end up inside the first column
    name of the header.
    """
    codec = 'utf-8-sig'
    for raw_line in binary_lines:
        yield raw_line.decode(codec)
        codec = 'utf-8'


def map_stop_times_to_dict(gtfs_file, list_of_stops):
    """
    Load the interesting rows of stop_times.txt into a dict.

    The stop_times.txt table can be *huge*, and can eat all your RAM
    as 100MB of string data gets split into 1GB of dict data.
    Instead, stream the table row by row and keep only the content
    we want. Reducing the huge stop_times file by ~95% also has the
    side effect of cutting the script runtime in half.

    The content we want is:
    1) The row of each stop_time that serves a stop we want
    2) The final stop of each trip that serves a stop we want

    Parameters:
        gtfs_file: object whose ``open('stop_times.txt', mode='r')``
            returns a context manager yielding UTF-8 encoded bytes
            lines (presumably a ``zipfile.ZipFile`` -- confirm against
            the caller).
        list_of_stops: iterable of stop_id strings to keep. The values
            must be hashable.

    Returns:
        dict mapping the 1-based data-row number (header and short
        non-data lines are not counted) to a dict of
        ``{column_name: value}`` for that row.

    Raises:
        KeyError: if a data row is read and the header has no
            ``trip_id`` or ``stop_id`` column.

    Rows that belong to one trip are assumed to be contiguous in the
    file; the "final stop" of a trip is the last row seen before the
    trip_id changes (or before the file ends).
    """
    import csv  # local import: this block is the only user of csv

    # Build the lookup set once; membership is then O(1) per row.
    wanted_stops = set(list_of_stops)

    table_data = {}
    counter = 0
    current_trip = None    # None never equals a real trip_id string
    keep = False           # True once the current trip serves a wanted stop
    last_line_dict = None  # most recent data row, candidate "final stop"

    with gtfs_file.open('stop_times.txt', mode='r') as infile:
        # csv handles quoted fields (including embedded commas) that a
        # plain split(',') would break apart.
        reader = csv.reader(_decode_lines(infile), skipinitialspace=True)

        # First line is the header.
        # Parse the header to determine the column names
        columns = {}
        for index, field_name in enumerate(next(reader, [])):
            # setdefault: a duplicated column name keeps its first index
            columns.setdefault(field_name.strip(), index)

        # Remaining lines of the file, one row at a time
        for fields in reader:
            if len(fields) < len(columns):  # Non-data lines
                continue

            # Create the dict of each line's data
            counter += 1
            line_dict = {name: fields[index].strip()
                         for name, index in columns.items()}

            # If trip_id changes, write the final line of the
            # old trip. Captures the max stop_sequence number
            if line_dict['trip_id'] != current_trip:
                if keep:
                    table_data[counter - 1] = last_line_dict
                current_trip = line_dict['trip_id']
                keep = False

            # Preserve stop_time data if it's a stop we care about.
            if line_dict['stop_id'] in wanted_stops:
                table_data[counter] = line_dict
                keep = True

            # Save the line_dict in case it turns out to be the
            # final stop of the trip
            last_line_dict = line_dict

    # The last trip in the file has no following trip_id change to
    # trigger the write above, so flush its final stop here.
    if keep:
        table_data[counter] = last_line_dict

    print(len(table_data))
    return table_data
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment