Created
June 13, 2014 21:46
-
-
Save ian-weisser/10871a4b80554e19eb6e to your computer and use it in GitHub Desktop.
Load stop_times.txt into a Python dict
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def map_stop_times_to_dict(gtfs_file, list_of_stops):
    """
    Load the interesting rows of stop_times.txt into a dict.

    The stop_times.txt table can be *huge*, and can eat all your RAM
    as 100MB of string data gets split into 1GB of dict data.
    Instead, stream the table row by row and filter it for the content
    we want, so only the kept rows are ever held in memory.

    The content we want is:
    1) Each row whose stop_id is one of the stops we want
    2) The final row (last stop) of each trip that has such a row

    Rows of a trip are expected to be contiguous in the file; a change
    of trip_id between consecutive rows marks the end of a trip.

    Parameters:
        gtfs_file: an object whose open('stop_times.txt', mode='r')
            returns a binary file usable as a context manager and
            iterable by line (for example a zipfile.ZipFile).
        list_of_stops: iterable of stop_id strings to keep.

    Returns:
        dict mapping the 1-based data-row number (header and short
        lines excluded) to a {column_name: value} dict for that row.

    Raises:
        KeyError: if a data row is found but the header lacks a
            'trip_id' or 'stop_id' column.
    """
    # Local import keeps this block independent of file-level imports.
    import csv

    # Build the lookup set once; membership is tested for every row.
    wanted_stops = set(list_of_stops)

    table_data = {}
    counter = 0
    trip_id = None          # None means "no trip seen yet"
    keep = False
    last_line_dict = {}

    with gtfs_file.open('stop_times.txt', mode='r') as infile:
        # Decode lazily, one line at a time, and let the csv module do the
        # splitting so quoted fields containing commas stay in one column.
        rows = csv.reader(
            (raw_line.decode('utf-8') for raw_line in infile),
            skipinitialspace=True)

        # First line is the header.
        # Parse the header to determine the column names. A UTF-8 BOM and
        # surrounding whitespace are removed so 'trip_id' always matches.
        columns = {}
        for index, field_name in enumerate(next(rows, [])):
            # setdefault: on a duplicated name the first column wins.
            columns.setdefault(field_name.lstrip('\ufeff').strip(), index)

        # Remaining lines of the file.
        for fields in rows:
            if len(fields) < len(columns):   # Non-data lines
                continue

            # Create the dict of each line's data
            counter += 1
            line_dict = {name: fields[index].strip()
                         for name, index in columns.items()}

            # If trip_id changes, write the final line of the
            # old trip. Captures the max stop_sequence number
            current_trip = line_dict['trip_id']
            if current_trip != trip_id:
                if keep:
                    table_data[counter - 1] = last_line_dict
                trip_id = current_trip
                keep = False

            # Preserve stop_time data if it's a stop we care about.
            if line_dict['stop_id'] in wanted_stops:
                table_data[counter] = line_dict
                keep = True

            # Save the line_dict in case it turns out to be the
            # final stop of the trip
            last_line_dict = line_dict

    # End of file also ends the last trip: flush its final stop too.
    if keep:
        table_data[counter] = last_line_dict

    print(len(table_data))
    return table_data
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment