ian-weisser · June 13, 2014 21:46
diff --git a/load_stop_times b/load_stop_times
 def map_stop_times_to_dict(gtfs_file, list_of_stops):
     """
     Load a filtered subset of stop_times.txt into a dict.

     The stop_times.txt table can be *huge*, and can eat all your RAM
     as 100MB of string data gets split into 1GB of dict data.

     Instead, stream the table one row at a time and keep only the
     content we want.

     Reducing the huge stop_times file by ~95% also has the side effect
     of cutting the script runtime in half.

     The content we want is:
     1) Each row whose stop_id is in list_of_stops
     2) The final row of each trip that has at least one such row
        (captures the max stop_sequence number of that trip)

     A trip boundary is detected when trip_id changes from one row to
     the next, so this assumes the rows of a trip are contiguous in
     the file.

     Args:
         gtfs_file: Object whose open('stop_times.txt', mode='r')
             returns a context manager over a binary file that yields
             UTF-8 encoded lines (presumably a zipfile.ZipFile).
         list_of_stops: Collection of stop_id strings to keep.

     Returns:
         Dict mapping the 1-based data-row number to a dict of
         {column name: field value} for every row that was kept.

     Raises:
         KeyError: If a data row is read and the header has no
             'trip_id' or 'stop_id' column.

     Side effects:
         Prints the number of rows kept.
     """
     import csv

     # Hash lookup per row instead of a linear scan; fall back to the
     # caller's container if its elements are not hashable.
     try:
         wanted_stops = frozenset(list_of_stops)
     except TypeError:
         wanted_stops = list_of_stops

     table_data = {}
     counter = 0
     trip_id = None
     keep = False
     last_line_dict = {}

     with gtfs_file.open('stop_times.txt', mode='r') as infile:

         # First line is the header.
         # 'utf-8-sig' drops a leading byte-order mark, which would
         # otherwise stay glued to the first column name.
         header_line = infile.readline().decode('utf-8-sig')
         header = next(csv.reader([header_line]), [])

         # Column name -> field index (first occurrence wins).
         columns = {}
         for index, field_name in enumerate(header):
             columns.setdefault(field_name.strip(), index)

         # Remaining lines, streamed through the csv parser so quoted
         # fields containing commas stay in one piece.
         rows = csv.reader(raw.decode('utf-8') for raw in infile)

         for row in rows:

             if len(row) < len(header):  # Non-data lines
                 continue

             # Create the dict of each line's data
             counter += 1
             line_dict = {
                 name: row[index].strip('" ')
                 for name, index in columns.items()
             }

             # If trip_id changes, write the final line of the
             #   old trip. Captures the max stop_sequence number
             if line_dict['trip_id'] != trip_id:
                 if keep:
                     table_data[counter - 1] = last_line_dict
                 trip_id = line_dict['trip_id']
                 keep = False

             # Preserve stop_time data if it's a stop we care about.
             if line_dict['stop_id'] in wanted_stops:
                 table_data[counter] = line_dict
                 keep = True

             # Save the line_dict in case it turns out to be the
             #   final stop of the trip
             last_line_dict = line_dict

     # The last trip in the file has no following trip to trigger the
     # flush above, so write its final stop here.
     if keep:
         table_data[counter] = last_line_dict

     print(len(table_data))
     return table_data
	def map_stop_times_to_dict(gtfs_file, list_of_stops):
	    """
	    Load a filtered subset of stop_times.txt into a dict.

	    The stop_times.txt table can be huge, and can eat all your RAM
	    as 100MB of string data gets split into 1GB of dict data.

	    Instead, stream the table one row at a time and keep only the
	    content we want.

	    Reducing the huge stop_times file by ~95% also has the side effect
	    of cutting the script runtime in half.

	    The content we want is:
	    1) Each row whose stop_id is in list_of_stops
	    2) The final row of each trip that has at least one such row
	       (captures the max stop_sequence number of that trip)

	    A trip boundary is detected when trip_id changes from one row to
	    the next, so this assumes the rows of a trip are contiguous in
	    the file.

	    Args:
	        gtfs_file: Object whose open('stop_times.txt', mode='r')
	            returns a context manager over a binary file that yields
	            UTF-8 encoded lines (presumably a zipfile.ZipFile).
	        list_of_stops: Collection of stop_id strings to keep.

	    Returns:
	        Dict mapping the 1-based data-row number to a dict of
	        {column name: field value} for every row that was kept.

	    Raises:
	        KeyError: If a data row is read and the header has no
	            'trip_id' or 'stop_id' column.

	    Side effects:
	        Prints the number of rows kept.
	    """
	    import csv

	    # Hash lookup per row instead of a linear scan; fall back to the
	    # caller's container if its elements are not hashable.
	    try:
	        wanted_stops = frozenset(list_of_stops)
	    except TypeError:
	        wanted_stops = list_of_stops

	    table_data = {}
	    counter = 0
	    trip_id = None
	    keep = False
	    last_line_dict = {}

	    with gtfs_file.open('stop_times.txt', mode='r') as infile:

	        # First line is the header.
	        # 'utf-8-sig' drops a leading byte-order mark, which would
	        # otherwise stay glued to the first column name.
	        header_line = infile.readline().decode('utf-8-sig')
	        header = next(csv.reader([header_line]), [])

	        # Column name -> field index (first occurrence wins).
	        columns = {}
	        for index, field_name in enumerate(header):
	            columns.setdefault(field_name.strip(), index)

	        # Remaining lines, streamed through the csv parser so quoted
	        # fields containing commas stay in one piece.
	        rows = csv.reader(raw.decode('utf-8') for raw in infile)

	        for row in rows:

	            if len(row) < len(header):  # Non-data lines
	                continue

	            # Create the dict of each line's data
	            counter += 1
	            line_dict = {
	                name: row[index].strip('" ')
	                for name, index in columns.items()
	            }

	            # If trip_id changes, write the final line of the
	            # old trip. Captures the max stop_sequence number
	            if line_dict['trip_id'] != trip_id:
	                if keep:
	                    table_data[counter - 1] = last_line_dict
	                trip_id = line_dict['trip_id']
	                keep = False

	            # Preserve stop_time data if it's a stop we care about.
	            if line_dict['stop_id'] in wanted_stops:
	                table_data[counter] = line_dict
	                keep = True

	            # Save the line_dict in case it turns out to be the
	            # final stop of the trip
	            last_line_dict = line_dict

	    # The last trip in the file has no following trip to trigger the
	    # flush above, so write its final stop here.
	    if keep:
	        table_data[counter] = last_line_dict

	    print(len(table_data))
	    return table_data
No results found