Last active
December 9, 2020 21:10
-
-
Save robert-b-clarke/5228381 to your computer and use it in GitHub Desktop.
Split a large gtfs file into separate ones for each agency - tested on http://www.datagm.org.uk/package/public-transport-schedules--gtfs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
=head1 NAME | |
gtfs-splitter.pl | |
=head1 DESCRIPTION | |
Hastily assembled and extremely unguaranteed perl script for dividing gtfs files into smaller per agency files | |
=head1 SYNOPSIS | |
from command line | |
> unzip BigGTFS.ZIP | |
> perl gtfs-splitter.pl BigGTFS SmallerGTFS | |
#then zip up the individual gtfs directories | |
=cut | |
#!/usr/bin/perl | |
use Modern::Perl; | |
use Text::CSV_XS; | |
use Data::Dumper; | |
use File::Path qw/mkpath/; | |
use File::Copy; | |
# ---------------------------------------------------------------------------
# Main script: read each GTFS file from the input directory and append every
# row to a per-agency copy of that file under the output directory.
#
# Usage: perl gtfs-splitter.pl <input GTFS dir> <output dir>
# ---------------------------------------------------------------------------

# Not referenced anywhere in the visible script; kept so nothing that may
# rely on it breaks.
my %opts;

my $container_path = $ARGV[0] or die "Need an input path";
my $output_path    = $ARGV[1] or die "Need an output path";

#list of files we're gonna process
# Order matters: guess_agency() fills its lookup tables from routes.txt and
# trips.txt before the later files are resolved against them, and stops.txt
# must come after stop_times.txt.
my @filenames = ('agency.txt', 'routes.txt', 'trips.txt', 'calendar.txt', 'stop_times.txt', 'calendar_dates.txt','stops.txt');

# GTFS only requires one of these two, so a missing one is not fatal.
my %may_be_absent = map { $_ => 1 } ('calendar.txt', 'calendar_dates.txt');

#split GTFS files according to agency
my %route_to_agency   = (); #route id to agency id mapping
my %trip_to_agency    = (); #trip id to agency id mapping
my %service_to_agency = (); #service id to agency id mapping
my %stop_to_agency    = (); #stopid to agency - this one contains arrays

# binary => 1 lets the parser accept non-ASCII bytes (e.g. UTF-8 names);
# without it such rows fail to parse.
my $csv_in = Text::CSV_XS->new({ binary => 1 })
    or die "can't create CSV parser: " . Text::CSV_XS->error_diag();

FILE:
foreach my $filename (@filenames) {
    my $filepath = join('/', ($container_path, $filename));

    if ($may_be_absent{$filename} && !-e $filepath) {
        warn "$filename not present in $container_path, skipping";
        next FILE;
    }

    open my $fh_in, '<', $filepath or die "can't open $filepath: $!";
    say "-----\nProcessing $filename\n----";

    my $header_line = <$fh_in>;
    unless (defined $header_line) {
        warn "$filename is empty, skipping";
        close $fh_in;
        next FILE;
    }
    $csv_in->parse($header_line)
        or die "can't parse header of $filename: " . $csv_in->error_diag();
    my $headers = [$csv_in->fields];

    ROW:
    while (my $row_line = <$fh_in>) {
        # A row that does not parse would otherwise be treated as an empty
        # record, so report it and move on.
        unless ($csv_in->parse($row_line)) {
            warn "skipping unparseable line $. of $filename: " . $csv_in->error_diag();
            next ROW;
        }
        my $row = [$csv_in->fields];

        # A row may belong to several agencies (e.g. a shared stop); it is
        # written once to each agency's copy of the file.
        my @agencies = guess_agency($headers, $row, $filename);
        foreach my $agency (@agencies) {
            my $target_dir = agency_dir($output_path, $agency);
            my $output_fh  = output_fh($target_dir, $filename, $header_line);
            print {$output_fh} $row_line
                or die "can't write to $target_dir/$filename: $!";
            # Close explicitly so buffered write errors are reported.
            close $output_fh
                or die "can't close $target_dir/$filename: $!";
        }
    }
    close $fh_in;
}

#just copy feed_info.txt
# feed_info.txt is optional in GTFS, so only copy it when the feed has one.
my $feed_info_path = join('/', ($container_path, 'feed_info.txt'));
if (-e $feed_info_path) {
    # Reversing the route => agency map leaves one key per distinct agency.
    my %agencies_hash = reverse %route_to_agency;
    foreach my $agency (keys %agencies_hash) {
        my $target_dir       = agency_dir($output_path, $agency);
        my $agency_feed_info = join '/', ($target_dir, 'feed_info.txt');
        copy($feed_info_path, $agency_feed_info)
            or warn "can't copy $feed_info_path to $agency_feed_info: $!";
    }
}
#warn Dumper(\%stop_to_agency);
exit();
# Work out which agency (or agencies) one GTFS row belongs to.
#
# Arguments:
#   $headers  - array ref of column names from the file's header line
#   $row      - array ref of field values for this row, in header order
#   $filename - name of the GTFS file the row came from
#
# Returns a list of agency ids; call it in list context. The list is empty
# when the row cannot be attributed to any agency (a warning is emitted).
#
# Side effects: records what it learns in the file-level lookup hashes
# %route_to_agency, %trip_to_agency, %service_to_agency and %stop_to_agency,
# so files must be fed in dependency order (routes, then trips, then the
# files that refer to trips/services/stops).
#
# Dies with "no agency" when a trips.txt row refers to a route with no
# recorded agency.
sub guess_agency {
    my ($headers, $row, $filename) = @_;

    # Turn the positional row into a record keyed by column name.
    my %record;
    @record{@$headers} = @$row;

    if ($filename eq 'agency.txt') {
        return $record{agency_id};
    }

    if ($filename eq 'routes.txt') {
        $route_to_agency{$record{route_id}} = $record{agency_id};
        return $route_to_agency{$record{route_id}};
    }

    if ($filename eq 'trips.txt') {
        my $agency_id = $route_to_agency{$record{route_id}};
        # Test defined-ness and length rather than truth so an agency id of
        # "0" is accepted.
        die "no agency" unless defined $agency_id && length $agency_id;
        $trip_to_agency{$record{trip_id}} = $agency_id;
        # A service_id can be shared by trips of different agencies, so keep
        # the set of agencies seen rather than only the most recent one.
        $service_to_agency{$record{service_id}}{$agency_id} = 1;
        return $agency_id;
    }

    if ($filename eq 'stop_times.txt') {
        my $trip_agency = $trip_to_agency{$record{trip_id}};
        # An unknown trip must not add an undefined agency to the stop's
        # list, nor send the caller off to write into a nameless directory.
        unless (defined $trip_agency) {
            warn "can't process record from $filename";
            return;
        }
        my $stop_id     = $record{stop_id};
        my $stop_agency = $stop_to_agency{$stop_id} //= [];
        push @$stop_agency, $trip_agency
            unless grep { $_ eq $trip_agency } @$stop_agency;
        return $trip_agency;
    }

    # Any other file: try trip_id, then service_id, then stop_id.
    my $trip_id = $record{trip_id};
    if (defined $trip_id && defined $trip_to_agency{$trip_id}) {
        return $trip_to_agency{$trip_id};
    }

    my $service_id = $record{service_id};
    if (defined $service_id && $service_to_agency{$service_id}) {
        # Sorted so the output order is deterministic.
        my @service_agencies = sort keys %{ $service_to_agency{$service_id} };
        return @service_agencies;
    }

    my $stop_id = $record{stop_id};
    if (defined $stop_id && defined $stop_to_agency{$stop_id}) {
        return @{ $stop_to_agency{$stop_id} };
    }

    #warn "can't process record from $filename with details ".Dumper(\%record);
    warn "can't process record from $filename";
    return;
}
# Build the directory path that holds one agency's output files.
#
# The two arguments are joined with '/' in the order given. Callers in this
# file pass the output root first and the agency id second, which yields
# "<output root>/<agency id>".
sub agency_dir {
    my ($first_part, $second_part) = @_;
    return "$first_part/$second_part";
}
# Open an output file for appending, creating it (and its directory) first
# if necessary.
#
# Arguments:
#   $dir         - directory the file lives in; created if missing
#   $filename    - name of the file inside $dir
#   $header_line - header row, written only when the file is new or empty
#
# Returns a write filehandle positioned at the end of the file. The caller
# is responsible for closing it.
#
# Dies if the file cannot be opened or the header cannot be written.
sub output_fh {
    my ($dir, $filename, $header_line) = @_;

    mkpath($dir); #make dir if we don't have it already
    my $output_path = join('/', ($dir, $filename));

    # -s is false for both a missing and a zero-length file, so either case
    # gets a header. Checking size rather than readability means an existing
    # but unreadable file is appended to instead of being truncated.
    my $needs_header = !-s $output_path;

    # Append mode creates the file when it does not exist yet.
    open my $fh, '>>', $output_path or die "can't open $output_path: $!";

    if ($needs_header) {
        print {$fh} $header_line
            or die "can't write header to $output_path: $!";
    }
    return $fh;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment