Created
May 20, 2011 23:03
-
-
Save fxn/983977 to your computer and use it in GitHub Desktop.
Computes the ancestry path of GeoPlanet places
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use strict; | |
use warnings; | |
# Input: the original Yahoo! TSV.
use constant GPP => 'geoplanet_places_7.6.0.tsv';

# Output: the same Yahoo! TSV with an extra ancestry column.
use constant ANC => 'geoplanet_places_with_ancestry_7.6.0.tsv';
# Maps each woeid to an array reference with its list of ancestors.
#
# The input has about 5.7 million records, so this script is optimized
# for speed and memory consumption. In particular, this hash is a single
# file-scoped variable shared by all the subroutines below, and it is
# modified in place.
my %ancestors;
# Initializes %ancestors with woeid -> [parent_id].
#
# Reads the GPP file line by line. Lines that do not start with a digit
# (the header, for instance) are skipped. For any other line, the first
# tab-separated field is taken as the woeid and the last one as the id
# of its parent.
#
# Takes no arguments and returns nothing. Dies with the file name and
# the system error if GPP cannot be opened or closed.
sub initialize_ancestors {
    # Three-argument open, so the file name can never be parsed as a mode.
    open(my $gpp_fh, '<', GPP)
        or die 'Cannot open ' . GPP . " for reading: $!";

    while (my $line = <$gpp_fh>) {
        next unless $line =~ /^\d/; # data records start with a woeid
        chomp $line;
        my ($woeid, $parent_id) = (split /\t/, $line)[0, -1];
        $ancestors{$woeid} = [$parent_id];
    }

    close($gpp_fh)
        or die 'Cannot close ' . GPP . ": $!";

    return;
}
# Returns the list of woeids that goes from $start upwards, with $start
# itself as first element. The walk ends at woeid 1 (the root Earth
# node), which is included in the result. A $start of 0 yields the
# empty list.
#
# %ancestors has to be initialized before this subroutine is called.
#
# Entries of %ancestors that hold more than one element are taken as
# branches already computed, and are reused as they are instead of
# being walked again.
sub branch_up_to_earth {
    my ($woeid) = @_;

    return ()       if $woeid == 0;
    return ($woeid) if $woeid == 1;

    my $known = $ancestors{$woeid};

    # More than one element means the rest of the branch is stored.
    return ($woeid, @{$known}) if @{$known} > 1;

    # Otherwise the only element is the parent, keep climbing from it.
    return ($woeid, branch_up_to_earth($known->[0]));
}
# Outputs the original TSV plus the ancestry for each record.
#
# Copies the first line of GPP to ANC with an extra "Ancestry" column
# name. Then, every line that starts with digits is copied followed by
# a tab and the ancestors of the record joined with "/", farthest
# ancestor first. Records with no ancestors get an empty last column.
# Lines that do not start with digits are not copied.
#
# As a side effect the entry of each record in %ancestors is replaced
# by its complete ancestor list, which branch_up_to_earth reuses for
# later records.
#
# Takes no arguments and returns nothing. Dies with the file name and
# the system error if a file cannot be opened or closed.
sub append_ancestry_to_tsv {
    # Three-argument open, so the file name can never be parsed as a mode.
    open(my $gpp_fh, '<', GPP)
        or die 'Cannot open ' . GPP . " for reading: $!";
    open(my $anc_fh, '>', ANC)
        or die 'Cannot open ' . ANC . " for writing: $!";

    my $header = <$gpp_fh>;
    chomp $header;
    print $anc_fh "$header\tAncestry\n";

    while (my $line = <$gpp_fh>) {
        # An explicit capture is used rather than $&, because in perls
        # older than 5.20 a single use of $& slows down every regexp
        # match in the program.
        next unless $line =~ /^(\d+)/;
        my $woeid = $1;

        chomp $line;
        print $anc_fh "$line\t";

        # The only element stored so far is the parent, expand it into
        # the whole branch and keep the result for later lookups.
        $ancestors{$woeid} = [branch_up_to_earth($ancestors{$woeid}[0])];
        print $anc_fh '""', join('/', reverse(@{$ancestors{$woeid}})), '""'
            if @{$ancestors{$woeid}};
        print $anc_fh "\n";
    }

    close($gpp_fh)
        or die 'Cannot close ' . GPP . ": $!";

    # Output is buffered, so a failed write may only show up here.
    close($anc_fh)
        or die 'Cannot close ' . ANC . ": $!";

    return;
}
# Two passes over the input: the first one loads the parent of every
# woeid, the second one writes the TSV with the ancestry column.
initialize_ancestors();
append_ancestry_to_tsv();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment