Keeps a local minute-replicate mirror of fosm.org osc files
#!/usr/bin/perl -w

# Inspect a local minute-replicate mirror and return the URL of the next diff file

# To the extent possible under law, the person who associated CC0
# with this work has waived all copyright and related or neighboring
# rights to this work.
# http://creativecommons.org/publicdomain/zero/1.0/

use strict;
use warnings;

use Log::Log4perl qw(:easy);
Log::Log4perl->easy_init($DEBUG); # DEBUG, INFO, WARN, ERROR, FATAL
# the mirror is laid out as minute-replicate/A/B/C.osc.gz
my $dir = "minute-replicate";

# open the minute-replicate directory
opendir(my $dh, $dir) || die("You need to have the $dir directory in your current working directory.\n");

# find all the numeric sub-directories and take the one with the largest number
my @files = grep { /^\d+$/ } readdir($dh);
closedir $dh;
my @files_sorted = sort @files;
my $largest_A = pop @files_sorted;
INFO "Largest A found is: $largest_A\n";

# within A, find the numeric sub-directory with the largest number
opendir(my $Adh, "$dir/$largest_A") || die("Cannot open $dir/$largest_A\n");
my @files_in_A = grep { /^\d+$/ } readdir($Adh);
closedir $Adh;
my @files_in_A_sorted = sort @files_in_A;
my $largest_B = pop @files_in_A_sorted;
INFO "Largest B found is: $largest_B\n";

# within B, find the .osc.gz file with the largest number
opendir(my $Bdh, "$dir/$largest_A/$largest_B") || die("Cannot open $dir/$largest_A/$largest_B\n");
my @files_in_B = grep(/\.osc\.gz$/, readdir($Bdh));
closedir $Bdh;
my @files_in_B_sorted = sort @files_in_B;
my $largest_C = pop @files_in_B_sorted;
$largest_C =~ /^(\d+)\.osc\.gz$/;
$largest_C = $1;
INFO "Largest C found is: $largest_C\n";
# next determine the next expected file
my $next_A = $largest_A;
my $next_B;
my $next_C;
if ($largest_C == 999) {
    $next_C = 0;
    $next_B = $largest_B + 1;
} else {
    $next_C = $largest_C + 1;
    $next_B = $largest_B;
}
if ($next_B == 1000) {
    $next_A++;
    $next_B = 0;
}

print "$dir/".sprintf("%03d", $next_A)."/".sprintf("%03d", $next_B)."/".sprintf("%03d", $next_C).".osc.gz\n";
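Run planet-replicate-find-next.pl (the script above) from the directory that contains minute-replicate/; it prints the relative path of the next expected diff, which replicate-fosm-osm2pgsql.sh below feeds straight to curl. A minimal stand-alone sketch of that hand-off (the loop itself is illustrative and not part of these scripts):

# fetch diffs one at a time until curl --fail reports a missing file (exit code 22)
while NEXT=`./planet-replicate-find-next.pl`; do
    curl --fail --create-dirs -o "$NEXT" "http://fosm.org/planet/$NEXT" || break
done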
#!/bin/sh

# Author: Andrew Harvey <[email protected]>
# License: CC0 http://creativecommons.org/publicdomain/zero/1.0/
#
# To the extent possible under law, the person who associated CC0
# with this work has waived all copyright and related or neighboring
# rights to this work.

# This script replicates changesets pushed to fosm. The recommended way to
# invoke it is via something like:
#   while sleep 2h; do replicate-fosm-changesets.sh; done

# You can start this script from a blank state, but it may be more efficient to
# kick-start your local copy by running something like (where you replace
# 1000002000 with a number close to the largest changeset ID in use):
#   curl -o "head/#1" "http://api.fosm.org/api/0.6/changeset/[1000000001-1000002000]"
#   curl -o "body/#1" "http://api.fosm.org/api/0.6/changeset/[1000000001-1000002000]/download"

# We always fetch the changeset/id (head) document, which gives the changeset
# tags. We can additionally grab the changeset contents, i.e. the
# changeset/id/download (body) document. This is controlled by the
# DOWNLOAD_BODY variable. If you already have the minutely diffs you probably
# don't NEED the body, as that information is in your osc diff files.
DOWNLOAD_BODY=true
#DOWNLOAD_BODY=

# where shall we save the data we download?
SAVETO="/data/fosm/api/changeset"

# make the directories which we will save the data to
mkdir -p "${SAVETO}/head"
mkdir -p "${SAVETO}/body"

# find the last changeset id we have downloaded
LAST=`ls -1 "$SAVETO/head/" | sed 's/\.gz$//' | sort -n | tail -n 1`

# if we haven't actually got anything yet, start from the lowest fosm
# changeset id minus 1 (because we increment it later)
if [ -z "$LAST" ] ; then
    LAST=$(( 1000000001 - 1 ))
fi
# define a function to try to download the next changeset
tryNext() {
    NEXT=$(( $LAST + 1 ))
    echo "Trying to GET changeset/$NEXT..."
    curl --fail -o "${SAVETO}/head/${NEXT}" "http://api.fosm.org/api/0.6/changeset/$NEXT"
    if [ $? -eq 22 ] ; then
        # HTTP page not retrieved
        echo "changeset/$NEXT not found. Exiting, try again later."
    else
        echo "...GOT changeset/$NEXT."
        # compress
        gzip "${SAVETO}/head/${NEXT}"

        if [ -n "$DOWNLOAD_BODY" ] ; then
            echo "Trying to GET changeset/$NEXT/download..."
            curl --fail -o "${SAVETO}/body/${NEXT}" "http://api.fosm.org/api/0.6/changeset/$NEXT/download"
            if [ $? -eq 22 ] ; then
                echo "We got changeset/$NEXT, but failed to get changeset/$NEXT/download."
                echo "Removing the head and exiting so we can try again later."
                # the head has already been gzipped at this point
                rm -f "${SAVETO}/head/${NEXT}" "${SAVETO}/head/${NEXT}.gz"
                exit 1
            fi
            echo "...GOT changeset/$NEXT/download."
            # compress
            gzip "${SAVETO}/body/${NEXT}"
        fi

        LAST=$NEXT
        echo ""
        tryNext
    fi
}

tryNext
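As a rough illustration of how this changeset mirror might be driven and checked (run the two commands in separate shells; the path assumes the SAVETO default above):

# poll for new changesets every two hours, as suggested in the header comment
while sleep 2h; do ./replicate-fosm-changesets.sh; done

# report the highest changeset id mirrored so far (the same expression the script uses for LAST)
ls -1 /data/fosm/api/changeset/head/ | sed 's/\.gz$//' | sort -n | tail -n 1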
#!/bin/sh

# To the extent possible under law, the person who associated CC0
# with this work has waived all copyright and related or neighboring
# rights to this work.
# http://creativecommons.org/publicdomain/zero/1.0/

# This script brings your local fosm minute-replicate mirror up to date with the
# fosm server. It keeps pulling in changes until you reach the same point as the
# fosm server, then patches your osm2pgsql fosm database with the latest changes
# from the local mirror.

# You can either invoke this script via cron and use something like run-one to
# avoid two instances running concurrently, or you can just run something like
# (possibly with a keep-one-running wrapper):
#   while sleep 120; do replicate-fosm-osm2pgsql.sh; done

# If you want to manually get an initial chunk of files (e.g. to catch up to now)
# you may want to just use:
#   curl --create-dirs -o minute-replicate/100/#1/#2.osc.gz http://fosm.org/planet/minute-replicate/100/[000-456]/[000-999].osc.gz
# After you get this initial chunk, you can load it into PostgreSQL in bulk via:
#   osm2pgsql --append --bbox [...] --slim minute-replicate/*/*/*.osc.gz
SCRIPT_DIR=`dirname "$0"`

# add your osm2pgsql arguments here (see the osm2pgsql man page for help)
OSM2PGSQL_ARGS="--append --slim"
##########################
## define our functions
##########################

# Depending on your tileserver setup (if any) you may wish to expire or dirty
# cached tiles; my method is used here, but it is off by default.
# To use my method you need to add the following arguments to OSM2PGSQL_ARGS:
#   --expire-tiles 10-19 --expire-output expired-tiles-list
# You also need to put expire-tilecache-disk.pl from
# https://gist.github.com/1170520 in the same directory as this script.
expire_tiles() {
    # change to =true to turn on the expire tiles function
    EXPIRE_TILES=
    if [ -n "$EXPIRE_TILES" ] ; then
        echo "expiring tiles"
        # the following script should work for mod_tile/renderd/tirex on-disk
        # caches too, but I haven't tested it
        "$SCRIPT_DIR"/expire-tilecache-disk.pl expired-tiles-list /var/cache/tilecache/YOUR_LAYER/
        EXPIRE_TILES_EXIT_CODE=$?
        if [ $EXPIRE_TILES_EXIT_CODE -ne 0 ] ; then
            echo "failed to expire/dirty tiles ($EXPIRE_TILES_EXIT_CODE)"
            exit 1
        fi
    fi
}
# flush out the diff files which were postponed from osm2pgsql
flush_postponed() {
    if [ -e fosm-diff-postponed ] ; then
        echo "Flushing our backlog of postponed osc.gz files"

        # check we can pass this many arguments to the program on this system
        NUM_DIFF_FILES=`wc -l fosm-diff-postponed | cut -d' ' -f1`
        ARG_MAX=`getconf ARG_MAX`
        if [ $(($NUM_DIFF_FILES + 20)) -gt $ARG_MAX ] ; then # the 20 is a safety net for $OSM2PGSQL_ARGS
            echo "can't flush the backlog: too many postponed diff files to fit in one call to osm2pgsql"
            exit 1
        fi

        cat fosm-diff-postponed | xargs osm2pgsql $OSM2PGSQL_ARGS
        POSTPONED_OSM2PGSQL_EXIT_CODE=$?
        if [ $POSTPONED_OSM2PGSQL_EXIT_CODE -ne 0 ] ; then
            echo "osm2pgsql failed while flushing the backlog, leaving fosm-diff-postponed"
            exit 1
        else
            rm -f fosm-diff-postponed
            expire_tiles
        fi
    fi
}
try_next() {
    # find the URL of the next osc file
    NEXT_URL=`"$SCRIPT_DIR"/planet-replicate-find-next.pl`
    FIND_NEXT_EXIT_CODE=$?
    if [ $FIND_NEXT_EXIT_CODE -ne 0 ] ; then
        echo "planet-replicate-find-next.pl failed ($FIND_NEXT_EXIT_CODE), so we are stopping too"
        exit $FIND_NEXT_EXIT_CODE
    fi

    curl --fail --create-dirs -o "$NEXT_URL" "http://fosm.org/planet/$NEXT_URL"
    CURL_EXIT_CODE=$?
    if [ $CURL_EXIT_CODE -eq 22 ] ; then
        # curl didn't retrieve the file, most likely there are no more osc files yet
        echo "curl $NEXT_URL reached end of osc files ($CURL_EXIT_CODE)"
    elif [ $CURL_EXIT_CODE -ne 0 ] ; then
        # curl failed to get the file, something went wrong
        echo "curl $NEXT_URL failed ($CURL_EXIT_CODE)"
    else
        echo "GOT $NEXT_URL"
        if [ -n "$POSTPONE" ] ; then
            echo "$NEXT_URL" >> fosm-diff-postponed
            try_next
        else
            osm2pgsql $OSM2PGSQL_ARGS "$NEXT_URL"
            OSM2PGSQL_EXIT_CODE=$?
            if [ $OSM2PGSQL_EXIT_CODE -ne 0 ] ; then
                echo "osm2pgsql failed for $NEXT_URL ($OSM2PGSQL_EXIT_CODE)"
                if [ $OSM2PGSQL_EXIT_CODE -eq 137 ] ; then
                    echo "  osm2pgsql received the KILL signal, probably not enough memory"
                fi
                exit $OSM2PGSQL_EXIT_CODE
            else
                try_next
            fi
        fi
    fi
}
##########################
## main
##########################

# run the flush before we start in case we didn't finish cleanly last time
flush_postponed

if [ "$1" = "--postpone" ] ; then
    # withhold loading downloaded files into osm2pgsql for now;
    # instead, add each file we download to a backlog list
    POSTPONE=true
else
    # load each osc file into postgres via osm2pgsql individually as soon as it
    # has been downloaded
    POSTPONE=
fi

# try to download the next osc file and either load it into postgres or add it
# to the postponed list
try_next

# clear out the postponed list by loading all its osc files into postgres
flush_postponed
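A sketch of how the pieces above might be combined; the expire-tiles arguments are taken from the comment above expire_tiles(), and the rest comes from this script's own header comments:

# OSM2PGSQL_ARGS as it might look with the expire_tiles hook enabled
# (also set EXPIRE_TILES=true inside expire_tiles(), and add your own --bbox if you imported with one)
OSM2PGSQL_ARGS="--append --slim --expire-tiles 10-19 --expire-output expired-tiles-list"

# initial catch-up: download all outstanding diffs first, then bulk-load them
# with a single osm2pgsql invocation via the postponed list
./replicate-fosm-osm2pgsql.sh --postpone

# steady state: keep pulling and applying new diffs every two minutes
while sleep 120; do ./replicate-fosm-osm2pgsql.sh; done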