Keeps a local minute-replicate mirror of fosm.org osc files
#!/usr/bin/perl -w

# Inspect a local minute-replicate mirror and return the URL of the next diff file

# To the extent possible under law, the person who associated CC0
# with this work has waived all copyright and related or neighboring
# rights to this work.
# http://creativecommons.org/publicdomain/zero/1.0/

use strict;
use warnings;

use Log::Log4perl qw(:easy);
Log::Log4perl->easy_init($DEBUG); # DEBUG, INFO, WARN, ERROR, FATAL
# the mirror is laid out as minute-replicate/A/B/C.osc.gz
my $dir = "minute-replicate";

# open the minute-replicate directory
opendir(my $dh, $dir) || die("You need to have the $dir directory in your current working directory.\n");

# find all the numeric sub-directories and take the one with the largest number
my @files = grep { /^\d+$/ } readdir($dh);
closedir $dh;
my @files_sorted = sort @files;
my $largest_A = pop @files_sorted;
INFO "Largest A found is: $largest_A\n";

# within A, find the numeric sub-directory with the largest number
opendir(my $Adh, "$dir/$largest_A") || die("Cannot open $dir/$largest_A\n");
my @files_in_A = grep { /^\d+$/ } readdir($Adh);
closedir $Adh;
my @files_in_A_sorted = sort @files_in_A;
my $largest_B = pop @files_in_A_sorted;
INFO "Largest B found is: $largest_B\n";

# within B, find the .osc.gz file with the largest number
opendir(my $Bdh, "$dir/$largest_A/$largest_B") || die("Cannot open $dir/$largest_A/$largest_B\n");
my @files_in_B = grep(/\.osc\.gz$/, readdir($Bdh));
closedir $Bdh;
my @files_in_B_sorted = sort @files_in_B;
my $largest_C = pop @files_in_B_sorted;
$largest_C =~ /^(\d+)\.osc\.gz$/;
$largest_C = $1;
INFO "Largest C found is: $largest_C\n";
# next determine the next expected file
my $next_A = $largest_A;
my $next_B;
my $next_C;
if ($largest_C == 999) {
    $next_C = 0;
    $next_B = $largest_B + 1;
} else {
    $next_C = $largest_C + 1;
    $next_B = $largest_B;
}
if ($next_B == 1000) {
    $next_A++;
    $next_B = 0;
}

print "$dir/".sprintf("%03d", $next_A)."/".sprintf("%03d", $next_B)."/".sprintf("%03d", $next_C).".osc.gz\n";
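Run planet-replicate-find-next.pl (the script above) from the directory that contains minute-replicate/; it prints the relative path of the next expected diff, which replicate-fosm-osm2pgsql.sh below feeds straight to curl. A minimal stand-alone sketch of that hand-off (the loop itself is illustrative and not part of these scripts):

# fetch diffs one at a time until curl --fail reports a missing file (exit code 22)
while NEXT=`./planet-replicate-find-next.pl`; do
    curl --fail --create-dirs -o "$NEXT" "http://fosm.org/planet/$NEXT" || break
done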
#!/bin/sh

# Author: Andrew Harvey <[email protected]>
# License: CC0 http://creativecommons.org/publicdomain/zero/1.0/
#
# To the extent possible under law, the person who associated CC0
# with this work has waived all copyright and related or neighboring
# rights to this work.

# This script replicates changesets pushed to fosm. The recommended way to
# invoke it is via something like:
#   while sleep 2h; do replicate-fosm-changesets.sh; done

# You can start this script from a blank state, but it may be more efficient to
# kick-start your local copy by running something like (where you replace
# 1000002000 with a number close to the largest changeset ID in use):
#   curl -o "head/#1" "http://api.fosm.org/api/0.6/changeset/[1000000001-1000002000]"
#   curl -o "body/#1" "http://api.fosm.org/api/0.6/changeset/[1000000001-1000002000]/download"

# We always fetch the changeset/id (head) document, which gives the changeset
# tags. We can additionally grab the changeset contents, i.e. the
# changeset/id/download (body) document. This is controlled by the
# DOWNLOAD_BODY variable. If you already have the minutely diffs you probably
# don't NEED the body, as that information is in your osc diff files.
DOWNLOAD_BODY=true
#DOWNLOAD_BODY=

# where shall we save the data we download?
SAVETO="/data/fosm/api/changeset"

# make the directories which we will save the data to
mkdir -p "${SAVETO}/head"
mkdir -p "${SAVETO}/body"

# find the last changeset id we have downloaded
LAST=`ls -1 "$SAVETO/head/" | sed 's/\.gz$//' | sort -n | tail -n 1`

# if we haven't actually got anything yet, start from the lowest fosm
# changeset id minus 1 (because we increment it later)
if [ -z "$LAST" ] ; then
    LAST=$(( 1000000001 - 1 ))
fi
# define a function to try to download the next changeset
tryNext() {
    NEXT=$(( $LAST + 1 ))
    echo "Trying to GET changeset/$NEXT..."
    curl --fail -o "${SAVETO}/head/${NEXT}" "http://api.fosm.org/api/0.6/changeset/$NEXT"
    if [ $? -eq 22 ] ; then
        # HTTP page not retrieved
        echo "changeset/$NEXT not found. Exiting, try again later."
    else
        echo "...GOT changeset/$NEXT."
        # compress
        gzip "${SAVETO}/head/${NEXT}"

        if [ -n "$DOWNLOAD_BODY" ] ; then
            echo "Trying to GET changeset/$NEXT/download..."
            curl --fail -o "${SAVETO}/body/${NEXT}" "http://api.fosm.org/api/0.6/changeset/$NEXT/download"
            if [ $? -eq 22 ] ; then
                echo "We got changeset/$NEXT, but failed to get changeset/$NEXT/download."
                echo "Removing the head and exiting so we can try again later."
                # the head has already been gzipped at this point
                rm -f "${SAVETO}/head/${NEXT}" "${SAVETO}/head/${NEXT}.gz"
                exit 1
            fi
            echo "...GOT changeset/$NEXT/download."
            # compress
            gzip "${SAVETO}/body/${NEXT}"
        fi

        LAST=$NEXT
        echo ""
        tryNext
    fi
}

tryNext
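As a rough illustration of how this changeset mirror might be driven and checked (run the two commands in separate shells; the path assumes the SAVETO default above):

# poll for new changesets every two hours, as suggested in the header comment
while sleep 2h; do ./replicate-fosm-changesets.sh; done

# report the highest changeset id mirrored so far (the same expression the script uses for LAST)
ls -1 /data/fosm/api/changeset/head/ | sed 's/\.gz$//' | sort -n | tail -n 1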
#!/bin/sh

# To the extent possible under law, the person who associated CC0
# with this work has waived all copyright and related or neighboring
# rights to this work.
# http://creativecommons.org/publicdomain/zero/1.0/

# This script brings your local fosm minute-replicate mirror up to date with the
# fosm server. It keeps pulling in changes until you reach the same point as the
# fosm server, then patches your osm2pgsql fosm database with the latest changes
# from the local mirror.

# You can either invoke this script via cron and use something like run-one to
# avoid two instances running concurrently, or you can just run something like
# (possibly with a keep-one-running wrapper):
#   while sleep 120; do replicate-fosm-osm2pgsql.sh; done

# If you want to manually get an initial chunk of files (e.g. to catch up to now)
# you may want to just use:
#   curl --create-dirs -o minute-replicate/100/#1/#2.osc.gz http://fosm.org/planet/minute-replicate/100/[000-456]/[000-999].osc.gz
# After you get this initial chunk, you can load it into PostgreSQL in bulk via:
#   osm2pgsql --append --bbox [...] --slim minute-replicate/*/*/*.osc.gz
SCRIPT_DIR=`dirname "$0"`

# add your osm2pgsql arguments here (see the osm2pgsql man page for help)
OSM2PGSQL_ARGS="--append --slim"
##########################
## define our functions
##########################

# Depending on your tileserver setup (if any) you may wish to expire or dirty
# cached tiles; my method is used here, but it is off by default.
# To use my method you need to add the following arguments to OSM2PGSQL_ARGS:
#   --expire-tiles 10-19 --expire-output expired-tiles-list
# You also need to put expire-tilecache-disk.pl from
# https://gist.github.com/1170520 in the same directory as this script.
expire_tiles() {
    # change to =true to turn on the expire tiles function
    EXPIRE_TILES=
    if [ -n "$EXPIRE_TILES" ] ; then
        echo "expiring tiles"
        # the following script should work for mod_tile/renderd/tirex on-disk
        # caches too, but I haven't tested it
        "$SCRIPT_DIR"/expire-tilecache-disk.pl expired-tiles-list /var/cache/tilecache/YOUR_LAYER/
        EXPIRE_TILES_EXIT_CODE=$?
        if [ $EXPIRE_TILES_EXIT_CODE -ne 0 ] ; then
            echo "failed to expire/dirty tiles ($EXPIRE_TILES_EXIT_CODE)"
            exit 1
        fi
    fi
}
# flush out the diff files which were postponed from osm2pgsql
flush_postponed() {
    if [ -e fosm-diff-postponed ] ; then
        echo "Flushing our backlog of postponed osc.gz files"

        # check we can pass this many arguments to the program on this system
        NUM_DIFF_FILES=`wc -l fosm-diff-postponed | cut -d' ' -f1`
        ARG_MAX=`getconf ARG_MAX`
        if [ $(($NUM_DIFF_FILES + 20)) -gt $ARG_MAX ] ; then # the 20 is a safety net for $OSM2PGSQL_ARGS
            echo "can't flush the backlog: too many postponed diff files to fit in one call to osm2pgsql"
            exit 1
        fi

        cat fosm-diff-postponed | xargs osm2pgsql $OSM2PGSQL_ARGS
        POSTPONED_OSM2PGSQL_EXIT_CODE=$?
        if [ $POSTPONED_OSM2PGSQL_EXIT_CODE -ne 0 ] ; then
            echo "osm2pgsql failed while flushing the backlog, leaving fosm-diff-postponed"
            exit 1
        else
            rm -f fosm-diff-postponed
            expire_tiles
        fi
    fi
}
try_next() {
    # find the URL of the next osc file
    NEXT_URL=`"$SCRIPT_DIR"/planet-replicate-find-next.pl`
    FIND_NEXT_EXIT_CODE=$?
    if [ $FIND_NEXT_EXIT_CODE -ne 0 ] ; then
        echo "planet-replicate-find-next.pl failed ($FIND_NEXT_EXIT_CODE), so we are stopping too"
        exit $FIND_NEXT_EXIT_CODE
    fi

    curl --fail --create-dirs -o "$NEXT_URL" "http://fosm.org/planet/$NEXT_URL"
    CURL_EXIT_CODE=$?
    if [ $CURL_EXIT_CODE -eq 22 ] ; then
        # curl didn't retrieve the file, most likely there are no more osc files yet
        echo "curl $NEXT_URL reached end of osc files ($CURL_EXIT_CODE)"
    elif [ $CURL_EXIT_CODE -ne 0 ] ; then
        # curl failed to get the file, something went wrong
        echo "curl $NEXT_URL failed ($CURL_EXIT_CODE)"
    else
        echo "GOT $NEXT_URL"
        if [ -n "$POSTPONE" ] ; then
            echo "$NEXT_URL" >> fosm-diff-postponed
            try_next
        else
            osm2pgsql $OSM2PGSQL_ARGS "$NEXT_URL"
            OSM2PGSQL_EXIT_CODE=$?
            if [ $OSM2PGSQL_EXIT_CODE -ne 0 ] ; then
                echo "osm2pgsql failed for $NEXT_URL ($OSM2PGSQL_EXIT_CODE)"
                if [ $OSM2PGSQL_EXIT_CODE -eq 137 ] ; then
                    echo "  osm2pgsql received the KILL signal, probably not enough memory"
                fi
                exit $OSM2PGSQL_EXIT_CODE
            else
                try_next
            fi
        fi
    fi
}
##########################
## main
##########################

# run the flush before we start in case we didn't finish cleanly last time
flush_postponed

if [ "$1" = "--postpone" ] ; then
    # withhold loading downloaded files into osm2pgsql for now;
    # instead, add each file we download to a backlog list
    POSTPONE=true
else
    # load each osc file into postgres via osm2pgsql individually as soon as it
    # has been downloaded
    POSTPONE=
fi

# try to download the next osc file and either load it into postgres or add it
# to the postponed list
try_next

# clear out the postponed list by loading all its osc files into postgres
flush_postponed
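A sketch of how the pieces above might be combined; the expire-tiles arguments are taken from the comment above expire_tiles(), and the rest comes from this script's own header comments:

# OSM2PGSQL_ARGS as it might look with the expire_tiles hook enabled
# (also set EXPIRE_TILES=true inside expire_tiles(), and add your own --bbox if you imported with one)
OSM2PGSQL_ARGS="--append --slim --expire-tiles 10-19 --expire-output expired-tiles-list"

# initial catch-up: download all outstanding diffs first, then bulk-load them
# with a single osm2pgsql invocation via the postponed list
./replicate-fosm-osm2pgsql.sh --postpone

# steady state: keep pulling and applying new diffs every two minutes
while sleep 120; do ./replicate-fosm-osm2pgsql.sh; done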