Skip to content

Instantly share code, notes, and snippets.

@billdueber
Created February 4, 2011 16:02
Show Gist options
  • Save billdueber/811267 to your computer and use it in GitHub Desktop.
Save billdueber/811267 to your computer and use it in GitHub Desktop.
Create a set of simple sitemap files for Google to crawl
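# Step 2: once the ID-dump script has written its 50,000-line sitemap_N.txt files,
# generate a simple XML sitemap index that points to them.
# Gzip the sitemap files first -- the index links to sitemap_N.txt.gz.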
#!/usr/local/bin/perl
# Print an XML sitemap index to STDOUT containing one <sitemap> entry per
# gzipped sitemap file: sitemap_0.txt.gz .. sitemap_<NUMFILES-1>.txt.gz.
#
# Usage: perl <this script> NUMFILES > sitemap_index.xml
#   NUMFILES - number of sitemap files generated before (non-negative integer)
#
# Dies with a usage message when NUMFILES is missing or not a whole number.
use strict;
use warnings;

my $numfiles = $ARGV[0];    # number of files generated before
die "Usage: $0 NUMFILES\n"
    unless defined $numfiles && $numfiles =~ /\A\d+\z/;

my $urlToSitemapDir = 'http://www.my.machine.edu/dir/for/sitemaps';

print qq{<?xml version="1.0" encoding="UTF-8"?>\n},
      qq{<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n};

# The sitemap files are numbered from 0, so the last one is NUMFILES - 1;
# iterating up to NUMFILES itself would list a file that does not exist.
foreach my $i (0 .. $numfiles - 1) {
    print qq{<sitemap>\n},
          qq{<loc>$urlToSitemapDir/sitemap_$i.txt.gz</loc>\n},
          qq{</sitemap>\n};
}

print qq{</sitemapindex>\n};
# Step 1: a script that fetches every record ID from Solr and writes the record URLs into files of 50,000 lines each
#!/usr/local/bin/perl
# Query Solr for the IDs of all matching records and write the corresponding
# record URLs, one per line, into sitemap_0.txt, sitemap_1.txt, ... in the
# current directory, with at most $num lines per file.
#
# Dies if a Solr request fails, if the response lacks a hit count, or if a
# sitemap file cannot be opened or closed.
use strict;
use warnings;
use LWP::Simple;
use JSON::XS;
use Time::HiRes qw(usleep);

my $q         = '*:*';
my $baseurl   = 'http://my.solr.server/solr';      # base path to solr
my $recordURL = 'http://my.server.edu/Record/';    # append an ID to this to get the record
my $fl        = 'id';
my $num       = 50000;                             # IDs per request == lines per file
my $fq        = 'fq=availability:HathiTrust';

# Fetch $rows documents starting at offset $start and return the decoded
# JSON response (a hash reference). Dies when the HTTP request fails, because
# LWP::Simple::get() returns undef in that case and decode_json() would
# otherwise fail with an unhelpful message.
#
# NOTE(review): the original count query went to "$baseurl/biblio/select"
# while the paging query went to "$baseurl/select". Both now use
# "$baseurl/select"; if the index lives in a named core, include the core
# in $baseurl -- confirm against your Solr install.
sub fetch_solr_page {
    my ($start, $rows) = @_;

    my $url = "$baseurl/select?q=$q&rows=$rows&fl=$fl"
            . "&wt=json&json.nl=arrarr&start=$start&$fq";
    my $body = get($url);
    die "Solr request failed: $url\n" unless defined $body;

    return decode_json($body);
}

# Ask for zero rows: only the total hit count is needed here.
my $initial_result = fetch_solr_page(0, 0);
my $total          = $initial_result->{response}{numFound};
die "Solr response did not include numFound\n" unless defined $total;

my $page = 0;    # doubles as the sitemap file number
while ($num * $page < $total) {
    my $file = "sitemap_$page.txt";
    open(my $out, '>', $file) or die "Can't open sitemap $page: $!";

    my $result = fetch_solr_page($num * $page, $num);
    foreach my $doc (@{ $result->{response}{docs} }) {
        my $id = $doc->{id};
        # $recordURL already ends in '/', so the ID is appended directly.
        print {$out} "$recordURL$id\n";
    }

    # Buffered write errors only surface at close, so check it.
    close($out) or die "Can't close $file: $!";

    $page++;
    usleep(2500);    # pause 2500 microseconds (2.5 ms) between requests
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment