billdueber · February 4, 2011 16:02
diff --git a/sitemapindex.pl b/sitemapindex.pl
 # Then just create a simple XML file pointing to the 50k line files
 # Don't forget to gzip the files first

 #!/usr/local/bin/perl
 #
 # Print a sitemap index (sitemaps.org 0.9 schema) to STDOUT that points at
 # the gzipped sitemap files sitemap_0.txt.gz .. sitemap_<N-1>.txt.gz.
 #
 # Usage: sitemapindex.pl N > sitemapindex.xml
 #   N - number of sitemap files generated before (a positive integer)

 use strict;
 use warnings;

 my $numfiles = $ARGV[0]; # number of files generated before
 die "Usage: $0 <number-of-sitemap-files>\n"
     unless defined $numfiles && $numfiles =~ /\A[1-9][0-9]*\z/;

 my $urlToSitemapDir = 'http://www.my.machine.edu/dir/for/sitemaps';

 # The XML declaration has to be the very first bytes of the output, so each
 # line is printed explicitly instead of relying on a multi-line literal.
 print qq{<?xml version="1.0" encoding="UTF-8"?>\n};
 print qq{<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n};

 # The sitemap files are numbered from 0, so the last one is N-1.
 foreach my $i (0 .. $numfiles - 1) {
     print qq{  <sitemap>\n};
     print qq{    <loc>$urlToSitemapDir/sitemap_$i.txt.gz</loc>\n};
     print qq{  </sitemap>\n};
 }

 print qq{</sitemapindex>\n};
diff --git a/solrsitemap.pl b/solrsitemap.pl
 # First, a script to get all the ids and put them into 50K line files

 #!/usr/local/bin/perl
 #
 # Ask Solr for the id of every matching record and write one record URL per
 # line into plain-text sitemap files (sitemap_0.txt, sitemap_1.txt, ...) in
 # the current directory, at most $num lines per file.

 use strict;
 use warnings;

 use LWP::Simple;
 use JSON::XS;
 use Time::HiRes qw(usleep);

 my $q = '*:*';

 my $baseurl = 'http://my.solr.server/solr';     # base path to solr
 my $recordURL = 'http://my.server.edu/Record/'; # append an ID to this to get the record

 # One select handler for both the count and the paging queries, so the total
 # and the pages always come from the same place.
 # NOTE(review): the original used "$baseurl/biblio/select" for the count and
 # "$baseurl/select" for paging -- confirm which path is right for your Solr.
 my $selecturl = "$baseurl/select";

 my $fl = 'id';
 my $num = 50000;   # lines per sitemap file
 my $page = 0;
 my $fq = 'fq=availability:HathiTrust';

 # Fetch a URL and decode its JSON body. LWP::Simple::get returns undef on
 # any failure, so check before handing the body to decode_json.
 sub fetch_json {
     my ($url) = @_;
     my $body = get($url);
     die "Request failed: $url\n" unless defined $body;
     return decode_json($body);
 }

 # rows=0: only numFound is needed here, not the documents themselves.
 my $initialresult = fetch_json("$selecturl?q=$q&rows=0&fl=$fl&wt=json&json.nl=arrarr&$fq");
 my $total = $initialresult->{response}{numFound};
 die "Solr response did not contain numFound\n" unless defined $total;

 my $sm = 0;
 while ($num * $page < $total) {
     my $file = "sitemap_$sm.txt";
     open(my $out, '>', $file) or die "Can't open sitemap $sm: $!";
     my $start = $num * $page;
     $page++; # for next round

     my $result = fetch_json("$selecturl?q=$q&rows=$num&fl=$fl&wt=json&json.nl=arrarr&start=$start&$fq");

     foreach my $doc (@{$result->{response}{docs}}) {
         # $recordURL already ends in '/', so no extra separator here.
         print {$out} "$recordURL$doc->{id}\n";
     }
     # Buffered write errors only surface at close time.
     close($out) or die "Can't close $file: $!";
     $sm++;
     usleep(2500); # brief pause (2500 microseconds) between requests
 }
	# Then just create a simple XML file pointing to the 50k line files
	# Don't forget to gzip the files first

	#!/usr/local/bin/perl
	#
	# Print a sitemap index (sitemaps.org 0.9 schema) to STDOUT that points at
	# the gzipped sitemap files sitemap_0.txt.gz .. sitemap_<N-1>.txt.gz.
	#
	# Usage: sitemapindex.pl N > sitemapindex.xml
	#   N - number of sitemap files generated before (a positive integer)

	use strict;
	use warnings;

	my $numfiles = $ARGV[0]; # number of files generated before
	die "Usage: $0 <number-of-sitemap-files>\n"
	    unless defined $numfiles && $numfiles =~ /\A[1-9][0-9]*\z/;

	my $urlToSitemapDir = 'http://www.my.machine.edu/dir/for/sitemaps';

	# The XML declaration has to be the very first bytes of the output, so each
	# line is printed explicitly instead of relying on a multi-line literal.
	print qq{<?xml version="1.0" encoding="UTF-8"?>\n};
	print qq{<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n};

	# The sitemap files are numbered from 0, so the last one is N-1.
	foreach my $i (0 .. $numfiles - 1) {
	    print qq{  <sitemap>\n};
	    print qq{    <loc>$urlToSitemapDir/sitemap_$i.txt.gz</loc>\n};
	    print qq{  </sitemap>\n};
	}

	print qq{</sitemapindex>\n};
	# First, a script to get all the ids and put them into 50K line files

	#!/usr/local/bin/perl
	#
	# Ask Solr for the id of every matching record and write one record URL per
	# line into plain-text sitemap files (sitemap_0.txt, sitemap_1.txt, ...) in
	# the current directory, at most $num lines per file.

	use strict;
	use warnings;

	use LWP::Simple;
	use JSON::XS;
	use Time::HiRes qw(usleep);

	# Solr "match everything" query; a bare ':' is not a valid query.
	my $q = '*:*';

	my $baseurl = 'http://my.solr.server/solr';     # base path to solr
	my $recordURL = 'http://my.server.edu/Record/'; # append an ID to this to get the record

	# One select handler for both the count and the paging queries, so the total
	# and the pages always come from the same place.
	# NOTE(review): the original used "$baseurl/biblio/select" for the count and
	# "$baseurl/select" for paging -- confirm which path is right for your Solr.
	my $selecturl = "$baseurl/select";

	my $fl = 'id';
	my $num = 50000;   # lines per sitemap file
	my $page = 0;
	my $fq = 'fq=availability:HathiTrust';

	# Fetch a URL and decode its JSON body. LWP::Simple::get returns undef on
	# any failure, so check before handing the body to decode_json.
	sub fetch_json {
	    my ($url) = @_;
	    my $body = get($url);
	    die "Request failed: $url\n" unless defined $body;
	    return decode_json($body);
	}

	# rows=0: only numFound is needed here, not the documents themselves.
	my $initialresult = fetch_json("$selecturl?q=$q&rows=0&fl=$fl&wt=json&json.nl=arrarr&$fq");
	my $total = $initialresult->{response}{numFound};
	die "Solr response did not contain numFound\n" unless defined $total;

	my $sm = 0;
	while ($num * $page < $total) {
	    my $file = "sitemap_$sm.txt";
	    open(my $out, '>', $file) or die "Can't open sitemap $sm: $!";
	    my $start = $num * $page;
	    $page++; # for next round

	    my $result = fetch_json("$selecturl?q=$q&rows=$num&fl=$fl&wt=json&json.nl=arrarr&start=$start&$fq");

	    foreach my $doc (@{$result->{response}{docs}}) {
	        # $recordURL already ends in '/', so no extra separator here.
	        print {$out} "$recordURL$doc->{id}\n";
	    }
	    # Buffered write errors only surface at close time.
	    close($out) or die "Can't close $file: $!";
	    $sm++;
	    usleep(2500); # brief pause (2500 microseconds) between requests
	}