Created
February 4, 2011 16:02
-
-
Save billdueber/811267 to your computer and use it in GitHub Desktop.
Create a set of simple sitemap files for Google to crawl.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/perl
# Then just create a simple XML file pointing to the 50k line files
# Don't forget to gzip the files first
#
# Usage: <this script> NUMFILES > sitemap_index.xml
#
# NUMFILES is the number of sitemap_<n>.txt files generated before. The
# files are expected to be numbered from 0, so NUMFILES files means
# sitemap_0 .. sitemap_(NUMFILES-1). The index is written to STDOUT.
use strict;
use warnings;

my $numfiles = $ARGV[0];    # number of files generated before
die "Usage: $0 NUMBER_OF_SITEMAP_FILES\n"
    unless defined $numfiles && $numfiles =~ /\A\d+\z/;

# Public URL of the directory holding the gzipped sitemap files (no
# trailing slash; one is added when each <loc> is built).
my $urlToSitemapDir = 'http://www.my.machine.edu/dir/for/sitemaps';

print qq{<?xml version="1.0" encoding="UTF-8"?>\n};
print qq{<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n};

# Files are numbered from zero, so the last index is $numfiles - 1.
# (When $numfiles is 0 the range is empty and no entries are written.)
foreach my $i (0 .. $numfiles - 1) {
    print qq{  <sitemap>\n};
    print qq{    <loc>$urlToSitemapDir/sitemap_$i.txt.gz</loc>\n};
    print qq{  </sitemap>\n};
}

print qq{</sitemapindex>\n};
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/perl
# First, a script to get all the ids and put them into 50K line files
#
# Queries Solr for every matching record id, $num ids per request, and
# writes one record URL per line to sitemap_0.txt, sitemap_1.txt, ... in
# the current directory.
use strict;
use warnings;
use LWP::Simple;
use JSON::XS;
use Time::HiRes qw(usleep);

my $q       = '*:*';
my $baseurl = 'http://my.solr.server/solr';    # base path to solr

# NOTE(review): the original used "$baseurl/biblio/select" for the count
# query but "$baseurl/select" for the paged queries. Both now share this
# one handler URL; confirm the core name matches your installation.
my $selectURL = "$baseurl/biblio/select";

my $recordURL = 'http://my.server.edu/Record/';    # append an ID to this to get the record
my $fl        = 'id';
my $num       = 50000;                             # ids per request == lines per sitemap file
my $fq        = 'fq=availability:HathiTrust';

# Fetch a URL and decode the JSON body. Dies with the offending URL if
# the request fails, rather than handing undef to decode_json.
sub fetch_json {
    my ($url) = @_;
    my $body = get($url);
    die "Request failed: $url\n" unless defined $body;
    return decode_json($body);
}

# Only the hit count is needed from the first query, so ask for zero rows.
my $initialresult
    = fetch_json("${selectURL}?q=$q&rows=0&fl=$fl&wt=json&json.nl=arrarr&$fq");
my $total = $initialresult->{response}{numFound};
die "Solr response did not include numFound\n" unless defined $total;

my $sm    = 0;    # index of the sitemap file being written
my $start = 0;    # offset of the first record of the current page

while ($start < $total) {
    my $file = "sitemap_$sm.txt";
    open(my $out, '>', $file) or die "Can't open sitemap $sm: $!";

    my $url
        = "${selectURL}?q=$q&rows=$num&fl=$fl&wt=json&json.nl=arrarr&start=$start&$fq";
    my $result = fetch_json($url);

    foreach my $doc (@{ $result->{response}{docs} }) {
        # $recordURL already ends in '/', so append the id directly.
        print {$out} $recordURL, $doc->{id}, "\n";
    }

    # Buffered write errors only surface at close, so check it.
    close($out) or die "Can't close $file: $!";

    $sm++;
    $start += $num;    # for next round
    usleep(2500);      # brief pause (2500 microseconds) between requests
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment