Created
May 30, 2018 16:23
-
-
Save peaeater/edcfd073067aaaadc4c51906d1e8efe9 to your computer and use it in GitHub Desktop.
Creates sitemap index with attendant sitemaps from a Solr query.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<#
.SYNOPSIS
Create sitemap index with attendant sitemaps from a Solr query.

.DESCRIPTION
Pages through the results of a Solr query and writes one <url> entry per
document. A new sitemap is created every 50,000 rows.

.PARAMETER ChangeFrequency
Value written to the <changefreq> element of every URL entry.

.PARAMETER IndexBaseUrl
Base URL under which the sitemapN.xml files are published; used for the <loc>
of each entry in the sitemap index.

.PARAMETER Logsrc
Event source name used when writing to the Application event log.

.PARAMETER OutDir
Directory in which the sitemap files and the sitemap index are written.

.PARAMETER PermalinkBaseUrl
URL prefix to which each Solr document id is appended to form its <loc>.

.PARAMETER SolrPageSize
Number of rows requested from Solr per page.

.PARAMETER SolrQuery
Solr query (the q parameter) selecting the documents to list.

.PARAMETER SolrUrl
Base URL of the Solr core; "/select" is appended to it.
#>
param (
    [string] $ChangeFrequency  = 'weekly',
    [string] $IndexBaseUrl     = 'http://andi.andornot.com/',
    [string] $Logsrc           = 'Andi Solr Update',
    [string] $OutDir           = '.\',
    [string] $PermalinkBaseUrl = 'http://andi.andornot.com/en/permalink/',
    [int]    $SolrPageSize     = 100,
    [string] $SolrQuery        = '*:*',
    [string] $SolrUrl          = 'http://localhost:8983/solr/core1'
)
# Appends one <url> entry per document in a Solr reply to a sitemap file.
#
# $file  - path of the sitemap file; it is created with the XML declaration and
#          the opening <urlset> tag when it does not exist yet.
# $reply - parsed Solr JSON response; entries are built from the id and created
#          fields of each item in response.docs.
#
# Reads the script-level $PermalinkBaseUrl and $ChangeFrequency. The closing
# </urlset> tag is not written here.
function AppendToSitemapFile([string]$file, [psobject]$reply) {
    if (-not (Test-Path $file -PathType Leaf)) {
        Set-Content -Encoding utf8 -Path $file -Value "<?xml version=`"1.0`" encoding=`"UTF-8`"?>`r`n<urlset xmlns=`"http://www.sitemaps.org/schemas/sitemap/0.9`">"
    }

    # loop-invariant: trim the base URL once rather than once per document
    $baseUrl = $PermalinkBaseUrl.TrimEnd("/")

    $entries = @(
        foreach ($doc in $reply.response.docs) {
            $lastmod = $doc.created
            # Some PowerShell versions deserialize ISO 8601 strings into DateTime;
            # its default string form is culture-specific and not a valid sitemap
            # date, so render it explicitly as UTC ISO 8601.
            if ($lastmod -is [datetime]) {
                $lastmod = $lastmod.ToUniversalTime().ToString("yyyy-MM-dd'T'HH:mm:ss'Z'", [System.Globalization.CultureInfo]::InvariantCulture)
            }
            FormatSitemapUrlEntry -loc "$baseUrl/$($doc.id)" -lastmod $lastmod -changefreq $ChangeFrequency
        }
    )

    # single write per page instead of reopening the file for every document
    if ($entries.Count -gt 0) {
        Add-Content -Encoding utf8 -Path $file -Value $entries
    }
}
# Returns the 1-based number of the sitemap file that the row at position
# $rowsFetched belongs to.
#
# $rowsFetched       - total number of rows fetched so far.
# $maxUrlsPerSitemap - rows per sitemap file (defaults to 50,000).
#
# Rows 1..50000 map to sitemap 1, 50001..100000 to sitemap 2, and so on.
# Zero or negative input yields 1 so that a first sitemap always exists.
function CalculateCurrentSitemapCount([int]$rowsFetched, [int]$maxUrlsPerSitemap = 50000) {
    if ($maxUrlsPerSitemap -lt 1) {
        throw "maxUrlsPerSitemap must be at least 1."
    }
    if ($rowsFetched -le 0) {
        return 1
    }
    # ceiling division; dividing by (max + 1) would drift one row further from
    # the boundary with every additional sitemap
    return [int][Math]::Ceiling($rowsFetched / [double]$maxUrlsPerSitemap)
}
# Downloads a Solr response from $url and returns it parsed from JSON.
#
# $url - complete Solr select URL.
#
# On failure the exception is passed to logError and nothing is returned, so
# callers see $null.
function FetchPageFromSolr([string]$url) {
    $client = New-Object System.Net.WebClient
    # Decode as UTF-8 rather than WebClient's default encoding.
    # NOTE(review): matters mainly when the response declares no charset - confirm.
    $client.Encoding = [System.Text.Encoding]::UTF8
    try {
        return $client.DownloadString($url) | ConvertFrom-Json
    }
    catch {
        logError $_.Exception
    }
    finally {
        # WebClient is IDisposable; release it on every page
        $client.Dispose()
    }
}
# Returns the path of the sitemap index file inside the script-level $OutDir.
function FormatIndexFilename() {
    $indexLeaf = "sitemap_index.xml"
    return [System.IO.Path]::Combine($OutDir, $indexLeaf)
}
# Returns the <sitemap> node for sitemap number $sitemapNumber, for use in the
# sitemap index.
#
# $sitemapNumber - 1-based sitemap number; the <loc> is
#                  <IndexBaseUrl>/sitemap<N>.xml.
#
# <lastmod> is the current time in UTC. Reads the script-level $IndexBaseUrl.
function FormatIndexSitemapEntry([int]$sitemapNumber) {
    $loc = [System.Security.SecurityElement]::Escape("$($IndexBaseUrl.TrimEnd('/'))/sitemap$sitemapNumber.xml")
    # HH = 24-hour clock; hh would write 15:00 UTC as 03:00. The invariant
    # culture keeps the ':' separator fixed regardless of the host locale.
    $lastmod = (Get-Date).ToUniversalTime().ToString("yyyy-MM-dd'T'HH:mm:ss'Z'", [System.Globalization.CultureInfo]::InvariantCulture)
    return "`t<sitemap>`r`n`t`t<loc>$loc</loc>`r`n`t`t<lastmod>$lastmod</lastmod>`r`n`t</sitemap>"
}
# Returns the path of sitemap file number $sitemapNumber inside the
# script-level $OutDir (sitemap1.xml, sitemap2.xml, ...).
function FormatSitemapFilename([int]$sitemapNumber) {
    $sitemapLeaf = "sitemap$sitemapNumber.xml"
    return [System.IO.Path]::Combine($OutDir, $sitemapLeaf)
}
# Returns a single-line <url> element for a sitemap.
#
# $loc        - absolute URL of the page.
# $lastmod    - last modification date; the <lastmod> element is omitted when
#               this is null or empty.
# $changefreq - value for the <changefreq> element.
#
# All three values are XML-escaped, so characters such as & and < in a document
# id cannot produce an invalid sitemap. Pass raw (unescaped) values.
function FormatSitemapUrlEntry([string]$loc, [string]$lastmod, [string]$changefreq) {
    $locXml = [System.Security.SecurityElement]::Escape($loc)
    $changefreqXml = [System.Security.SecurityElement]::Escape($changefreq)

    $lastmodXml = ""
    if (-not [string]::IsNullOrEmpty($lastmod)) {
        $lastmodXml = "<lastmod>$([System.Security.SecurityElement]::Escape($lastmod))</lastmod>"
    }

    return "<url><loc>$locXml</loc>$lastmodXml<changefreq>$changefreqXml</changefreq></url>"
}
# Builds the Solr select URL for the page that begins at row offset $start.
#
# Requests only the id and created fields as JSON, using the script-level
# $SolrUrl, $SolrQuery and $SolrPageSize.
# NOTE(review): $SolrQuery is inserted verbatim - queries containing reserved
# URL characters (&, #, +) presumably need to be URL-encoded by the caller.
function FormatSolrRequest([int]$start) {
    $queryString = @(
        "fl=id,created"
        "q=$SolrQuery"
        "rows=$SolrPageSize"
        "start=$start"
        "wt=json"
    ) -join "&"
    return "$SolrUrl/select?$queryString"
}
# Returns $true when rows remain beyond the current page.
#
# $total - total number of matching rows.
# $start - row offset of the current page.
# $rows  - number of rows in the current page.
function HasNextPage([int]$total, [int]$start, [int]$rows) {
    $nextOffset = $start + $rows
    return ($nextOffset -lt $total)
}
# Writes $msg as an Error entry (event id 500) to the Application event log
# under the script-level $logsrc source, then echoes it to the host.
function logError([string]$msg) {
    $eventArgs = @{
        LogName   = 'Application'
        Source    = $logsrc
        EventId   = 500
        EntryType = 'Error'
        Message   = $msg
        Category  = 0
    }
    Write-EventLog @eventArgs
    Write-Host $msg
}
# Writes $msg as an Information entry (event id 200) to the Application event
# log under the script-level $logsrc source, then echoes it to the host.
function logInfo([string]$msg) {
    $eventArgs = @{
        LogName   = 'Application'
        Source    = $logsrc
        EventId   = 200
        EntryType = 'Information'
        Message   = $msg
        Category  = 0
    }
    Write-EventLog @eventArgs
    Write-Host $msg
}
# Writes $msg as a Warning entry (event id 400) to the Application event log
# under the script-level $logsrc source, then echoes it to the host.
function logWarning([string]$msg) {
    $eventArgs = @{
        LogName   = 'Application'
        Source    = $logsrc
        EventId   = 400
        EntryType = 'Warning'
        Message   = $msg
        Category  = 0
    }
    Write-EventLog @eventArgs
    Write-Host $msg
}
<#
first page
#>
# prep params
$sitemapCount = 1
$start = 0
# get page response
$reply = FetchPageFromSolr (FormatSolrRequest $start)
# set vars from response
$numFound = $reply.response.numFound
$docsCount = $reply.response.docs.Count
# exit if page fetch couldn't produce a numFound
# ($null goes on the left so the comparison is always scalar)
if ($null -eq $numFound) {
    exit 1
}
# exit if numFound is zero
if ($numFound -eq 0) {
    logWarning "Aborting $IndexBaseUrl sitemap creation. Solr query $SolrQuery found zero results.`r`n$(FormatSolrRequest $start)"
    exit 1
}
# update progress
write-progress -activity "Creating sitemap from Solr query..." -status "Processing $start of $numFound." -percentcomplete (($start / $numFound) * 100)
# delete old sitemap files (including the old index); only now that the first
# page is known to be good. Files are selected before removal because
# Remove-Item emits nothing that could be filtered afterwards.
Get-ChildItem -Path $OutDir -File |
    Where-Object { $_.Name -like "sitemap*.xml" } |
    Remove-Item
# create out file
AppendToSitemapFile -file (FormatSitemapFilename $sitemapCount) -reply $reply
<#
further pages
#>
# Keep fetching until the last page received reaches the total row count.
while (HasNextPage $numFound $start $docsCount) {
    # advance the offset by one page and fetch it
    $start += $SolrPageSize
    $reply = FetchPageFromSolr (FormatSolrRequest $start)
    $docsCount = $reply.response.docs.Count

    # report progress
    $progressArgs = @{
        Activity        = "Creating sitemap from Solr query..."
        Status          = "Processing $start of $numFound."
        PercentComplete = (($start / $numFound) * 100)
    }
    Write-Progress @progressArgs

    # the running row total decides which sitemap file this page goes into
    $sitemapCount = CalculateCurrentSitemapCount ($start + $docsCount)
    $targetFile = FormatSitemapFilename $sitemapCount
    AppendToSitemapFile -file $targetFile -reply $reply
}
<#
sitemap index and sitemap footers
#>
# resolve the index path once; it is reused for every write below
$indexPath = FormatIndexFilename
# start the index file with the XML declaration and the opening root element
Set-Content -Encoding utf8 -Path $indexPath -Value "<?xml version =`"1.0`" encoding=`"UTF-8`"?>`r`n<sitemapindex xmlns=`"http://www.sitemaps.org/schemas/sitemap/0.9`">"
# one pass over the sitemaps: register each in the index and close its <urlset>
$i = 1
while ($i -le $sitemapCount) {
    Add-Content -Encoding utf8 -Path $indexPath -Value (FormatIndexSitemapEntry $i)
    Add-Content -Encoding utf8 -Path (FormatSitemapFilename $i) -Value "</urlset>"
    $i++
}
# close the index root element
Add-Content -Encoding utf8 -Path $indexPath -Value "</sitemapindex>"
logInfo "Created sitemap for $IndexBaseUrl at $indexPath from Solr query $SolrQuery."
exit 0
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment