Skip to content

Instantly share code, notes, and snippets.

@peaeater
Created May 30, 2018 16:23
Show Gist options
  • Save peaeater/edcfd073067aaaadc4c51906d1e8efe9 to your computer and use it in GitHub Desktop.
Save peaeater/edcfd073067aaaadc4c51906d1e8efe9 to your computer and use it in GitHub Desktop.
Creates sitemap index with attendant sitemaps from a Solr query.
<#
.SYNOPSIS
Create sitemap index with attendant sitemaps from a Solr query.

.DESCRIPTION
Pages through the results of a Solr query and writes one <url> entry per
document into sitemapN.xml files in OutDir, then writes sitemap_index.xml
listing those files.
A new sitemap is created every 50,000 rows.
Existing sitemap*.xml files in OutDir are deleted before new ones are written.
Log messages are written to the Application event log and echoed to the host.
The script exits with code 1 when the first page yields no numFound value or
the query matches zero documents, and with code 0 otherwise.

.PARAMETER ChangeFrequency
Value written into the <changefreq> element of every URL entry.

.PARAMETER IndexBaseUrl
Base URL under which the sitemapN.xml files are published; used to build the
<loc> of each entry in the sitemap index.

.PARAMETER Logsrc
Event source name used when writing to the Application event log.

.PARAMETER OutDir
Directory in which the sitemap files and the sitemap index are written.

.PARAMETER PermalinkBaseUrl
Base URL to which each document id is appended to form the <loc> of a URL entry.

.PARAMETER SolrPageSize
Number of rows requested from Solr per page (the "rows" request parameter).

.PARAMETER SolrQuery
Solr query sent as the "q" request parameter.

.PARAMETER SolrUrl
URL of the Solr core; "/select" is appended to it to build each request.
#>
param (
[string]$ChangeFrequency = "weekly",
[string]$IndexBaseUrl = "http://andi.andornot.com/",
[string]$Logsrc = "Andi Solr Update",
[string]$OutDir = ".\",
[string]$PermalinkBaseUrl = "http://andi.andornot.com/en/permalink/",
[int]$SolrPageSize = 100,
[string]$SolrQuery = "*:*",
[string]$SolrUrl = "http://localhost:8983/solr/core1"
)
<#
Appends one URL entry per document in a Solr reply to a sitemap file.

If $file does not exist yet it is created with the XML declaration and the
opening <urlset> tag; the closing tag is not written here.

Parameters:
  $file  - path of the sitemap file to create or append to.
  $reply - parsed Solr response; $reply.response.docs is enumerated and the
           "id" and "created" properties of each document are read.

Reads the script-level $PermalinkBaseUrl and $ChangeFrequency values.
#>
function AppendToSitemapFile([string]$file, [psobject]$reply) {
    if (-not (Test-Path $file -PathType Leaf)) {
        Set-Content -Encoding utf8 -Path $file -Value "<?xml version=`"1.0`" encoding=`"UTF-8`"?>`r`n<urlset xmlns=`"http://www.sitemaps.org/schemas/sitemap/0.9`">"
    }

    # The base URL is the same for every document, so trim it once.
    $permalinkBase = $PermalinkBaseUrl.TrimEnd("/")

    # Build all entries first so the file is opened once per page
    # instead of once per document.
    $entries = @(foreach ($doc in $reply.response.docs) {
        FormatSitemapUrlEntry -loc "$permalinkBase/$($doc.id)" -lastmod $($doc.created) -changefreq $ChangeFrequency
    })

    # Nothing to write for an empty (or missing) document list.
    if ($entries.Count -gt 0) {
        Add-Content -Encoding utf8 -Path $file -Value $entries
    }
}
<#
Returns the 1-based number of the sitemap file that row number $rowsFetched
belongs to, given that each sitemap holds at most $maxRowsPerSitemap rows.

Parameters:
  $rowsFetched       - count of rows fetched so far (1-based position of the
                       last row); values of zero or less map to sitemap 1.
  $maxRowsPerSitemap - rows per sitemap file; defaults to 50000.

Rows 1..50000 map to 1, rows 50001..100000 map to 2, and so on.
#>
function CalculateCurrentSitemapCount([int]$rowsFetched, [int]$maxRowsPerSitemap = 50000) {
    if ($maxRowsPerSitemap -lt 1) {
        throw "maxRowsPerSitemap must be at least 1."
    }
    # Ceiling of rows / limit gives the 1-based bucket; dividing by (limit + 1)
    # would let every bucket after the first grow by one extra row.
    $bucket = [Math]::Ceiling([decimal]$rowsFetched / $maxRowsPerSitemap)
    return [Math]::Max([decimal]1, $bucket)
}
<#
Downloads the given URL and parses the response body as JSON.

Parameters:
  $url - full request URL.

Returns the object produced by ConvertFrom-Json. When the download or the
JSON parsing throws, the exception is passed to logError and nothing is
returned, so the caller receives $null.
#>
function FetchPageFromSolr([string]$url) {
    $client = $null
    try {
        $client = New-Object System.Net.WebClient
        # Decode the body as UTF-8 rather than the WebClient default encoding.
        $client.Encoding = [System.Text.Encoding]::UTF8
        return $client.DownloadString($url) | ConvertFrom-Json
    }
    catch {
        logError $_.Exception
    }
    finally {
        # WebClient is disposable; release it on both success and failure.
        if ($null -ne $client) {
            $client.Dispose()
        }
    }
}
# Builds the path of the sitemap index file inside the script-level $OutDir.
function FormatIndexFilename() {
    $indexLeaf = "sitemap_index.xml"
    $indexPath = [System.IO.Path]::Combine($OutDir, $indexLeaf)
    return $indexPath
}
<#
Builds the <sitemap> node that the sitemap index holds for one sitemap file.

Parameters:
  $sitemapNumber - number of the sitemap file (sitemapN.xml).

Returns a string with a <loc> built from the script-level $IndexBaseUrl and a
<lastmod> holding the current UTC time.
#>
function FormatIndexSitemapEntry([int]$sitemapNumber) {
    # "HH" is the 24-hour clock ("hh" would fold 13:00 into 01:00). The literal
    # T and Z are quoted and the invariant culture keeps ":" as the separator.
    $lastmod = (Get-Date).ToUniversalTime().ToString("yyyy-MM-dd'T'HH:mm:ss'Z'", [System.Globalization.CultureInfo]::InvariantCulture)
    return "`t<sitemap>
<loc>$($IndexBaseUrl.TrimEnd("/"))/sitemap$sitemapNumber.xml</loc>
<lastmod>$lastmod</lastmod>
</sitemap>"
}
# Builds the path of sitemap file number $sitemapNumber inside the
# script-level $OutDir.
function FormatSitemapFilename([int]$sitemapNumber) {
    $sitemapLeaf = "sitemap" + $sitemapNumber + ".xml"
    $sitemapPath = [System.IO.Path]::Combine($OutDir, $sitemapLeaf)
    return $sitemapPath
}
<#
Builds a single-line <url> sitemap entry.

Parameters:
  $loc        - page URL, written into <loc>.
  $lastmod    - last-modified value; the <lastmod> element is omitted when
                this is null or empty.
  $changefreq - value written into <changefreq>.

All three values are XML entity-escaped, so characters such as & < > in a
URL cannot produce a malformed sitemap.
#>
function FormatSitemapUrlEntry([string]$loc, [string]$lastmod, [string]$changefreq) {
    $entry = "<url><loc>$([System.Security.SecurityElement]::Escape($loc))</loc>"
    if (-not [string]::IsNullOrEmpty($lastmod)) {
        $entry += "<lastmod>$([System.Security.SecurityElement]::Escape($lastmod))</lastmod>"
    }
    $entry += "<changefreq>$([System.Security.SecurityElement]::Escape($changefreq))</changefreq></url>"
    return $entry
}
<#
Builds the Solr select URL for one page of results.

Parameters:
  $start - zero-based offset of the first row to request.

Reads the script-level $SolrUrl, $SolrQuery and $SolrPageSize. Only the "id"
and "created" fields are requested, as JSON.

NOTE(review): $SolrQuery is percent-encoded here, so it is expected to be the
raw, unencoded query text; confirm no caller passes an already-encoded value.
#>
function FormatSolrRequest([int]$start) {
    # Encode the query so characters such as & + # or spaces cannot break or
    # extend the query string.
    $encodedQuery = [System.Uri]::EscapeDataString($SolrQuery)
    return "$SolrUrl/select?fl=id,created&q=$encodedQuery&rows=$SolrPageSize&start=$start&wt=json"
}
# Tells whether rows remain after the current page: true when the offset just
# past the current page ($start + $rows) is still below $total.
function HasNextPage([int]$total, [int]$start, [int]$rows) {
    $nextOffset = $start + $rows
    if ($nextOffset -lt $total) {
        return $true
    }
    return $false
}
# Writes $msg as an Error entry (event id 500) to the Application event log
# under the script-level $logsrc source, then echoes it to the host.
function logError([string]$msg) {
    $eventArgs = @{
        LogName   = "Application"
        Source    = $logsrc
        EventId   = 500
        EntryType = "Error"
        Message   = $msg
        Category  = 0
    }
    Write-EventLog @eventArgs
    Write-Host $msg
}
# Writes $msg as an Information entry (event id 200) to the Application event
# log under the script-level $logsrc source, then echoes it to the host.
function logInfo([string]$msg) {
    $eventArgs = @{
        LogName   = "Application"
        Source    = $logsrc
        EventId   = 200
        EntryType = "Information"
        Message   = $msg
        Category  = 0
    }
    Write-EventLog @eventArgs
    Write-Host $msg
}
# Writes $msg as a Warning entry (event id 400) to the Application event log
# under the script-level $logsrc source, then echoes it to the host.
function logWarning([string]$msg) {
    $eventArgs = @{
        LogName   = "Application"
        Source    = $logsrc
        EventId   = 400
        EntryType = "Warning"
        Message   = $msg
        Category  = 0
    }
    Write-EventLog @eventArgs
    Write-Host $msg
}
<#
first page
#>
# Fetch the first page, validate the result count, clear out old sitemap
# files and start sitemap 1 with the first page of documents.
# prep params
$sitemapCount = 1
$start = 0
# get page response
$reply = FetchPageFromSolr (FormatSolrRequest $start)
# set vars from response
$numFound = $reply.response.numFound
$docsCount = $reply.response.docs.Count
# exit if page fetch couldn't produce a numFound
# ($null goes on the left so the test is a plain null check)
if ($null -eq $numFound) {
    exit 1
}
# exit if numFound is zero
if ($numFound -eq 0) {
    logWarning "Aborting $IndexBaseUrl sitemap creation. Solr query $SolrQuery found zero results.`r`n$(FormatSolrRequest $start)"
    exit 1
}
# update progress
write-progress -activity "Creating sitemap from Solr query..." -status "Processing $start of $numFound." -percentcomplete (($start / $numFound) * 100)
# delete old sitemap files
# The container filter has to run before Remove-Item: Remove-Item emits no
# objects, so filtering its output never excluded directories.
Get-ChildItem -Path $OutDir |
    Where-Object { (-not $_.PSIsContainer) -and ($_.Name -like "sitemap*.xml") } |
    Remove-Item
# create out file
AppendToSitemapFile -file (FormatSitemapFilename $sitemapCount) -reply $reply
<#
further pages
#>
# Fetch each remaining page and append its documents to the sitemap file that
# the running row count falls into.
while (HasNextPage $numFound $start $docsCount) {
    # prep params
    $start = $start + $SolrPageSize
    # get page response
    $reply = FetchPageFromSolr (FormatSolrRequest $start)
    # A failed fetch returns nothing. Skip that page explicitly instead of
    # treating it as an empty page; $docsCount keeps the size of the last
    # successful page so paging stops at the end of the result set.
    if ($null -eq $reply) {
        logWarning "Skipping Solr page starting at row $start of $numFound because it could not be fetched.`r`n$(FormatSolrRequest $start)"
        continue
    }
    # set vars from response
    $docsCount = $reply.response.docs.Count
    # update progress (clamped: Write-Progress rejects values above 100)
    $percentComplete = [Math]::Min(100, ($start / $numFound) * 100)
    write-progress -activity "Creating sitemap from Solr query..." -status "Processing $start of $numFound." -percentcomplete $percentComplete
    # append to out file
    $sitemapCount = CalculateCurrentSitemapCount ($start + $docsCount)
    AppendToSitemapFile -file (FormatSitemapFilename $sitemapCount) -reply $reply
}
<#
sitemap index and sitemap footers
#>
# Close every sitemap file that was written and build the sitemap index that
# lists them, then log the result and exit successfully.
$indexFile = FormatIndexFilename
# index header
$indexLines = @("<?xml version =`"1.0`" encoding=`"UTF-8`"?>`r`n<sitemapindex xmlns=`"http://www.sitemaps.org/schemas/sitemap/0.9`">")
# for each sitemap, append node to index and add closing tag to sitemap files
for ($i = 1; $i -le $sitemapCount; $i++) {
    $sitemapFile = FormatSitemapFilename $i
    # Appending the footer to a sitemap that was never started would create a
    # file holding only the closing tag, so such numbers are left out.
    if (-not (Test-Path $sitemapFile -PathType Leaf)) {
        logWarning "Sitemap file $sitemapFile was not created; leaving it out of the sitemap index."
        continue
    }
    # append node to index
    $indexLines += FormatIndexSitemapEntry $i
    # append sitemap footer
    Add-Content -Encoding utf8 -Path $sitemapFile -Value "</urlset>"
}
# append index footer
$indexLines += "</sitemapindex>"
# create index file in a single write (one line per collected element)
Set-Content -Encoding utf8 -Path $indexFile -Value $indexLines
logInfo "Created sitemap for $IndexBaseUrl at $(FormatIndexFilename) from Solr query $SolrQuery."
exit 0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment