Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save ninmonkey/e5b9bd5c64b0e159bdaff45e1470cbf0 to your computer and use it in GitHub Desktop.
Save ninmonkey/e5b9bd5c64b0e159bdaff45e1470cbf0 to your computer and use it in GitHub Desktop.
Find archived urls. This method requires no javascript/session/headless browser, etc. 2025/01
using namespace System.Collections.Generic
<#
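Using: Pwsh 7 (7.4 or later: the [ValidateNotNullOrWhiteSpace()] attribute used below was added in PowerShell 7.4)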
Related:
For parsing HTML with CSS Selector queries, see this template: <https://gist.github.com/ninmonkey/17f6a1f5b28249b6cb2ba8bc746ae4fb>
#>
function FindArchivedUrl {
    <#
    .SYNOPSIS
        Look up the archived copy of one or more deleted URLs.
    .DESCRIPTION
        This method requires no javascript/session/headless browser, etc.
        It's a single GET request per URL against the search endpoint.

        Each input URL is percent-encoded before it is placed inside the
        `fq=original_url:"..."` filter, so URLs that contain their own query
        string or fragment (`&`, `#`, `+`, ...) are searched for as a whole
        instead of being split into extra request parameters.
    .PARAMETER OriginalUrl
        One or more original (deleted) URLs to look up. Accepts pipeline input.
    .OUTPUTS
        [pscustomobject] One object per input URL with the properties
        ArchivedUrl, OriginalUrl, Title, DateArchived, DateArchivedDisplay,
        SystemLanguage, SearchTopicStr, SearchTopicSrcH and Excerpt.
        When the response contains no record, the object is still emitted
        (with empty fields) and a warning is written.
    .NOTES
        They are using: "org.apache.solr.search"
    .EXAMPLE
        FindArchivedUrl 'https://www.cdc.gov/coronavirus/2019-ncov/downloads/cases-updates/2021-04-12-all-forecasted-cases-model-data.csv'
    #>
    [CmdletBinding()]
    param(
        [ValidateNotNullOrWhiteSpace()]
        [Parameter(ValueFromPipeline, Mandatory)]
        [Uri[]] $OriginalUrl
    )
    begin {
        $searchEndpoint = 'https://search.cdc.gov/srch/internet_archive/wcms_widget'
        # Fields requested from the search index (the `fl` parameter).
        $fieldList = @(
            'title'
            'date_archived'
            'date_archived_display'
            'original_url'
            'archive_url'
            'excerpt_txt'
            'cdc_sys_lang_str'
            'cdc_topic_str'
            'cdc_topic_srch'
        ) -join ','
    }
    process {
        foreach ($cUri in $OriginalUrl) {
            # Encode the URL so reserved characters cannot break out of the
            # fq value. [string] gives the same text that "${cUri}" would.
            $escapedUrl = [Uri]::EscapeDataString( [string] $cUri )
            [Uri] $requestMetaUrl = "${searchEndpoint}?fl=${fieldList}&fq=original_url:%22${escapedUrl}%22"

            # -SkipHttpErrorCheck: a non-2xx response does not throw; the
            # status code is captured in $vStatus instead.
            $resp = Invoke-RestMethod -Uri $requestMetaUrl -SkipHttpErrorCheck -StatusCodeVariable 'vStatus'
            "Http status: ${vStatus}" | Write-Verbose

            # Only the first matching document is used.
            $record = @( $resp.response.docs )[0]
            if ($null -eq $record) {
                Write-Warning "No archived record found for '${cUri}' (HTTP status: ${vStatus})"
            }

            $Info = [ordered]@{
                ArchivedUrl         = [Uri] $record.archive_url
                OriginalUrl         = [uri] $record.original_url
                Title               = $record.title
                DateArchived        = $record.date_archived
                DateArchivedDisplay = $record.date_archived_display
                SystemLanguage      = $record.cdc_sys_lang_str
                SearchTopicStr      = $record.cdc_topic_str
                SearchTopicSrcH     = $record.cdc_topic_srch
                # U+2400 is used as a visible placeholder for a missing/empty excerpt.
                Excerpt             = if ( $record.excerpt_txt.Length -gt 0 ) { $record.excerpt_txt } else { "`u{2400}" }
            }
            [pscustomobject]$Info
        }
    }
}
# Example run: look up the archived record for a single deleted URL.
[Uri] $DeletedUrl = 'https://www.cdc.gov/coronavirus/2019-ncov/downloads/cases-updates/2021-04-12-all-forecasted-cases-model-data.csv'
$Found = FindArchivedUrl -OriginalUrl $DeletedUrl
# $Urls = [Dictionary[string,Uri]]::new() # nice for experimenting
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment