Created
February 1, 2025 17:50
-
-
Save ninmonkey/e5b9bd5c64b0e159bdaff45e1470cbf0 to your computer and use it in GitHub Desktop.
Find archived URLs. This method requires no JavaScript, session, or headless browser; it is a single GET request. (2025/01)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using namespace System.Collections.Generic | |
<# | |
Using: Pwsh 7 | |
Related: | |
For parsing HTML with CSS Selector queries, see this template: <https://gist.github.com/ninmonkey/17f6a1f5b28249b6cb2ba8bc746ae4fb> | |
#> | |
function FindArchivedUrl {
    <#
    .SYNOPSIS
        Using the deleted urls, lookup the archived urls.
    .DESCRIPTION
        This method requires no javascript/session/headless browser, etc.
        It's a single GET request per url against the search.cdc.gov
        'internet_archive/wcms_widget' endpoint.

        The first document of the response is projected into a [pscustomobject].
        When the response contains no document for a url (nothing archived, or the
        request failed), a warning is written and nothing is emitted for that url.
    .PARAMETER OriginalUrl
        One or more original (deleted) urls to look up. Accepts pipeline input.
    .OUTPUTS
        [pscustomobject] with the properties: ArchivedUrl, OriginalUrl, Title,
        DateArchived, DateArchivedDisplay, SystemLanguage, SearchTopicStr,
        SearchTopicSrcH, Excerpt.
    .NOTES
        They are using: "org.apache.solr.search"
    .EXAMPLE
        FindArchivedUrl 'https://www.cdc.gov/coronavirus/2019-ncov/downloads/cases-updates/2021-04-12-all-forecasted-cases-model-data.csv'
    #>
    [CmdletBinding()]
    param(
        [ValidateNotNullOrWhiteSpace()]
        [Parameter(ValueFromPipeline, Mandatory)]
        [Uri[]] $OriginalUrl
    )
    begin {
        # Loop-invariant pieces of the request url.
        $searchEndpoint = 'https://search.cdc.gov/srch/internet_archive/wcms_widget'
        $fieldList      = 'title,date_archived,date_archived_display,original_url,archive_url,excerpt_txt,cdc_sys_lang_str,cdc_topic_str,cdc_topic_srch'
    }
    process {
        foreach ($cUri in $OriginalUrl) {
            # Percent-encode the whole filter so characters such as '&', '#', '+' or '"'
            # inside the original url cannot terminate or corrupt the 'fq' parameter.
            $filterQuery = [Uri]::EscapeDataString( 'original_url:"{0}"' -f $cUri.ToString() )
            [Uri] $requestMetaUrl = '{0}?fl={1}&fq={2}' -f $searchEndpoint, $fieldList, $filterQuery

            # -SkipHttpErrorCheck: non-success statuses do not throw; the code lands in $vStatus.
            $resp = Invoke-RestMethod -Uri $requestMetaUrl -SkipHttpErrorCheck -StatusCodeVariable 'vStatus' # -HttpVersion 2.0
            "Http status: ${vStatus}" | Write-Verbose

            # Only the first matching document is used.
            $record = @( $resp.response.docs )[0]
            if ($null -eq $record) {
                # Avoid emitting an object whose every field is $null.
                "No archived record found for '${cUri}' (Http status: ${vStatus})" | Write-Warning
                continue
            }

            $Info = [ordered]@{
                ArchivedUrl         = [Uri] $record.archive_url
                OriginalUrl         = [Uri] $record.original_url
                Title               = $record.title
                DateArchived        = $record.date_archived
                DateArchivedDisplay = $record.date_archived_display
                SystemLanguage      = $record.cdc_sys_lang_str
                SearchTopicStr      = $record.cdc_topic_str
                SearchTopicSrcH     = $record.cdc_topic_srch
                # FromObject = $OriginalUrl
                # U+2400 is used as a visible placeholder when there is no excerpt.
                Excerpt             = if ( $record.excerpt_txt.Length -gt 0 ) { $record.excerpt_txt }
                                      else { "`u{2400}" }
            }
            [pscustomobject] $Info
        }
    }
}
# Example run: look up the archived copy of a single deleted url.
# The url is piped in; FindArchivedUrl binds it to -OriginalUrl from the pipeline.
[Uri] $DeletedUrl = 'https://www.cdc.gov/coronavirus/2019-ncov/downloads/cases-updates/2021-04-12-all-forecasted-cases-model-data.csv'
$Found = $DeletedUrl | FindArchivedUrl
# $Urls = [Dictionary[string,Uri]]::new() # nice for experimenting
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment