Created
February 9, 2012 16:43
-
-
Save kardeiz/1781007 to your computer and use it in GitHub Desktop.
PowerShell script to harvest all Dublin Core (DC) metadata from an OAI-PMH server. Writes the merged records to an XML file and converts them to a CSV file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Harvest every Dublin Core (oai_dc) record exposed by an OAI-PMH server.
#
# Steps:
#   1. Request the first ListRecords page.
#   2. Follow resumptionTokens, appending each later page's ListRecords
#      children to the first page's ListRecords node.
#   3. Write the merged document to disk as tab-indented XML.
#   4. Transform that XML file to CSV with an XSLT stylesheet.
#
# The three paths below are placeholders and must be edited before running.
# Prefer absolute paths: .NET resolves relative paths against the process
# working directory, which is not necessarily the current PowerShell location.

# define the oai server and base request string
$baseurl = "http://digitalrepository.smu.edu/cgi/oai2.cgi/OAI-script?"
$payload = "verb=ListRecords&metadataPrefix=oai_dc"

# file locations used by the write and transform steps
$xmlpath = "path to output xml"
$csvpath = "path to output csv"
$xslpath = "path to stylesheet, e.g., oai2csv.xsl"

# some servers reject oai-pmh requests with no user-agent specified
$useragent = "PowerShell Script"

# namespace-agnostic XPath for the ListRecords node and its resumptionToken
$listxpath = "/*/*[local-name()='ListRecords']"
$rtxpath   = $listxpath + "/*[local-name()='resumptionToken']"

# set up the webclient to grab the xml
$wc = New-Object System.Net.WebClient
try
{
    $wc.Encoding = [Text.Encoding]::UTF8

    # set the first request and load the xml into variable.
    # note: this assumes server response is xml
    $wc.Headers["User-Agent"] = $useragent
    $url = $baseurl + $payload
    [xml]$oaicont = $wc.DownloadString($url)

    # define the node containing all records. we'll append records from
    # additional pages to this node
    $apnode = $oaicont.SelectSingleNode($listxpath)

    # find the resumptionToken
    $rt = $oaicont.SelectSingleNode($rtxpath)

    # do this while found pages have resumptionTokens. a missing token element
    # or one without child nodes (no token text) ends the loop.
    while (($null -ne $rt) -and $rt.HasChildNodes)
    {
        # add a timeout so the OAI server doesn't get mad
        Start-Sleep -Seconds 10

        # set new URL request based on rt. the token is opaque server data and
        # may contain characters that are special in a query string, so it is
        # percent-encoded before being appended.
        $url = $baseurl + "verb=ListRecords&resumptionToken=" + [uri]::EscapeDataString($rt.InnerText)

        # the header is set again before every download in case the client
        # does not carry it over; assigning (rather than Add) avoids a
        # duplicated value if it was kept.
        $wc.Headers["User-Agent"] = $useragent
        [xml]$oainext = $wc.DownloadString($url)

        if ($null -eq $oainext.SelectSingleNode($listxpath))
        {
            # no ListRecords in the response: nothing to append and no next
            # token, so the loop will end here with a partial harvest
            Write-Warning "No ListRecords element returned for $url; harvest may be incomplete."
        }

        # get the new resumptionToken
        $rt = $oainext.SelectSingleNode($rtxpath)

        # get every child element of this page's ListRecords node (this
        # includes the page's resumptionToken element, if any) and append a
        # deep copy of each to the apnode
        $innerel = $oainext.SelectNodes($listxpath + "/*")
        foreach ($inone in $innerel)
        {
            $imported = $oaicont.ImportNode($inone, $true)
            # AppendChild returns the appended node; discard it so it is not
            # written to the output stream for every record
            [void]$apnode.AppendChild($imported)
        }
    }
}
finally
{
    # release the client even when a download or XML parse fails
    $wc.Dispose()
}

# build a pretty printing xml writer
$xws = New-Object System.Xml.XmlWriterSettings
$xws.Indent = $true
$xws.IndentChars = "`t"
$xtw = [System.Xml.XmlWriter]::Create($xmlpath, $xws)
try
{
    $oaicont.WriteContentTo($xtw)
    $xtw.Flush()
}
finally
{
    # always close the output file so the transform below can open it
    $xtw.Dispose()
}

# transform this xml file to csv using the xslt stylesheet.
# TrustedXslt enables the document() function and embedded script blocks in
# the stylesheet, so only load a stylesheet you trust.
$xslt = New-Object System.Xml.Xsl.XslCompiledTransform
$xslt.Load($xslpath, [System.Xml.Xsl.XsltSettings]::TrustedXslt, (New-Object System.Xml.XmlUrlResolver))
$xslt.Transform($xmlpath, $csvpath)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment