Skip to content

Instantly share code, notes, and snippets.

@pwin
Last active December 20, 2015 14:19
Show Gist options
  • Save pwin/6145633 to your computer and use it in GitHub Desktop.
Save pwin/6145633 to your computer and use it in GitHub Desktop.
XQuery site scraper, written for BaseX.
xquery version "3.0";
declare namespace xhtml="http://www.w3.org/1999/xhtml";
(:~
 : Issues an HTTP GET for the given URL through http:send-request and
 : returns whatever that call yields, unchanged.
 :
 : The main query in this file reads item [2] of the result as the page body.
 :
 : NOTE(review): the `headers` attribute on the request element does not look
 : like a standard http:request attribute; confirm the processor accepts or
 : ignores it.
 :
 : @param $url address of the page to fetch
 : @return the result sequence of http:send-request
 :)
declare function local:get-page($url){
  let $request := <http:request method='get' headers='false' status-only='false'/>
  return http:send-request($request, $url)
};
(:~
 : Walks the "hit detailed" div children of $n. For every hit the title and
 : summary are bound, but nothing is emitted: the function always returns the
 : empty sequence.
 :
 : NOTE(review): $title and $summary use the same selector
 : (p[@class="summary"]); the title binding looks like a copy-paste slip, and
 : the function appears to be an unfinished stub. Behaviour is left as is.
 :
 : @param $n node whose child divs with class "hit detailed" are visited
 : @return always the empty sequence
 :)
declare function local:get-details($n as node()){
  for $hit in $n/xhtml:div[@class = "hit detailed"]
  let $title := $hit/descendant::xhtml:p[@class = "summary"],
      $summary := $hit/descendant::xhtml:p[@class = "summary"]
  return ()
};
(:~
 : Turns a file-details table into an <r> record.
 :
 : Rows 1, 2 and 3 of the table supply the title, description and file-details
 : cells (copied as XHTML td elements). Every link with class "newWin" inside
 : the file-details cell yields one <doc> element holding the link's href
 : prefixed with the site root.
 :
 : Changes from the previous version:
 :  - several "newWin" links in one cell used to raise a type error, because
 :    xs:string() was applied to a sequence of hrefs; each link now gets its
 :    own <doc>.
 :  - when no link is present, an empty <doc/> is emitted instead of a <doc>
 :    containing only the site root, which pointed at no file.
 : The output for the single-link case is unchanged.
 :
 : @param $t the table element (or node containing the rows) to parse
 : @return an <r> element with title, desc, filedetails and doc children
 :)
declare function local:parseTable($t){
  let $title := $t//xhtml:tr[1]/xhtml:td
  let $desc := $t//xhtml:tr[2]/xhtml:td
  let $filedetails := $t//xhtml:tr[3]/xhtml:td
  let $hrefs := $filedetails//xhtml:a[@class="newWin"]/@href
  return
    <r>
      <title>{$title}</title>
      <desc>{$desc}</desc>
      <filedetails>{$filedetails}</filedetails>
      {
        if (empty($hrefs)) then
          <doc/>
        else
          (: one <doc> per link so multi-file rows no longer fail :)
          for $href in $hrefs
          return <doc>{xs:anyURI(concat('http://www.scotland.gov.uk', xs:string($href)))}</doc>
      }
    </r>
};
(:~
 : Dispatches each item of $u by type: XHTML table elements are converted with
 : local:parseTable, anything else is passed through unchanged.
 :
 : The previous version applied typeswitch to the whole sequence. The
 : element(xhtml:table) case only matches a sequence of exactly one item, so a
 : page with several tables fell through to the default branch and the raw
 : tables were returned unparsed. Iterating item by item handles that case.
 : Results for a single item or an empty sequence are unchanged.
 :
 : @param $u zero or more items, typically tables or a fallback <r> element
 : @return one result per input item, in input order
 :)
declare function local:ts($u){
  for $item in $u
  return
    typeswitch($item)
      case element(xhtml:table) return local:parseTable($item)
      default return $item
};
(:~
 : Builds one <report> element per search hit in $q.
 :
 : For each hit the summary, title link, date and the links in the three
 : "extrainfo" sub-divs are copied into the report. The page the title link
 : points to is then loaded with doc(), and its file table(s) are passed to
 : local:ts; the result is appended to the report. If loading fails, an
 : <r><doc>…</doc></r> stub carrying the URL is appended instead.
 :
 : Fixes in this version:
 :  - the table predicate was the string literal ["dg file"], which is always
 :    true and therefore selected every table on the page; it now tests
 :    @class = "dg file".
 :    NOTE(review): assumes "dg file" is the full class attribute value, in
 :    line with the other exact @class comparisons in this file; confirm
 :    against the live markup.
 :  - only the first href of the title link is used, so a hit with more than
 :    one title link no longer raises a type error outside the try/catch.
 :
 : @param $q sequence of hit elements (divs with class "hit detailed")
 : @return one <report> element per hit
 :)
declare function local:build-report($q){
  for $i in $q
  let $summ := $i/xhtml:p[@class="summary"]
  let $title := $i/xhtml:a[@class="title"]
  let $referring-page-url := $i/xhtml:a[@class="title"]
  let $date := $i/xhtml:span[@class="date"]
  let $extra := $i/xhtml:div[@class="extrainfo"]
  let $subject := $extra/xhtml:div[1]//xhtml:a
  let $doctype := $extra/xhtml:div[2]//xhtml:a
  let $type := $extra/xhtml:div[3]//xhtml:a
  (: [1] keeps xs:anyURI from failing on multiple title links :)
  let $u1 := xs:anyURI(($referring-page-url/@href)[1])
  let $u :=
    try {
      doc($u1)//xhtml:table[@class = "dg file"]
    } catch * {
      (: page could not be loaded or parsed: keep at least its URL :)
      <r><doc>{$u1}</doc></r>
    }
  let $details := local:ts($u)
  return
    <report>
      <summary>{$summ}</summary>
      <t>{$title}</t>
      <refPage>{$referring-page-url}</refPage>
      <date>{$date}</date>
      <subject>{$subject}</subject>
      <doctype>{$doctype}</doctype>
      <type>{$type}</type>
      {$details}
    </report>
};
(: Main query: requests 401 result pages of the search (offsets 0, 10, …, 4000
   passed as the "b" parameter), takes the "hit detailed" divs found under the
   div with id "hits" in the second item of each response, and wraps the
   reports built from them in a single <out> element. :)
<out>{
  for $page in 0 to 400
  let $offset := $page * 10
  let $response :=
    local:get-page(concat('http://search1.scotland.gov.uk/Scotland?action=search&amp;q=xls&amp;n=All&amp;%24rcexpanded=false&amp;b=', $offset))
  let $hits := $response[2]//xhtml:div[@id="hits"]//xhtml:div[@class="hit detailed"]
  return local:build-report($hits)
}</out>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment