Skip to content

Instantly share code, notes, and snippets.

@lueck
Created February 10, 2021 20:48
Show Gist options
  • Select an option

  • Save lueck/d7e47d8b481d13b93d2af5213bb8d1de to your computer and use it in GitHub Desktop.

Select an option

Save lueck/d7e47d8b481d13b93d2af5213bb8d1de to your computer and use it in GitHub Desktop.
XSLT for scraping HTML, tolerant against tag soup
<?xml version="1.0" encoding="UTF-8"?>
<!--
How old were poets when they were awarded the Nobel Prize?
This is an example of data acquisition from HTML using XSLT.
It makes use of html2xml.xsl
USAGE:
java -jar saxon.jar -xsl:alternobel.xsl -it:start infile=https://de.wikipedia.org/wiki/Liste_der_Nobelpreistr%C3%A4ger_f%C3%BCr_Literatur
-->
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns:scdh="http://www.wwu.de/scdh#data-aquisition"
xmlns="ad.hoc"
exclude-result-prefixes="xs scdh"
version="3.0">
<!-- Import von html2xml.xsl -->
<xsl:include href="html2xml.xsl"/>
<!-- body ist das Root-Element -->
<xsl:template match="/body">
<!-- Entry point for the parsed page, whose document element is body.
Step 1: pick every table that has, somewhere among its descendants
(children, children of children, and so on), a tr element with
exactly five td children. -->
<xsl:variable name="prizeTables" select="//table[descendant::tr[count(td) eq 5]]"/>
<nobel>
<!-- Step 2: from those tables, hand on every tr at any depth whose
first td child contains at least one Arabic digit. The selected
rows are processed by whatever template rule matches tr. -->
<xsl:apply-templates select="$prizeTables//tr[matches(td[1], '[0-9]+')]"/>
</nobel>
</xsl:template>
<!-- Verarbeitung der oben ausgewählten tr-Elemente -->
<xsl:template match="tr">
<!-- Turns one table row into a verleihung (award) record holding the
award year, the laureate's name, the year of birth and the age at the
time of the award. Values that cannot be read as numbers come out as
NaN, because number() is applied to the extracted strings. -->
<!-- Raw text of the first two cells; only their direct text nodes are used. -->
<xsl:variable name="yearCell" select="string-join(td[1]/text(), '')"/>
<xsl:variable name="lifeCell" select="string-join(td[2]/text(), '')"/>
<!-- Year of the award. -->
<xsl:variable name="year" select="number(normalize-space($yearCell))"/>
<!-- Year of birth: the parenthesised life dates, optionally introduced
by an asterisk, are replaced by their first four-digit number. -->
<xsl:variable name="birthYear" select="replace($lifeCell, '\((\*\s*)?([0-9]{4,4})[–)]+.*', '$2')"/>
<xsl:variable name="born" select="number(normalize-space($birthYear))"/>
<!-- Emit the new XML elements. -->
<verleihung>
<year><xsl:value-of select="$year"/></year>
<!-- The name is the text of the first link in the second cell. -->
<name><xsl:value-of select="td[2]/a[1]"/></name>
<born><xsl:value-of select="$born"/></born>
<age><xsl:value-of select="$year - $born"/></age>
</verleihung>
</xsl:template>
</xsl:stylesheet>
<?xml version="1.0" encoding="UTF-8"?>
<!--
html2xml.xsl - XSLT for parsing HTML as if it was X(HT)ML
USAGE:
java -jar saxon.jar -xsl:html2xml.xsl -it:start [parsed=(true|false)] infile=<URL>
The input file is not given as a source file to be read by saxon's
XML parser. Instead, it is given as a stylesheet parameter called "infile"
and is read as unparsed text in a first step. Everything but the html body
is drained then, and some well known non-wellformed elements are drained,
too. Only then is the resulting string parsed with the XSLT engine's XML
parser. Instead of starting with a source file's document root, the XSLT
engine starts with a named template called "start".
With the stylesheet parameter parsed=false XML parsing by the XSLT engine
can be switched off in order to inspect the string fed to the parser. Note,
that you would have to set the output method to "text" if you want to have
real tags then.
The stylesheet is to be reused with <xsl:include> or <xsl:import>. A template
matching the XPath expression "/body" can be used as a starting point.
The stylesheet loads definitions for named entities from the web, because
named entities must be given explicitly in XML.
There is also a function named scdh:html-body($url as xs:string) that returns
the body of a referenced html document as a string with tag soup drained. You
can use it together with parse-xml() like doc($uri as xs:string?):
parse-xml(scdh:html-body($url))/path/to/valuable/data
NOTE:
If you get a parser error with this stylesheet, you probably will be better
off using tidy (https://www.html-tidy.org), to clean up the tag soup and
then use XSLT.
EXAMPLES:
java -jar saxon-he.jar -xsl:html2xml.xsl -it:start infile="https://de.wikipedia.org/wiki/Liste_der_Nobelpreistr%C3%A4ger_f%C3%BCr_Literatur"
java -jar saxon-he.jar -xsl:html2xml.xsl -it:start infile=my-local.html parsed=false
REQUIRED: saxon >= 9.6
(c) Christian Lück, 2021. LICENSE: MIT
-->
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns:scdh="http://www.wwu.de/scdh#data-aquisition"
exclude-result-prefixes="xs scdh"
version="3.0">
<xsl:output method="xml" indent="yes"/>
<!-- url of infile -->
<xsl:param name="infile" as="xs:string" required="yes"/>
<!-- Note, that you would have to reset output-method to text to get real tags. -->
<xsl:param name="parsed" as="xs:boolean" required="false" select="true()"/>
<!-- Named template "entities": emits, as plain text, a DOCTYPE declaration
for a document element called body. Its internal subset declares the
parameter entity w3centities-f with the public identifier
"-//W3C//ENTITIES Combined Set//EN//XML" and the system identifier
http://www.w3.org/2003/entities/2007/w3centities-f.ent and then references
it. The text is meant to be put in front of a body string before that
string is handed to an XML parser, so that named entities are declared.
The template body is literal text, so its whitespace is part of the
result; keep comments outside of it. -->
<xsl:template name="entities">
&lt;!DOCTYPE body [ &lt;!ENTITY % w3centities-f PUBLIC
"-//W3C//ENTITIES Combined Set//EN//XML"
"http://www.w3.org/2003/entities/2007/w3centities-f.ent" >
%w3centities-f;
]>
</xsl:template>
<!-- the drain is where the tag soup goes down -->
<xsl:function name="scdh:drain" as="xs:string">
<!-- Removes start tags of the void HTML elements img, input, hr and br
that are not written in self-closing form, because such tags are not
well-formed XML. Self-closed tags such as <br/> are left untouched.

Parameter soup: the HTML text to clean.
Returns: the text with the offending tags deleted.

The pattern accepts the bare tag (e.g. <br>), the tag followed by
whitespace only, and the tag followed by attributes whose last
character is not a slash. Whitespace or the closing angle bracket is
required right after the tag name, so that longer element names which
merely start with one of the four names are not touched. -->
<xsl:param name="soup" as="xs:string"/>
<xsl:sequence
select="replace($soup, '&lt;(img|input|hr|br)(\s([^&gt;]*[^/&gt;])?)?&gt;', '')"
/>
</xsl:function>
<xsl:function name="scdh:html-body" as="xs:string">
<!-- Reads the document at the given URL as unparsed text and returns a
string that consists of the DOCTYPE text produced by the named template
"entities", followed by a fresh body element wrapping the content of the
HTML body after it went through scdh:drain.

Parameter url: location of the HTML document, read with unparsed-text().
Returns: the string described above; it is not parsed here.

If the text contains no body start tag or no body end tag, the wrapped
content is empty. -->
<xsl:param name="url" as="xs:string"/>
<xsl:variable name="htmlentities">
<xsl:call-template name="entities"/>
</xsl:variable>
<!-- Everything after the literal "<body"; this still starts with the
attributes of the body tag, if any, and its closing angle bracket. -->
<xsl:variable name="fromBodyTag" as="xs:string"
select="substring-after(unparsed-text($url), '&lt;body')"/>
<!-- Skip to the end of the opening body tag, so that its attributes do
not end up as text inside the new body element, then cut at the end tag. -->
<xsl:variable name="bodyContent" as="xs:string"
select="substring-before(substring-after($fromBodyTag, '&gt;'), '&lt;/body&gt;')"/>
<xsl:sequence select="
concat(
$htmlentities,
'&lt;body&gt;&#xa;',
scdh:drain($bodyContent),
'&lt;/body&gt;&#xa;')"
/>
</xsl:function>
<xsl:template name="start">
<!-- Initial named template. Fetches the cleaned body string for the
stylesheet parameter infile. Depending on the stylesheet parameter
parsed, the string is either parsed as XML and processed by the
template rules, or written to the result as it is. -->
<xsl:variable name="soup" select="scdh:html-body($infile)"/>
<!-- parsed=false: output the string that would be fed to the parser -->
<xsl:if test="not($parsed)">
<xsl:value-of select="$soup"/>
</xsl:if>
<!-- parsed=true: parse the string and apply templates to the document -->
<xsl:if test="$parsed">
<xsl:apply-templates select="parse-xml($soup)"/>
</xsl:if>
</xsl:template>
</xsl:stylesheet>
<?xml version="1.0" encoding="UTF-8"?>
<!-- Mittleres Alter der Literaturnobelpreisträger*innen
zum Zeitpunkt der Preisverleihung, vgl.
https://twitter.com/patrick_sahle/status/1352169831131451394
USAGE:
java -jar saxon.jar -xsl:mittelalter.xsl -it:mittel infile=https://de.wikipedia.org/wiki/Liste_der_Nobelpreistr%C3%A4ger_f%C3%BCr_Literatur
-->
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xpath-default-namespace="ad.hoc"
version="3.0">
<xsl:output method="text"/>
<!-- Wiederverwendung vorliegenden Codes -->
<xsl:import href="alternobel.xsl"/>
<!-- Beispiel für Komposition von Transformationen -->
<xsl:template name="mittel">
<!-- Example of composing transformations: the result of the named
template "start" is captured in a temporary tree, and the average of
its age elements is written out, followed by a line feed. -->
<xsl:variable name="awards">
<xsl:call-template name="start"/>
</xsl:variable>
<!-- Only age elements containing at least one digit take part. -->
<xsl:variable name="ages" select="$awards//age[matches(., '[0-9]+')]"/>
<xsl:value-of select="avg($ages)"/>
<xsl:text>&#xa;</xsl:text>
</xsl:template>
</xsl:stylesheet>
<?xml version="1.0" encoding="UTF-8"?>
<!--
xmlout.xsl - copy of the html-body
USAGE:
java -jar saxon.jar -xsl:xmlout.xsl -it:start [parsed=(true|false)] infile=<URL>
REQUIRED:
saxon >= 9.6
-->
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
version="3.0">
<!-- Serialize the result as indented XML. -->
<xsl:output method="xml" indent="yes"/>
<!-- Nodes for which no template rule matches are shallow-copied, so the
input tree is reproduced in the result. -->
<xsl:mode on-no-match="shallow-copy"/>
<!-- Brings in html2xml.xsl; according to the USAGE note the
transformation is started at the named template "start".
NOTE(review): XSLT 2.0 required xsl:import to precede all other
declarations; confirm that the processor in use accepts this order. -->
<xsl:import href="html2xml.xsl"/>
</xsl:stylesheet>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment