Created
February 10, 2021 20:48
-
-
Save lueck/d7e47d8b481d13b93d2af5213bb8d1de to your computer and use it in GitHub Desktop.
XSLT for scraping HTML, tolerant against tag soup
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| <?xml version="1.0" encoding="UTF-8"?> | |
| <!-- | |
| How old were the poets when they were awarded the Nobel Prize? | |
| This is an example of data acquisition from HTML using XSLT. | |
| It makes use of html2xml.xsl | |
| USAGE: | |
| java -jar saxon.jar -xsl:alternobel.xsl -it:start infile=https://de.wikipedia.org/wiki/Liste_der_Nobelpreistr%C3%A4ger_f%C3%BCr_Literatur | |
| --> | |
| <xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" | |
| xmlns:xs="http://www.w3.org/2001/XMLSchema" | |
| xmlns:scdh="http://www.wwu.de/scdh#data-aquisition" | |
| xmlns="ad.hoc" | |
| exclude-result-prefixes="xs scdh" | |
| version="3.0"> | |
| <!-- Import von html2xml.xsl --> | |
| <xsl:include href="html2xml.xsl"/> | |
| <!-- Entry rule: body is the document element of the parsed input. --> | |
| <xsl:template match="/body"> | |
|   <!-- Candidate tables: every table that has, at any depth, a tr with | |
|        exactly five td children. --> | |
|   <xsl:variable name="awardTables" select="/descendant::table[descendant::tr[count(td) eq 5]]"/> | |
|   <nobel> | |
|     <!-- From those tables take every tr, at any depth, whose first td | |
|          contains at least one ASCII digit, and pass it on to the | |
|          matching template rule. --> | |
|     <xsl:apply-templates select="$awardTables/descendant::tr[matches(td[1], '[0-9]+')]"/> | |
|   </nobel> | |
| </xsl:template> | |
| <!-- Turns one of the table rows selected above into a verleihung record. --> | |
| <xsl:template match="tr"> | |
|   <!-- Only text nodes that are direct children of the cells are read, | |
|        so text nested in child elements of a cell is not part of these strings. --> | |
|   <xsl:variable name="yearCell" select="string-join(td[1]/text(), '')"/> | |
|   <xsl:variable name="personCell" select="string-join(td[2]/text(), '')"/> | |
|   <!-- year of the award: the first cell's own text read as a number --> | |
|   <xsl:variable name="year" select="number(normalize-space($yearCell))"/> | |
|   <!-- year of birth: the four digits after "(" or "(*" in the second cell; | |
|        NaN when what is left after the replacement is not a number --> | |
|   <xsl:variable name="born" select="number(normalize-space(replace($personCell, '\((\*\s*)?([0-9]{4,4})[–)]+.*', '$2')))"/> | |
|   <!-- output of the new XML elements --> | |
|   <verleihung> | |
|     <year><xsl:value-of select="$year"/></year> | |
|     <name><xsl:value-of select="td[2]/a[1]"/></name> | |
|     <born><xsl:value-of select="$born"/></born> | |
|     <age><xsl:value-of select="$year - $born"/></age> | |
|   </verleihung> | |
| </xsl:template> | |
| </xsl:stylesheet> |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| <?xml version="1.0" encoding="UTF-8"?> | |
| <!-- | |
| html2xml.xsl - XSLT for parsing HTML as if it was X(HT)ML | |
| USAGE: | |
| java -jar saxon.jar -xsl:html2xml.xsl -it:start [parsed=(true|false)] infile=<URL> | |
| The input file is not given as a source file to be read by saxon's | |
| XML parser. Instead, it is given as a stylesheet parameter called "infile" | |
| and is read as unparsed text in a first step. Everything but the html body | |
| is then drained, and some well-known elements that are not well-formed are | |
| drained, too. Only then is the resulting string parsed with the XSLT engine's XML | |
| parser. Instead of starting with a source file's document root, the XSLT | |
| engine starts with a named template called "start". | |
| With the stylesheet parameter parsed=false XML parsing by the XSLT engine | |
| can be switched off in order to inspect the string fed to the parser. Note | |
| that you would have to set the output method to "text" if you want to have | |
| real tags then. | |
| The stylesheet is to be reused with <xsl:include> or <xsl:import>. A template | |
| matching the XPath expression "/body" can be used as a starting point. | |
| The stylesheet loads definitions for named entities from the web, because | |
| named entities must be declared explicitly in XML. | |
| There is also a function named scdh:html-body($url as xs:string) that returns | |
| the body of a referenced html document as a string with tag soup drained. You | |
| can use it together with parse-xml() like doc($uri as xs:string?): | |
| parse-xml(scdh:html-body($url))/path/to/valuable/data | |
| NOTE: | |
| If you get a parser error with this stylesheet, you probably will be better | |
| off using tidy (https://www.html-tidy.org), to clean up the tag soup and | |
| then use XSLT. | |
| EXAMPLES: | |
| java -jar saxon-he.jar -xsl:html2xml.xsl -it:start infile="https://de.wikipedia.org/wiki/Liste_der_Nobelpreistr%C3%A4ger_f%C3%BCr_Literatur" | |
| java -jar saxon-he.jar -xsl:html2xml.xsl -it:start infile=my-local.html parsed=false | |
| REQUIRED: saxon >= 9.6 | |
| (c) Christian Lück, 2021. LICENSE: MIT | |
| --> | |
| <xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" | |
| xmlns:xs="http://www.w3.org/2001/XMLSchema" | |
| xmlns:scdh="http://www.wwu.de/scdh#data-aquisition" | |
| exclude-result-prefixes="xs scdh" | |
| version="3.0"> | |
| <!-- Serialize the result as indented XML. --> | |
| <xsl:output method="xml" indent="yes"/> | |
| <!-- infile: URL of the HTML document to read; required. It is passed to scdh:html-body() by the "start" template. --> | |
| <xsl:param name="infile" as="xs:string" required="yes"/> | |
| <!-- parsed: when true (the default) the "start" template parses the drained string as XML and applies templates to it; when false the string itself is output. Note that you would have to reset the output method to text to get real tags. --> | |
| <xsl:param name="parsed" as="xs:boolean" required="false" select="true()"/> | |
| <!-- Named template "entities": emits, as text, a DOCTYPE declaration for a | |
| body document whose internal subset pulls in the W3C combined entity set, | |
| so that HTML named entities can be resolved when the string is parsed as XML. | |
| The declaration is written with &lt; because it is string data meant for | |
| parse-xml(), not markup of this stylesheet; a raw DOCTYPE inside a | |
| template body would make the stylesheet itself ill-formed. | |
| NOTE(review): the entity set is referenced by an http URL; presumably it is | |
| fetched whenever the resulting string is parsed - confirm that network | |
| access is acceptable where this runs. --> | |
| <xsl:template name="entities"> | |
| &lt;!DOCTYPE body [ &lt;!ENTITY % w3centities-f PUBLIC | |
| "-//W3C//ENTITIES Combined Set//EN//XML" | |
| "http://www.w3.org/2003/entities/2007/w3centities-f.ent" > | |
| %w3centities-f; | |
| ]> | |
| </xsl:template> | |
| <!-- the drain is where the tag soup goes down | |
| scdh:drain($soup): removes unclosed void-element tags from an HTML string | |
| so that the remainder can be handed to an XML parser. | |
| $soup - HTML markup as a string. | |
| Returns $soup without every img, input, hr or br start tag that is not | |
| self-closed; tags ending in "/>" are kept because they are well-formed. | |
| The tag name is matched case-insensitively and has to be followed by white | |
| space or by the end of the tag, so bare tags such as <br> and <hr> are | |
| drained as well. --> | |
| <xsl:function name="scdh:drain" as="xs:string"> | |
|   <xsl:param name="soup" as="xs:string"/> | |
|   <!-- "<" is written as &lt; because a raw "<" is not allowed in an attribute value; | |
|        [^/>] as the last character before ">" is what spares self-closed tags --> | |
|   <xsl:sequence select="replace($soup, '&lt;(img|input|hr|br)(\s([^>]*[^/>])?)?>', '', 'i')"/> | |
| </xsl:function> | |
| <!-- scdh:html-body($url): reads the resource at $url as unparsed text and | |
| returns a string meant for parse-xml(): the DOCTYPE produced by the | |
| "entities" template, followed by a body element that wraps the drained | |
| content found between the opening body tag and the first closing body tag. | |
| $url - location of the HTML document, passed to unparsed-text(). | |
| Everything up to and including the ">" that ends the opening body tag is | |
| cut off, so the attributes of the original tag do not end up as character | |
| data inside the new body element. | |
| If the text contains no "<body" or no "</body>", the returned body | |
| element is empty. --> | |
| <xsl:function name="scdh:html-body" as="xs:string"> | |
|   <xsl:param name="url" as="xs:string"/> | |
|   <xsl:variable name="htmlentities"> | |
|     <xsl:call-template name="entities"/> | |
|   </xsl:variable> | |
|   <!-- text following "<body": the rest of the opening tag and everything after it --> | |
|   <xsl:variable name="fromBodyTag" select="substring-after(unparsed-text($url), '&lt;body')"/> | |
|   <!-- drop the rest of the opening tag, then keep what precedes the closing tag --> | |
|   <xsl:variable name="content" select="substring-before(substring-after($fromBodyTag, '>'), '&lt;/body>')"/> | |
|   <!-- &#xa; keeps the line feeds: a literal line break inside an attribute | |
|        value would be normalized to a space by the XML parser --> | |
|   <xsl:sequence select="concat($htmlentities, '&lt;body>&#xa;', scdh:drain($content), '&lt;/body>&#xa;')"/> | |
| </xsl:function> | |
| <!-- Initial template (invoked with -it:start): builds the body string for | |
| $infile and either parses it and applies templates to the resulting | |
| document or, when $parsed is false, writes the string out as it is. --> | |
| <xsl:template name="start"> | |
|   <xsl:variable name="markup" select="scdh:html-body($infile)"/> | |
|   <!-- the two guards are complementary, so exactly one of them fires --> | |
|   <xsl:if test="$parsed"> | |
|     <xsl:apply-templates select="parse-xml($markup)"/> | |
|   </xsl:if> | |
|   <xsl:if test="not($parsed)"> | |
|     <xsl:value-of select="$markup"/> | |
|   </xsl:if> | |
| </xsl:template> | |
| </xsl:stylesheet> |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| <?xml version="1.0" encoding="UTF-8"?> | |
| <!-- Mittleres Alter der Literaturnobelpreisträger*innen | |
| zum Zeitpunkt der Preisverleihung, vgl. | |
| https://twitter.com/patrick_sahle/status/1352169831131451394 | |
| USAGE: | |
| java -jar saxon.jar -xsl:mittelalter.xsl -it:mittel infile=https://de.wikipedia.org/wiki/Liste_der_Nobelpreistr%C3%A4ger_f%C3%BCr_Literatur | |
| --> | |
| <xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" | |
| xpath-default-namespace="ad.hoc" | |
| version="3.0"> | |
| <xsl:output method="text"/> | |
| <!-- Wiederverwendung vorliegenden Codes --> | |
| <xsl:import href="alternobel.xsl"/> | |
| <!-- Example of composing transformations: the result of the "start" | |
| template is captured in a variable and queried again. Writes the mean of | |
| all age values that contain a digit, followed by a line feed. --> | |
| <xsl:template name="mittel"> | |
|   <xsl:variable name="awards"> | |
|     <xsl:call-template name="start"/> | |
|   </xsl:variable> | |
|   <!-- the digit test leaves out age values such as NaN --> | |
|   <xsl:variable name="ages" select="$awards/descendant::age[matches(., '[0-9]+')]"/> | |
|   <xsl:value-of select="avg($ages)"/> | |
|   <xsl:text>&#xa;</xsl:text> | |
| </xsl:template> | |
| </xsl:stylesheet> |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| <?xml version="1.0" encoding="UTF-8"?> | |
| <!-- | |
| xmlout.xsl - copy of the html-body | |
| USAGE: | |
| java -jar saxon.jar -xsl:xmlout.xsl -it:start [parsed=(true|false)] infile=<URL> | |
| REQUIRED: | |
| saxon >= 9.6 | |
| --> | |
| <!-- Stylesheet that imports html2xml.xsl and copies every node for which no template rule matches. --> | |
| <xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" | |
| version="3.0"> | |
| <!-- serialize the copy as indented XML --> | |
| <xsl:output method="xml" indent="yes"/> | |
| <!-- shallow-copy: a node without a matching template rule is copied and its children are processed in turn --> | |
| <xsl:mode on-no-match="shallow-copy"/> | |
| <!-- provides the named template "start" used in the usage line above. | |
| NOTE(review): xsl:import follows other declarations here; XSLT 2.0 required imports to come first, so confirm that the processor accepts this order for version 3.0 --> | |
| <xsl:import href="html2xml.xsl"/> | |
| </xsl:stylesheet> | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment