Created
November 30, 2020 00:14
-
-
Save DavidJRobertson/7948a8d907ff27cf37c2a46e271d5192 to your computer and use it in GitHub Desktop.
Pubchem LCSS data mangling
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version="1.0" encoding="UTF-8"?> | |
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" | |
xmlns:xs="http://www.w3.org/2001/XMLSchema" exclude-result-prefixes="xs" version="2.0"> | |
<xsl:output method="xml" encoding="utf-8" indent="yes"/> | |
<!-- Entry point: everything produced for the input document is wrapped
     in a single Chemicals root element. -->
<xsl:template match="/">
  <Chemicals>
    <xsl:apply-templates select="node()"/>
  </Chemicals>
</xsl:template>
<!-- Rebuilds one Chemical:
       1. Properties are replaced by their de-duplicated form (see the
          dedup-properties named template).
       2. References is re-emitted containing only the entries whose @number
          is still cited by a refs attribute of a surviving property, ordered
          numerically by @number. If nothing is cited, References is omitted. -->
<xsl:template match="Chemical">
  <Chemical>
    <!-- Temporary tree holding the de-duplicated property elements. -->
    <xsl:variable name="deduped-props">
      <xsl:call-template name="dedup-properties">
        <xsl:with-param name="chemprops" select="Properties/*"/>
      </xsl:call-template>
    </xsl:variable>
    <Properties>
      <xsl:sequence select="$deduped-props"/>
    </Properties>
    <!-- Distinct reference numbers cited by the surviving properties.
         normalize-space() guards against blank tokens coming from empty or
         space-padded refs attributes. No sorting is needed here: the general
         comparison below is order-independent and the for-each sorts its own
         output, which also keeps this expression within XPath 2.0. -->
    <xsl:variable name="used-refs" as="xs:string*"
      select="distinct-values(tokenize(normalize-space(string-join($deduped-props/*/@refs, ' ')), ' '))"/>
    <xsl:if test="exists($used-refs)">
      <References>
        <xsl:for-each select="References/*[@number = $used-refs]">
          <xsl:sort select="@number" data-type="number"/>
          <xsl:copy-of select="."/>
        </xsl:for-each>
      </References>
    </xsl:if>
  </Chemical>
</xsl:template>
<!-- Collapses duplicate property elements.
     Param chemprops: the property elements to process.
     Elements are grouped first by element name, then by their
     whitespace-normalised text. One element is written per
     (name, text) pair; it carries the non-refs attributes of the first
     member of the group, a merged refs attribute built from every member
     (via the merged-refs template), and the normalised text as content. -->
<xsl:template name="dedup-properties">
  <xsl:param name="chemprops" required="yes"/>
  <xsl:for-each-group select="$chemprops" group-by="name()">
    <xsl:sort select="current-grouping-key()"/>
    <xsl:variable name="element-name" select="current-grouping-key()"/>
    <xsl:for-each-group select="current-group()" group-by="normalize-space(text())">
      <xsl:sort select="text()"/>
      <xsl:variable name="value" select="current-grouping-key()"/>
      <xsl:element name="{$element-name}">
        <!-- Attributes other than refs come from the first group member. -->
        <xsl:copy-of select="@* except @refs"/>
        <xsl:call-template name="merged-refs">
          <xsl:with-param name="allrefs" select="current-group()/@refs"/>
        </xsl:call-template>
        <xsl:value-of select="$value"/>
      </xsl:element>
    </xsl:for-each-group>
  </xsl:for-each-group>
</xsl:template>
<!-- Emits a single refs attribute that merges several refs attributes.
     Param allrefs: the refs attribute nodes (each a space-separated list of
     reference numbers) to merge.
     The result lists every distinct reference number once, in ascending
     numeric order; tokens that are not numbers sort first, ordered as
     strings. No attribute is written when there are no tokens at all.

     The previous version computed the sorted, de-duplicated list but then
     built the attribute from the raw token list, so duplicates and the
     original order leaked through. -->
<xsl:template name="merged-refs">
  <xsl:param name="allrefs" required="yes"/>
  <!-- normalize-space() prevents empty tokens when a refs attribute is empty
       or padded with spaces. -->
  <xsl:variable name="tokens" as="xs:string*"
    select="distinct-values(tokenize(normalize-space(string-join($allrefs, ' ')), ' '))"/>
  <xsl:variable name="ordered" as="xs:string*">
    <xsl:perform-sort select="$tokens">
      <!-- Numeric first so that 2 sorts before 10; the string key only
           breaks ties between non-numeric tokens (all NaN). -->
      <xsl:sort select="number(.)"/>
      <xsl:sort select="."/>
    </xsl:perform-sort>
  </xsl:variable>
  <xsl:if test="exists($ordered)">
    <xsl:attribute name="refs" select="string-join($ordered, ' ')"/>
  </xsl:if>
</xsl:template>
</xsl:stylesheet> |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version="1.0" encoding="UTF-8"?> | |
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" | |
xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:pv="http://pubchem.ncbi.nlm.nih.gov/pug_view" | |
exclude-result-prefixes="xs pv" version="2.0" input-type-annotations="strip" | |
xpath-default-namespace="http://pubchem.ncbi.nlm.nih.gov/pug_view"> | |
<!-- Stylesheet parameters (all boolean switches):
       includeGhsHazardStatements        - emit GHSHazardStatement elements
       includeGhsPrecautionaryStatements - emit GHSPrecautionaryStatement elements
       includeGhsDescriptions            - add a description attribute to GHS
                                           pictograms and hazard statements
       includeReferences                 - emit refs attributes and the
                                           References element -->
<xsl:param name="includeGhsHazardStatements" select="true()" as="xs:boolean"/>
<xsl:param name="includeGhsPrecautionaryStatements" select="true()" as="xs:boolean"/>
<xsl:param name="includeGhsDescriptions" select="false()" as="xs:boolean"/>
<xsl:param name="includeReferences" select="true()" as="xs:boolean"/>
<xsl:output indent="yes" encoding="utf-8" method="xml"/>
<!-- Fallback rules for every mode: text is dropped, and elements without a
     more specific rule are transparent (only their children are processed,
     staying in the current mode). -->
<xsl:template match="text()" mode="#all"/>
<xsl:template match="*" mode="#all">
  <xsl:apply-templates select="node()" mode="#current"/>
</xsl:template>
<!-- Entry point: all output is wrapped in a single Chemicals root element. -->
<xsl:template match="/">
  <Chemicals>
    <xsl:apply-templates select="node()"/>
  </Chemicals>
</xsl:template>
<!-- Turns one Record into a Chemical.
     Properties starts with Name (from @title) and PubchemCID ("CID-" followed
     by @number), then whatever the other templates produce for every child
     except References. References is processed separately, after Properties. -->
<xsl:template match="Record">
  <Chemical>
    <Properties>
      <Name>
        <xsl:value-of select="@title"/>
      </Name>
      <PubchemCID>
        <xsl:value-of select="concat('CID-', @number)"/>
      </PubchemCID>
      <xsl:apply-templates select="*[name() != 'References']"/>
    </Properties>
    <xsl:apply-templates select="References"/>
  </Chemical>
</xsl:template>
<!-- CHEMICAL PROPERTIES --> | |
<!-- Molecular formula: the String child of the InfoVal, with its refs. -->
<xsl:template match="Section[@heading = 'Molecular Formula']/InfoVal">
  <Formula>
    <xsl:call-template name="ref"/>
    <xsl:value-of select="String"/>
  </Formula>
</xsl:template>
<!-- Molecular weight: the Number child, followed by a space and the Unit
     child when a Unit is present. -->
<xsl:template match="Section[@heading = 'Molecular Weight']/InfoVal">
  <MolarMass>
    <xsl:call-template name="ref"/>
    <xsl:value-of select="Number"/>
    <xsl:if test="exists(Unit)">
      <xsl:text> </xsl:text>
      <xsl:value-of select="Unit"/>
    </xsl:if>
  </MolarMass>
</xsl:template>
<!-- GHS DATA --> | |
<!-- The GHS Classification section switches its content to the "ghs" mode. -->
<xsl:template match="Section[@heading = 'GHS Classification']">
  <xsl:apply-templates select="node()" mode="ghs"/>
</xsl:template>
<!-- One GHSPictogram per pictogram entry. The content is the @code of the
     entry; references are taken from the enclosing InfoVal. The @type is
     copied into a description attribute only when descriptions are enabled. -->
<xsl:template match="InfoVal[@name = 'Pictogram(s)']/GHSPictogram" mode="ghs">
  <GHSPictogram>
    <xsl:call-template name="ref">
      <xsl:with-param name="subject" select="parent::*"/>
    </xsl:call-template>
    <xsl:if test="$includeGhsDescriptions">
      <xsl:attribute name="description" select="@type"/>
    </xsl:if>
    <xsl:value-of select="@code"/>
  </GHSPictogram>
</xsl:template>
<!-- Signal word: the String child of the InfoVal named "Signal". -->
<xsl:template match="InfoVal[@name = 'Signal']" mode="ghs">
  <GHSSignalWord>
    <xsl:call-template name="ref"/>
    <xsl:value-of select="String"/>
  </GHSSignalWord>
</xsl:template>
<!-- Hazard statements (only when includeGhsHazardStatements is true).
     The whitespace-normalised String is scanned with a regex whose groups are
       1: the H code (H followed by digits)
       2: an optional parenthesised percentage
       3: the text after ": "
     Each match becomes a GHSHazardStatement holding the H code; group 3 is
     added as a description attribute when descriptions are enabled.
     Text that does not match produces nothing. -->
<xsl:template match="InfoVal[@name = 'GHS Hazard Statements']" mode="ghs">
  <xsl:if test="$includeGhsHazardStatements">
    <!-- Inside analyze-string the context item is a string, so the InfoVal
         is kept in a variable for the ref call. -->
    <xsl:variable name="statement-node" select="."/>
    <xsl:analyze-string select="normalize-space(String)"
                        regex="(H\d+)( \([\d\.]+%\))?: (.*)">
      <xsl:matching-substring>
        <GHSHazardStatement>
          <xsl:call-template name="ref">
            <xsl:with-param name="subject" select="$statement-node"/>
          </xsl:call-template>
          <xsl:if test="$includeGhsDescriptions">
            <xsl:attribute name="description" select="regex-group(3)"/>
          </xsl:if>
          <xsl:value-of select="regex-group(1)"/>
        </GHSHazardStatement>
      </xsl:matching-substring>
    </xsl:analyze-string>
  </xsl:if>
</xsl:template>
<!-- Precautionary statements (only when includeGhsPrecautionaryStatements is
     true). Every P code in the String, including "+"-joined combinations
     such as P301+P310, becomes one GHSPrecautionaryStatement whose content
     is the matched text. Anything between matches is discarded. -->
<xsl:template match="InfoVal[@name = 'Precautionary Statement Codes']" mode="ghs">
  <xsl:if test="$includeGhsPrecautionaryStatements">
    <!-- Keep the InfoVal reachable: the context item inside analyze-string
         is the matched string. -->
    <xsl:variable name="codes-node" select="."/>
    <xsl:analyze-string select="String" regex="(P\d+)(\+P\d+)*">
      <xsl:matching-substring>
        <GHSPrecautionaryStatement>
          <xsl:call-template name="ref">
            <xsl:with-param name="subject" select="$codes-node"/>
          </xsl:call-template>
          <xsl:value-of select="."/>
        </GHSPrecautionaryStatement>
      </xsl:matching-substring>
    </xsl:analyze-string>
  </xsl:if>
</xsl:template>
<!-- IDENTIFIERS --> | |
<!-- The Identifiers section switches its content to the "ident" mode. -->
<xsl:template match="Section[@heading = 'Identifiers']">
  <xsl:apply-templates select="node()" mode="ident"/>
</xsl:template>
<!-- Each identifier below is the String child of the InfoVal plus its refs. -->
<!-- CAS registry number. -->
<xsl:template match="Section[@heading = 'CAS']/InfoVal" mode="ident">
  <CASNumber>
    <xsl:call-template name="ref"/>
    <xsl:value-of select="String"/>
  </CASNumber>
</xsl:template>
<!-- InChI string. -->
<xsl:template match="Section[@heading = 'InChI']/InfoVal" mode="ident">
  <InChI>
    <xsl:call-template name="ref"/>
    <xsl:value-of select="String"/>
  </InChI>
</xsl:template>
<!-- InChI key. -->
<xsl:template match="Section[@heading = 'InChI Key']/InfoVal" mode="ident">
  <InChIKey>
    <xsl:call-template name="ref"/>
    <xsl:value-of select="String"/>
  </InChIKey>
</xsl:template>
<!-- PHYSICAL PROPERTIES --> | |
<!-- The Physical Properties section switches its content to the "phys" mode. -->
<xsl:template match="Section[@heading = 'Physical Properties']">
  <xsl:apply-templates select="node()" mode="phys"/>
</xsl:template>
<!-- Each property below is the String child of the InfoVal, passed through
     pv:fixDegrees, plus its refs. -->
<!-- Boiling point. -->
<xsl:template match="Section[@heading = 'Boiling Point']/InfoVal" mode="phys">
  <BoilingPoint>
    <xsl:call-template name="ref"/>
    <xsl:value-of select="pv:fixDegrees(String)"/>
  </BoilingPoint>
</xsl:template>
<!-- Melting point. -->
<xsl:template match="Section[@heading = 'Melting Point']/InfoVal" mode="phys">
  <MeltingPoint>
    <xsl:call-template name="ref"/>
    <xsl:value-of select="pv:fixDegrees(String)"/>
  </MeltingPoint>
</xsl:template>
<!-- Flash point. -->
<xsl:template match="Section[@heading = 'Flash Point']/InfoVal" mode="phys">
  <FlashPoint>
    <xsl:call-template name="ref"/>
    <xsl:value-of select="pv:fixDegrees(String)"/>
  </FlashPoint>
</xsl:template>
<!-- Autoignition temperature. -->
<xsl:template match="Section[@heading = 'Autoignition Temperature']/InfoVal" mode="phys">
  <AutoignitionTemperature>
    <xsl:call-template name="ref"/>
    <xsl:value-of select="pv:fixDegrees(String)"/>
  </AutoignitionTemperature>
</xsl:template>
<!-- Density. -->
<xsl:template match="Section[@heading = 'Density']/InfoVal" mode="phys">
  <Density>
    <xsl:call-template name="ref"/>
    <xsl:value-of select="pv:fixDegrees(String)"/>
  </Density>
</xsl:template>
<!-- Normalises the spacing around the degree sign in a measurement string.
     Param input: the raw text; may be the empty sequence.
     Returns the text with a space inserted before every degree sign and all
     whitespace runs collapsed (for example "100°C" becomes "100 °C").
     An empty-sequence input yields the empty string.

     The parameter is declared optional because every caller passes the
     String child of an InfoVal, and an InfoVal without a String child would
     otherwise abort the transformation with a type error.

     NOTE(review): as rendered here the inner replace() maps the degree sign
     to itself. Presumably the first literal was meant to be a look-alike
     character (for example U+00BA); confirm against the original file. -->
<xsl:function name="pv:fixDegrees" as="xs:string">
  <xsl:param name="input" as="xs:string?"/>
  <xsl:sequence select="normalize-space(replace(replace($input, '°', '°'), '°', ' °'))"/>
</xsl:function>
<!-- REFERENCES --> | |
<!-- Adds a refs attribute to the element currently being built
     (only when includeReferences is true).
     Param subject: node(s) to read reference numbers from; defaults to the
     context node.
     The value is the space-separated list of the refno attributes of the
     subject's Ref children followed by the subject's own refno attribute.
     The attribute is written even when that list is empty. -->
<xsl:template name="ref">
  <xsl:param name="subject" select="."/>
  <xsl:if test="$includeReferences">
    <xsl:variable name="numbers" select="($subject/Ref/@refno, $subject/@refno)"/>
    <xsl:attribute name="refs" select="normalize-space(string-join($numbers, ' '))"/>
  </xsl:if>
</xsl:template>
<!-- Emits the References wrapper and processes its content in the "refs"
     mode. Produces nothing when includeReferences is false. -->
<xsl:template match="References">
  <xsl:if test="$includeReferences">
    <References>
      <xsl:apply-templates select="node()" mode="refs"/>
    </References>
  </xsl:if>
</xsl:template>
<!-- Flattens one Reference into a compact element.
     ReferenceNumber always becomes the number attribute. ANID, SourceID, URL,
     IsToxnet and Name become attributes (anid, sourceid, url, istoxnet, name)
     only when the corresponding child exists. SourceName is the text content. -->
<xsl:template match="Reference" mode="refs">
  <Reference number="{ReferenceNumber}">
    <xsl:if test="exists(ANID)">
      <xsl:attribute name="anid" select="ANID"/>
    </xsl:if>
    <xsl:if test="exists(SourceID)">
      <xsl:attribute name="sourceid" select="SourceID"/>
    </xsl:if>
    <xsl:if test="exists(URL)">
      <xsl:attribute name="url" select="URL"/>
    </xsl:if>
    <xsl:if test="exists(IsToxnet)">
      <xsl:attribute name="istoxnet" select="IsToxnet"/>
    </xsl:if>
    <xsl:if test="exists(Name)">
      <xsl:attribute name="name" select="Name"/>
    </xsl:if>
    <xsl:value-of select="SourceName"/>
  </Reference>
</xsl:template>
</xsl:stylesheet> |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version="1.0" encoding="UTF-8"?> | |
<!-- Three-stage pipeline: the document on the source port is run through
     lcss-simplify.xsl, then lcss-chemicalize.xsl, then
     chemical-property-dedup.xsl; the last result goes to the result port.
     No stylesheet parameters are passed to any stage. -->
<p:declare-step xmlns:p="http://www.w3.org/ns/xproc"
                xmlns:c="http://www.w3.org/ns/xproc-step"
                version="1.0">
  <p:input port="source"/>
  <p:output port="result"/>

  <!-- Stage 1 -->
  <p:xslt name="simplify">
    <p:input port="stylesheet">
      <p:document href="lcss-simplify.xsl"/>
    </p:input>
    <p:input port="parameters">
      <p:empty/>
    </p:input>
  </p:xslt>

  <!-- Stage 2 -->
  <p:xslt name="chemicalize">
    <p:input port="stylesheet">
      <p:document href="lcss-chemicalize.xsl"/>
    </p:input>
    <p:input port="parameters">
      <p:empty/>
    </p:input>
  </p:xslt>

  <!-- Stage 3 -->
  <p:xslt name="chemical-dedup">
    <p:input port="stylesheet">
      <p:document href="chemical-property-dedup.xsl"/>
    </p:input>
    <p:input port="parameters">
      <p:empty/>
    </p:input>
  </p:xslt>
</p:declare-step>
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version="1.0" encoding="UTF-8"?> | |
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" | |
xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns="http://pubchem.ncbi.nlm.nih.gov/pug_view" | |
xmlns:pv="http://pubchem.ncbi.nlm.nih.gov/pug_view" exclude-result-prefixes="xs pv" | |
version="2.0" input-type-annotations="strip" | |
xpath-default-namespace="http://pubchem.ncbi.nlm.nih.gov/pug_view"> | |
<xsl:output method="xml" encoding="utf-8" indent="yes"/> | |
<!-- Any element without a more specific rule is dropped, in every mode. -->
<xsl:template match="*" mode="#all"/>
<!-- Entry point: process the element children of the document node. -->
<xsl:template match="/">
  <xsl:apply-templates select="child::*"/>
</xsl:template>
<!-- Rewrites a Record: RecordType, RecordNumber and RecordTitle become the
     type, number and title attributes; Section children are processed; the
     Reference children are copied unchanged into a References wrapper. -->
<xsl:template match="Record">
  <Record type="{RecordType}" number="{RecordNumber}" title="{RecordTitle}">
    <xsl:apply-templates select="Section"/>
    <References>
      <xsl:copy-of select="Reference"/>
    </References>
  </Record>
</xsl:template>
<!-- Rewrites a Section: TOCHeading becomes the heading attribute, URL (when
     present) becomes the url attribute, and only nested Section and
     Information children are processed further. -->
<xsl:template match="Section">
  <Section heading="{TOCHeading}">
    <xsl:if test="exists(URL)">
      <xsl:attribute name="url" select="URL"/>
    </xsl:if>
    <xsl:apply-templates select="Section | Information"/>
  </Section>
</xsl:template>
<!-- Information and Value produce no element of their own; only their
     element children are processed. -->
<xsl:template match="Information | Value">
  <xsl:apply-templates select="child::*"/>
</xsl:template>
<!-- Each StringWithMarkup becomes one InfoVal.
     Reference metadata comes from the enclosing Information element (two
     levels up). The String child is kept, whitespace-normalised, only when
     it is non-empty; Markup children are handled by the Markup templates. -->
<xsl:template match="Information/Value/StringWithMarkup">
  <InfoVal>
    <xsl:call-template name="referenced">
      <xsl:with-param name="subject" select="../.."/>
    </xsl:call-template>
    <xsl:variable name="text-value" select="normalize-space(String)"/>
    <xsl:if test="$text-value != ''">
      <String>
        <xsl:value-of select="$text-value"/>
      </String>
    </xsl:if>
    <xsl:apply-templates select="Markup"/>
  </InfoVal>
</xsl:template>
<!-- Markup of type "PubChem Internal Link" or "Color" is discarded. -->
<xsl:template match="Markup[Type[text() = 'PubChem Internal Link']]"/>
<xsl:template match="Markup[Type[text() = 'Color']]"/>
<!-- Any other Markup is copied through unchanged, unless a more specific
     Markup rule applies. -->
<xsl:template match="Markup">
  <xsl:copy-of select="."/>
</xsl:template>
<!-- An Icon markup that has an Extra child and whose URL points into the
     PubChem GHS image directory becomes a GHSPictogram:
       type = the Extra text
       code = the part of the URL between the directory prefix and ".svg" -->
<xsl:template
    match="Markup[Type[text() = 'Icon'] and Extra and URL[starts-with(text(), 'https://pubchem.ncbi.nlm.nih.gov/images/ghs/GHS')]]">
  <GHSPictogram type="{Extra}"
                code="{substring-before(substring-after(URL, 'https://pubchem.ncbi.nlm.nih.gov/images/ghs/'), '.svg')}"/>
</xsl:template>
<!-- A Value with no StringWithMarkup child becomes a single InfoVal.
     Reference metadata comes from the parent Information element, and the
     Value's element children are copied through unchanged. -->
<xsl:template match="Information/Value[not(StringWithMarkup)]">
  <InfoVal>
    <xsl:call-template name="referenced">
      <xsl:with-param name="subject" select="parent::*"/>
    </xsl:call-template>
    <xsl:copy-of select="child::*"/>
  </InfoVal>
</xsl:template>
<!-- Copies reference metadata from a node onto the element being built.
     Param subject: the node whose children are read.
     Attributes written, each only when the corresponding child exists:
       refno - from ReferenceNumber
       ref   - from Reference, whitespace-normalised
       name  - from Name

     Several Reference children are joined with single spaces before
     normalising. Passing more than one node straight to normalize-space()
     is a type error that would abort the whole transformation. -->
<xsl:template name="referenced">
  <xsl:param name="subject" required="yes"/>
  <xsl:if test="$subject/ReferenceNumber">
    <xsl:attribute name="refno" select="$subject/ReferenceNumber"/>
  </xsl:if>
  <xsl:if test="$subject/Reference">
    <xsl:attribute name="ref"
      select="normalize-space(string-join($subject/Reference, ' '))"/>
  </xsl:if>
  <xsl:if test="$subject/Name">
    <xsl:attribute name="name" select="$subject/Name"/>
  </xsl:if>
</xsl:template>
</xsl:stylesheet> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment