kschlottmann · October 7, 2021 15:04
diff --git a/readme.txt b/readme.txt
 These instructions allow one to begin with the Columbia Word template for container lists, and extract the pipe-delimited data for use in the Yale Excel-to-dsc or Harvard-Excel-AS plugin for creating hierarchical description. Generated May 2019 by Kevin Schlottmann. Added level/style version, October 2021.

 1) Save the container list in the Columbia Word template format.

 2) Save that file as a Word XML 2003 format. 

 3) Open the XML Word file in oXygen.

 4) Download and open in the oXygen the XSLT Word-to-tsv.xsl from gist 
 https://gist.github.com/kschlottmann/6eaa8a027d1fc889fe17490c3105ef31

 4a) Depending on whether the Word template uses tables or levels/styles to format data, choose the correct XSLT here  

 5) In the XSLT debugger view, run the transformation.

 6) IMPORTANT -- Inspect the output to ensure that no data is missing, and that data is lined up. Note in particular that the title/date split is based on spltting via a comma; this is not always going to work. Possible future improvement: add template to split on last comma. 

 7) Adjust XSLT if needed.

 8) Run transformation, copy output, and paste into Excel.

 9) Split text into columns, splitting on the pipe "|" character

 10) Handy tip - to grab information in parenthesis a copy to new column, use this Excel formula:
 =MID(C16,SEARCH("(",C16)+1,SEARCH(")",C16)-SEARCH("(",C16)-1)

diff --git a/Word-to-tsv_styles.xsl b/Word-to-tsv_styles.xsl
 <?xml version="1.0" encoding="UTF-8"?>
 <xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:aml="http://schemas.microsoft.com/aml/2001/core" xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:dt="uuid:C2F41010-65B3-11d1-A29F-00AA00C14882" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006"
    xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.microsoft.com/office/word/2003/wordml" xmlns:wx="http://schemas.microsoft.com/office/word/2003/auxHint" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml"
    xmlns:wsp="http://schemas.microsoft.com/office/word/2003/wordml/sp2" xmlns:sl="http://schemas.microsoft.com/schemaLibrary/2003/core" exclude-result-prefixes="#all" version="2.0">
    <xsl:output omit-xml-declaration="yes"/>
    <xsl:template match="/">
        
        <!--NB: review the Word document in Word and in XML manually, to check for any discrepancies or missed data -->
 <!--  NB: Written for the Word template that uses levels and styles instead of table rows -->
        
        <!--      headers; adjust if needed. Additional blank fields could be added to allow for easier copy and paste into the Excel template  -->
        
        <xsl:text>LEVEL|ITAL?|TITLE and DATE</xsl:text>
        <xsl:text>&#xa;</xsl:text>
        
        <!--        loop to grab values from each paragraph tag -->
        <xsl:for-each select="//w:p">
            <xsl:value-of select="w:pPr/w:pStyle/@w:val"/>
            <xsl:text>|</xsl:text>
            <xsl:value-of select="w:r/w:rPr/w:rStyle/@w:val"/>
            <xsl:text>|</xsl:text>
            <xsl:value-of select="w:r/w:t"/>
            <xsl:text>&#xa;</xsl:text>
        </xsl:for-each>
    </xsl:template>
    
    
    
    
 </xsl:stylesheet>
diff --git a/Word-to-tsv_tables.xsl b/Word-to-tsv_tables.xsl
 <?xml version="1.0" encoding="UTF-8"?>
 <xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:aml="http://schemas.microsoft.com/aml/2001/core" xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:dt="uuid:C2F41010-65B3-11d1-A29F-00AA00C14882" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006"
    xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.microsoft.com/office/word/2003/wordml" xmlns:wx="http://schemas.microsoft.com/office/word/2003/auxHint" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml"
    xmlns:wsp="http://schemas.microsoft.com/office/word/2003/wordml/sp2" xmlns:sl="http://schemas.microsoft.com/schemaLibrary/2003/core" exclude-result-prefixes="#all" version="2.0">
    <xsl:output omit-xml-declaration="yes"/>
    <xsl:template match="/">

        <!--NB: review the Word document in Word and in XML manually, to check for any discrepancies or missed data -->
        <!--NB: only series and file are provided as levels. if subseries or other intermediate levels are needed, the first choose statement would have to be modified to test for those strings in the val attribute  -->

        <!--      headers; adjust if needed. Additional blank fields could be added to allow for easier copy and paste into the Excel template  -->

        <xsl:text>LEVEL|TYPE|TITLE|DATE|BOX|FOLDER</xsl:text>
        <xsl:text>&#xa;</xsl:text>

        <!--        loop to grab values from each table row -->
        <xsl:for-each select="//w:tr">
            <!--         LEVEL   -->
            <!--           test the val attribute to see if contains the string series; if it does, strip out the letters and grab the number for the level -->
            <xsl:choose>
                <xsl:when test="contains(w:tc[2]/w:p/w:pPr/w:pStyle/@w:val, 'Series')">
                    <xsl:analyze-string select="w:tc[2]/w:p/w:pPr/w:pStyle/@w:val" regex="\d+">
                        <xsl:matching-substring>
                            <xsl:value-of select="."/>
                        </xsl:matching-substring>
                    </xsl:analyze-string>
                    <xsl:text>|</xsl:text>
                    <!--                    insert "series" as a TYPE -->
                    <xsl:text>series</xsl:text>
                    <xsl:text>|</xsl:text>
                </xsl:when>
                <!-- otherwise just grab the level number -->
                <xsl:otherwise>
                    <xsl:value-of select="substring-after(w:tc[2]/w:p/w:pPr/w:pStyle/@w:val, 'Level')"/>
                    <xsl:text>|</xsl:text>

                    <!--                    insert "file" as a TYPE -->
                    <xsl:text>file</xsl:text>
                    <xsl:text>|</xsl:text>
                </xsl:otherwise>
            </xsl:choose>

            <!--          TITLE  and DATE -->
            <xsl:choose>
                <!--               test to choose between where there is a comma (split for date) and when there isn't -->
                <xsl:when test="contains(w:tc[2], ',')">
                    <xsl:value-of select="substring-before(normalize-space(w:tc[2]), ',')"/>
                    <xsl:text>|</xsl:text>
                    <!--            date (careful!) -->
                    <xsl:value-of select="substring-after(normalize-space(w:tc[2]), ',')"/>
                    <xsl:text>|</xsl:text>
                </xsl:when>
                <xsl:otherwise>
                    <xsl:value-of select="w:tc[2]//w:t"/>
                    <xsl:text>|</xsl:text>
                    <!--            empty date column place holder for when there is no comma -->
                    <xsl:text>|</xsl:text>

                </xsl:otherwise>
            </xsl:choose>

            <xsl:choose>
                <!--               test to choose between where there is a starr (split for box folder ) and when there isn't -->
                <xsl:when test="contains(w:tc[1], '*')">
                    <!--          BOX  -->
                    <xsl:value-of select="substring-before(normalize-space(w:tc[1]), '*')"/>
                    <xsl:text>|</xsl:text>

                    <!--            FOLDER  -->
                    <xsl:value-of select="substring-after(normalize-space(w:tc[1]), '*')"/>
                    <xsl:text>|</xsl:text>
                    <xsl:text>&#xa;</xsl:text>
                </xsl:when>
                <xsl:otherwise>
                    <xsl:value-of select="w:tc[1]//w:t"/>
                    <xsl:text>|</xsl:text>
                    <!--            empty folder  column place holder for when there is no folder -->
                    <xsl:text>|</xsl:text>
                    <xsl:text>&#xa;</xsl:text>
                </xsl:otherwise>
            </xsl:choose>

        </xsl:for-each>
    </xsl:template>




 </xsl:stylesheet>
	These instructions allow one to begin with the Columbia Word template for container lists, and extract the pipe-delimited data for use in the Yale Excel-to-dsc or Harvard-Excel-AS plugin for creating hierarchical description. Generated May 2019 by Kevin Schlottmann. Added level/style version, October 2021.

	1) Save the container list in the Columbia Word template format.

	2) Save that file as a Word XML 2003 format.

	3) Open the XML Word file in oXygen.

	4) Download and open in the oXygen the XSLT Word-to-tsv.xsl from gist
	https://gist.github.com/kschlottmann/6eaa8a027d1fc889fe17490c3105ef31

	4a) Depending on whether the Word template uses tables or levels/styles to format data, choose the correct XSLT here

	5) In the XSLT debugger view, run the transformation.

	6) IMPORTANT -- Inspect the output to ensure that no data is missing, and that data is lined up. Note in particular that the title/date split is based on spltting via a comma; this is not always going to work. Possible future improvement: add template to split on last comma.

	7) Adjust XSLT if needed.

	8) Run transformation, copy output, and paste into Excel.

	9) Split text into columns, splitting on the pipe "\|" character

	10) Handy tip - to grab information in parenthesis a copy to new column, use this Excel formula:
	=MID(C16,SEARCH("(",C16)+1,SEARCH(")",C16)-SEARCH("(",C16)-1)
	<?xml version="1.0" encoding="UTF-8"?>
	<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:aml="http://schemas.microsoft.com/aml/2001/core" xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:dt="uuid:C2F41010-65B3-11d1-A29F-00AA00C14882" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006"
	xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.microsoft.com/office/word/2003/wordml" xmlns:wx="http://schemas.microsoft.com/office/word/2003/auxHint" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml"
	xmlns:wsp="http://schemas.microsoft.com/office/word/2003/wordml/sp2" xmlns:sl="http://schemas.microsoft.com/schemaLibrary/2003/core" exclude-result-prefixes="#all" version="2.0">
	<xsl:output omit-xml-declaration="yes"/>
	<xsl:template match="/">

	<!--NB: review the Word document in Word and in XML manually, to check for any discrepancies or missed data -->
	<!-- NB: Written for the Word template that uses levels and styles instead of table rows -->

	<!-- headers; adjust if needed. Additional blank fields could be added to allow for easier copy and paste into the Excel template -->

	<xsl:text>LEVEL\|ITAL?\|TITLE and DATE</xsl:text>
	<xsl:text> </xsl:text>

	<!-- loop to grab values from each paragraph tag -->
	<xsl:for-each select="//w:p">
	<xsl:value-of select="w:pPr/w:pStyle/@w:val"/>
	<xsl:text>\|</xsl:text>
	<xsl:value-of select="w:r/w:rPr/w:rStyle/@w:val"/>
	<xsl:text>\|</xsl:text>
	<xsl:value-of select="w:r/w:t"/>
	<xsl:text> </xsl:text>
	</xsl:for-each>
	</xsl:template>




	</xsl:stylesheet>