abits · April 15, 2014 21:48
diff --git a/README.txt b/README.txt
 This gist contains two files for simple indexing of PDF files.

 == requirements ==
 First you need to install Solr (which requires a Java JDK): download a tar or zip file from http://www.apache.org/dyn/closer.cgi/lucene/solr/ and unpack it to a directory of your choice. Go into this directory and start Solr, running in Jetty, with:

 $ cd example
 $ java -jar start.jar

 Then point your browser to http://localhost:8983/solr/

 == data extraction ==
 Metadata and text are extracted from PDF files with pdftotext from 'xpdf'.
 An alternative to xpdf is PDFbox: http://pdfbox.apache.org/.
 You should also have a look at http://aperture.sourceforge.net/ 
 to extract (meta)data from PDF and other files but aperture seems to depend
 on half the internet (or I just don't like overblown Java frameworks).

 == usage ==
 Put pdf2solr.sh and html2solr.xsl in a directory of your choice and make
 pdf2solr.sh executable. Then call pdf2solr.sh from that directory (it looks up
 html2solr.xsl in the current directory) with one or more PDF files as arguments.

 == limitations ==
 The current version only indexes text, author and title fields.
diff --git a/html2solr.xsl b/html2solr.xsl
 <?xml version="1.0" encoding="UTF-8"?>
 <!--
   Converts an HTML document (html/head/meta + html/body) into a Solr
   <add><doc> update message.

   Parameter:
     ID   required and non-empty; becomes the value of the "id" field.
          The transformation stops with a message when it is empty.

   Output fields:
     id          value of $ID
     (mapped)    one field per head/meta element whose @name is listed in the
                 m:mapping table below; when $keepunknownfields is true,
                 unlisted meta elements are kept under their own @name
     $textfield  string value of the body element
 -->
 <xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
      xmlns:m="http://example.com" exclude-result-prefixes="xsl m">

 <xsl:output method="xml" indent="yes" encoding="UTF-8"/>

 <!-- Lookup table: HTML meta name to Solr field name. -->
 <mapping xmlns="http://example.com">
  <map meta="Author" field="author"/>
  <map meta="Title" field="title"/>
 </mapping>

 <xsl:param name="ID"/>
 <xsl:variable name="keepunknownfields" select="false()"/>
 <xsl:variable name="textfield" select="'text'"/>
 <!-- document('') is this stylesheet itself, so the table above is read from here. -->
 <xsl:variable name="mapping" select="document('')//m:map"/>

 <!--
   Elements are matched by local name so that the input is accepted both
   without a namespace and with the XHTML namespace declared on the root.
 -->
 <xsl:template match="/*[local-name()='html']">
  <!-- An empty id would produce an unusable document, so fail early. -->
  <xsl:if test="string($ID) = ''">
    <xsl:message terminate="yes">html2solr.xsl: parameter ID must not be empty</xsl:message>
  </xsl:if>
  <add>
    <doc>
      <field name="id"><xsl:value-of select="$ID"/></field>
      <xsl:for-each select="*[local-name()='head']/*[local-name()='meta']">
        <xsl:variable name="meta" select="@name"/>
        <xsl:variable name="map2" select="$mapping[@meta=$meta]/@field"/>
        <xsl:if test="$map2 or $keepunknownfields">
          <!-- Mapped name when there is one, otherwise the original meta name. -->
          <xsl:variable name="fieldname">
            <xsl:choose>
              <xsl:when test="$map2"><xsl:value-of select="$map2"/></xsl:when>
              <xsl:otherwise><xsl:value-of select="$meta"/></xsl:otherwise>
            </xsl:choose>
          </xsl:variable>
          <field name="{$fieldname}">
            <xsl:value-of select="@content"/>
          </field>
        </xsl:if>
      </xsl:for-each>
      <field name="{$textfield}">
        <xsl:value-of select="*[local-name()='body']"/>
      </field>
    </doc>
  </add>
 </xsl:template>

 </xsl:stylesheet>
diff --git a/pdf2solr.sh b/pdf2solr.sh
 #!/bin/bash

 # Simple PDF indexer for Solr.
 #
 # Usage: pdf2solr.sh FILE.pdf [FILE.pdf ...]
 #
 # Each readable PDF is converted to HTML with pdftotext, rewritten as XML
 # with xmllint, turned into a Solr <add> document by html2solr.xsl (looked
 # up relative to the current directory) and POSTed to $URL. The checksum
 # printed by shasum is used as the document id. A <commit/> is sent once
 # after all files. Files that are unreadable or fail to convert are skipped.

 HTML2SOLR=html2solr.xsl
 URL=http://localhost:8983/solr/update

 # Work in a private scratch directory (removed on exit) instead of leaving
 # fixed-name intermediate files behind in the working directory.
 WORKDIR=$(mktemp -d "${TMPDIR:-/tmp}/pdf2solr.XXXXXX") || {
   echo "pdf2solr: cannot create temporary directory" >&2
   exit 1
 }
 trap 'rm -rf -- "$WORKDIR"' EXIT
 TMPFILE=$WORKDIR/tmp

 # Iterate over "$@" so that file names containing whitespace or glob
 # characters reach the tools unchanged.
 for PDF in "$@"; do
   [ -r "$PDF" ] || continue

   # Hash via stdin so the output does not depend on the file name.
   SHA=$(shasum < "$PDF" | awk '{print $1}')
   if [ -z "$SHA" ]; then
     echo "pdf2solr: cannot compute checksum of $PDF, skipping" >&2
     continue
   fi

   if ! pdftotext -htmlmeta "$PDF" "$TMPFILE.htm"; then
     echo "pdf2solr: pdftotext failed for $PDF, skipping" >&2
     continue
   fi

   # Parser diagnostics are discarded on purpose; an unusable result is
   # caught by the xsltproc check below.
   xmllint --xmlout --dropdtd --html "$TMPFILE.htm" 2> /dev/null > "$TMPFILE.xhtml"

   # --stringparam passes the id as a plain string, so no XPath quoting is
   # needed. Do not post anything when the transformation fails.
   if ! xsltproc --stringparam ID "$SHA" "$HTML2SOLR" "$TMPFILE.xhtml" > "$TMPFILE.xml"; then
     echo "pdf2solr: xsltproc failed for $PDF, skipping" >&2
     continue
   fi

   f="$TMPFILE.xml"
   echo "Posting file $f to $URL as $SHA"
   curl "$URL" --data-binary "@$f" -H 'Content-type:text/xml; charset=utf-8'
   echo
 done

 # Send the commit command to make sure all the changes are flushed and visible.
 curl "$URL" --data-binary '<commit/>' -H 'Content-type:text/xml; charset=utf-8'
 echo
	This gist contains two files for simple indexing of PDF files.

	== requirements ==
	First you need to install Solr (which requires a Java JDK): download a tar or zip file from http://www.apache.org/dyn/closer.cgi/lucene/solr/ and unpack it to a directory of your choice. Go into this directory and start Solr, running in Jetty, with:

	$ cd example
	$ java -jar start.jar

	Then point your browser to http://localhost:8983/solr/

	== data extraction ==
	Metadata and text are extracted from PDF files with pdftotext from 'xpdf'.
	An alternative to xpdf is PDFbox: http://pdfbox.apache.org/.
	You should also have a look at http://aperture.sourceforge.net/
	to extract (meta)data from PDF and other files but aperture seems to depend
	on half the internet (or I just don't like overblown Java frameworks).

	== usage ==
	Put pdf2solr.sh and html2solr.xsl in a directory of your choice and make
	pdf2solr.sh executable. Then call pdf2solr.sh from that directory (it looks up
	html2solr.xsl in the current directory) with one or more PDF files as arguments.

	== limitations ==
	The current version only indexes text, author and title fields.
	<?xml version="1.0" encoding="UTF-8"?>
	<!--
	  Converts an HTML document (html/head/meta + html/body) into a Solr
	  <add><doc> update message.

	  Parameter:
	    ID   required and non-empty; becomes the value of the "id" field.
	         The transformation stops with a message when it is empty.

	  Output fields:
	    id          value of $ID
	    (mapped)    one field per head/meta element whose @name is listed in the
	                m:mapping table below; when $keepunknownfields is true,
	                unlisted meta elements are kept under their own @name
	    $textfield  string value of the body element
	-->
	<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
	    xmlns:m="http://example.com" exclude-result-prefixes="xsl m">

	<xsl:output method="xml" indent="yes" encoding="UTF-8"/>

	<!-- Lookup table: HTML meta name to Solr field name. -->
	<mapping xmlns="http://example.com">
	  <map meta="Author" field="author"/>
	  <map meta="Title" field="title"/>
	</mapping>

	<xsl:param name="ID"/>
	<xsl:variable name="keepunknownfields" select="false()"/>
	<xsl:variable name="textfield" select="'text'"/>
	<!-- document('') is this stylesheet itself, so the table above is read from here. -->
	<xsl:variable name="mapping" select="document('')//m:map"/>

	<!--
	  Elements are matched by local name so that the input is accepted both
	  without a namespace and with the XHTML namespace declared on the root.
	-->
	<xsl:template match="/*[local-name()='html']">
	  <!-- An empty id would produce an unusable document, so fail early. -->
	  <xsl:if test="string($ID) = ''">
	    <xsl:message terminate="yes">html2solr.xsl: parameter ID must not be empty</xsl:message>
	  </xsl:if>
	  <add>
	    <doc>
	      <field name="id"><xsl:value-of select="$ID"/></field>
	      <xsl:for-each select="*[local-name()='head']/*[local-name()='meta']">
	        <xsl:variable name="meta" select="@name"/>
	        <xsl:variable name="map2" select="$mapping[@meta=$meta]/@field"/>
	        <xsl:if test="$map2 or $keepunknownfields">
	          <!-- Mapped name when there is one, otherwise the original meta name. -->
	          <xsl:variable name="fieldname">
	            <xsl:choose>
	              <xsl:when test="$map2"><xsl:value-of select="$map2"/></xsl:when>
	              <xsl:otherwise><xsl:value-of select="$meta"/></xsl:otherwise>
	            </xsl:choose>
	          </xsl:variable>
	          <field name="{$fieldname}">
	            <xsl:value-of select="@content"/>
	          </field>
	        </xsl:if>
	      </xsl:for-each>
	      <field name="{$textfield}">
	        <xsl:value-of select="*[local-name()='body']"/>
	      </field>
	    </doc>
	  </add>
	</xsl:template>

	</xsl:stylesheet>
	#!/bin/bash

	# Simple PDF indexer for Solr.
	#
	# Usage: pdf2solr.sh FILE.pdf [FILE.pdf ...]
	#
	# Each readable PDF is converted to HTML with pdftotext, rewritten as XML
	# with xmllint, turned into a Solr <add> document by html2solr.xsl (looked
	# up relative to the current directory) and POSTed to $URL. The checksum
	# printed by shasum is used as the document id. A <commit/> is sent once
	# after all files. Files that are unreadable or fail to convert are skipped.

	HTML2SOLR=html2solr.xsl
	URL=http://localhost:8983/solr/update

	# Work in a private scratch directory (removed on exit) instead of leaving
	# fixed-name intermediate files behind in the working directory.
	WORKDIR=$(mktemp -d "${TMPDIR:-/tmp}/pdf2solr.XXXXXX") || {
	  echo "pdf2solr: cannot create temporary directory" >&2
	  exit 1
	}
	trap 'rm -rf -- "$WORKDIR"' EXIT
	TMPFILE=$WORKDIR/tmp

	# Iterate over "$@" so that file names containing whitespace or glob
	# characters reach the tools unchanged.
	for PDF in "$@"; do
	  [ -r "$PDF" ] || continue

	  # Hash via stdin so the output does not depend on the file name.
	  SHA=$(shasum < "$PDF" | awk '{print $1}')
	  if [ -z "$SHA" ]; then
	    echo "pdf2solr: cannot compute checksum of $PDF, skipping" >&2
	    continue
	  fi

	  if ! pdftotext -htmlmeta "$PDF" "$TMPFILE.htm"; then
	    echo "pdf2solr: pdftotext failed for $PDF, skipping" >&2
	    continue
	  fi

	  # Parser diagnostics are discarded on purpose; an unusable result is
	  # caught by the xsltproc check below.
	  xmllint --xmlout --dropdtd --html "$TMPFILE.htm" 2> /dev/null > "$TMPFILE.xhtml"

	  # --stringparam passes the id as a plain string, so no XPath quoting is
	  # needed. Do not post anything when the transformation fails.
	  if ! xsltproc --stringparam ID "$SHA" "$HTML2SOLR" "$TMPFILE.xhtml" > "$TMPFILE.xml"; then
	    echo "pdf2solr: xsltproc failed for $PDF, skipping" >&2
	    continue
	  fi

	  f="$TMPFILE.xml"
	  echo "Posting file $f to $URL as $SHA"
	  curl "$URL" --data-binary "@$f" -H 'Content-type:text/xml; charset=utf-8'
	  echo
	done

	# Send the commit command to make sure all the changes are flushed and visible.
	curl "$URL" --data-binary '<commit/>' -H 'Content-type:text/xml; charset=utf-8'
	echo