Skip to content

Instantly share code, notes, and snippets.

@cbeer
Created March 13, 2010 20:57
Show Gist options
  • Save cbeer/331541 to your computer and use it in GitHub Desktop.
Save cbeer/331541 to your computer and use it in GitHub Desktop.
Mapping the NPR API schema to a Solr index using the Data Import Handler URLDataSource
<!-- Data Import Handler configuration: fetches NPR API query results over HTTP
     and maps each story element onto Solr document fields. -->
<dataConfig>
  <dataSource type="URLDataSource"
              name="nprapi"
              encoding="UTF-8"
              connectionTimeout="5000"
              readTimeout="10000"/>
  <document>
    <!-- One document is produced per /nprml/list/story element (see forEach).
         The startNum query parameter of the URL is filled in from the
         startNum parameter of the import request. -->
    <entity name="nprapi"
            processor="XPathEntityProcessor"
            transformer="DateFormatTransformer"
            forEach="/nprml/list/story"
            pk="id"
            url="http://api.npr.org/query?numResults=20&amp;fields=all&amp;output=nprml&amp;apiKey=[redacted]&amp;startNum=${dataimporter.request.startNum}">

      <!-- Identity and links -->
      <field column="id" xpath="/nprml/list/story/@id"/>
      <field column="link" xpath="/nprml/list/story/link[@type='html']"/>
      <field column="api" xpath="/nprml/list/story/link[@type='api']"/>

      <!-- Descriptive metadata -->
      <field column="title" xpath="/nprml/list/story/title"/>
      <field column="description" xpath="/nprml/list/story/teaser"/>
      <field column="author" xpath="/nprml/list/story/byline/name"/>
      <field column="organization" xpath="/nprml/list/story/organization/name"/>

      <!-- Classification -->
      <field column="subject" xpath="/nprml/list/story/priorityKeywords"/>
      <field column="keywords" xpath="/nprml/list/story/keywords"/>
      <field column="category" xpath="/nprml/list/story/parent[@type='topic']/title"/>
      <field column="category_primary" xpath="/nprml/list/story/parent[@type='primaryTopic']/title"/>
      <field column="related" xpath="/nprml/list/story/relatedLink/link[@type='api']"/>

      <!-- Program the story belongs to: element text plus its code attribute -->
      <field column="show" xpath="/nprml/list/story/show/program"/>
      <field column="show_slug" xpath="/nprml/list/story/show/program/@code"/>

      <!-- Audio and imagery -->
      <field column="extent" xpath="/nprml/list/story/audio/duration"/>
      <field column="media" xpath="/nprml/list/story/audio/format/mp3"/>
      <field column="thumbnail" xpath="/nprml/list/story/thumbnail/large"/>

      <!-- storyDate is parsed by DateFormatTransformer using the pattern below -->
      <field column="last_modified"
             xpath="/nprml/list/story/storyDate"
             dateTimeFormat="EEE, dd MMM yyyy HH:mm:ss Z"/>

      <!-- Body text, one value per paragraph element -->
      <field column="transcript" xpath="/nprml/list/story/text/paragraph"/>
    </entity>
  </document>
</dataConfig>
<!-- Index fields for NPR story documents. Every field declared here is indexed
     and stored, except text_rev, which is indexed only. -->
<fields>
  <!-- Identity and links -->
  <field name="id" type="string" indexed="true" stored="true" required="true"/>
  <field name="link" type="string" indexed="true" stored="true"/>
  <field name="api" type="string" indexed="true" stored="true"/>

  <!-- Descriptive metadata -->
  <field name="title" type="text" indexed="true" stored="true" required="true"/>
  <field name="description" type="text" indexed="true" stored="true"/>
  <field name="last_modified" type="date" indexed="true" stored="true"/>

  <!-- Classification -->
  <field name="subject" type="string" indexed="true" stored="true" multiValued="true"/>
  <field name="keywords" type="string" indexed="true" stored="true" multiValued="true"/>
  <field name="category" type="string" indexed="true" stored="true" multiValued="true"/>
  <field name="category_primary" type="string" indexed="true" stored="true" multiValued="true"/>

  <!-- NOTE(review): author is single-valued; if a story can carry more than one
       byline name, indexing that story would presumably be rejected. Confirm
       against the feed before changing. -->
  <field name="author" type="string" indexed="true" stored="true"/>
  <field name="content_type" type="string" indexed="true" stored="true" multiValued="true"/>
  <field name="links" type="string" indexed="true" stored="true" multiValued="true"/>
  <field name="related" type="string" indexed="true" stored="true" multiValued="true"/>
  <field name="comments" type="text" indexed="true" stored="true"/>

  <!-- Program and organization -->
  <field name="show" type="string" indexed="true" stored="true"/>
  <field name="show_slug" type="string" indexed="true" stored="true"/>
  <field name="organization" type="string" indexed="true" stored="true"/>

  <!-- Audio and imagery -->
  <field name="media" type="string" indexed="true" stored="true"/>
  <field name="extent" type="int" indexed="true" stored="true"/>
  <!-- Target for the "thumbnail" column produced by the data import
       configuration; without a matching schema field that value has no
       declared place in the index. Declared like the media field. -->
  <field name="thumbnail" type="string" indexed="true" stored="true"/>

  <field name="transcript" type="text" indexed="true" stored="true" multiValued="true"/>

  <!-- Catchall field, containing all other searchable text fields (implemented
       via copyField further on in this schema). -->
  <field name="text" type="text" indexed="true" stored="true" multiValued="true"/>

  <!-- Catchall text field that indexes tokens both normally and in reverse for
       efficient leading wildcard queries. -->
  <field name="text_rev" type="text_rev" indexed="true" stored="false" multiValued="true"/>

  <field name="payloads" type="payloads" indexed="true" stored="true"/>

  <!-- "timestamp" uses a default value of "NOW" to indicate when each document
       was indexed. -->
  <field name="timestamp" type="date" indexed="true" stored="true" default="NOW" multiValued="false"/>
</fields>
<!-- [...] -->
<!-- Copies into fields matched elsewhere in the schema (show_t and author_t are
     not declared in the field list; presumably a dynamic field covers them -
     TODO confirm). -->
<copyField source="show" dest="show_t"/>
<copyField source="author" dest="author_t"/>

<!-- Priority keywords are also searchable through the general keywords field. -->
<copyField source="subject" dest="keywords"/>

<!-- Catchall text field. -->
<copyField source="payloads" dest="text"/>
<copyField source="transcript" dest="text"/>

<!-- Reversed-token catchall. Solr does not chain copyField rules: values that
     reach "text" through the rules above are NOT passed on by the
     text-to-text_rev rule, so each original source is copied to text_rev
     directly. The text-to-text_rev rule is kept for documents that supply a
     "text" value themselves. -->
<copyField source="payloads" dest="text_rev"/>
<copyField source="transcript" dest="text_rev"/>
<copyField source="text" dest="text_rev"/>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment