Created
March 9, 2011 15:32
-
-
Save cburgmer/862398 to your computer and use it in GitHub Desktop.
Extracting a file with Solr & Tika for indexing using Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Send a local file to Solr's "/update/extract" handler with
# extractOnly=true and read the extracted content back from the JSON
# response. This is Python 2 code (urllib2 + the third-party "poster"
# package).
import json
import os
import urllib2

from poster.encode import multipart_encode
from poster.streaminghttp import register_openers

# Register poster to urllib2
register_openers()

SOLR_URL = 'http://localhost:8080/solr/'
filepath = '/tmp/test.pdf'
# Multipart field name under which the file is uploaded. The original
# script used this name without ever defining it (NameError).
basename = os.path.basename(filepath)

# Keep the file open until the request has been sent, then close it
# deterministically instead of leaking the handle.
# NOTE(review): poster presumably reads the file while the request body is
# being sent inside urlopen() -- confirm against the poster documentation.
with open(filepath, "rb") as upload:
    datagen, headers = multipart_encode({'extractOnly': 'true',
                                         'wt': 'json',
                                         basename: upload})
    request = urllib2.Request(SOLR_URL.rstrip('/') + "/update/extract",
                              datagen, headers)
    p = urllib2.urlopen(request)

try:
    if p.getcode() != 200:
        raise Exception("Error reading file %r with solr" % filepath)
    result = json.loads(p.read())
finally:
    # Always release the HTTP connection, including on the error path.
    p.close()

# NOTE(review): the response is looked up by the full path, exactly as the
# original did -- verify which key your Solr version actually uses for the
# extracted content.
if filepath not in result:
    raise Exception("Solr error parsing file content %r: %r" % (filepath, result))
content = result[filepath]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment