-
-
Save stevehanson/7461706 to your computer and use it in GitHub Desktop.
import os | |
import sys | |
# Deployment settings -- adjust these to match your environment.
HOST = "http://localhost:9200"  # base URL that every request is sent to
INDEX = "test"  # first path segment after HOST in the request URLs
TYPE = "attachment"  # second path segment after HOST in the request URLs
TMP_FILE_NAME = "tmp.json"  # scratch file holding the JSON body handed to curl
def main():
    """Index the file named on the command line.

    Takes the file path from ``sys.argv[1]`` and then, in order, calls
    ``createEncodedTempFile``, ``createIndexIfDoesntExist`` and
    ``postFileToTheIndex``. The temporary JSON file is removed afterwards,
    whether or not those steps succeeded.

    Exits with status 1 after printing a usage message when no filename
    argument was supplied.
    """
    if len(sys.argv) < 2:
        print('No filename provided.\nUsage: "python es-attach.py filename".\nExiting...')
        # sys.exit rather than the site-provided exit() helper, and a
        # non-zero status so callers can detect the usage error.
        sys.exit(1)

    fname = sys.argv[1]
    try:
        createEncodedTempFile(fname)
        createIndexIfDoesntExist()
        postFileToTheIndex()
    finally:
        # Clean up even when a step above raised. The file may not exist
        # yet if the encoding step itself failed.
        if os.path.exists(TMP_FILE_NAME):
            os.remove(TMP_FILE_NAME)
def postFileToTheIndex():
    """POST the temporary JSON file to ``HOST/INDEX/TYPE`` with curl.

    The command is echoed before it runs.

    Returns:
        The exit status of the curl process.
    """
    import subprocess

    url = '{}/{}/{}'.format(HOST, INDEX, TYPE)
    print('curl -X POST "{}" -d @{}'.format(url, TMP_FILE_NAME))
    # Pass an argument list instead of a shell string so that nothing in
    # the URL or the file name is re-interpreted by a shell.
    return subprocess.call(['curl', '-X', 'POST', url, '-d', '@' + TMP_FILE_NAME])
def createEncodedTempFile(fname, tmp_file_name=None):
    """Write ``fname`` as a base64-encoded JSON document to a temp file.

    The JSON object has two keys: ``file`` (the base64 text of the file's
    bytes, without embedded line breaks) and ``title`` (``fname`` as given).

    Args:
        fname: Path of the file to encode; it is read in binary mode.
        tmp_file_name: Path of the JSON file to write. Defaults to the
            module-level ``TMP_FILE_NAME``.
    """
    import base64
    import json

    if tmp_file_name is None:
        tmp_file_name = TMP_FILE_NAME

    # Context managers make sure both handles are closed, also on errors.
    with open(fname, 'rb') as source:
        # base64 module instead of the Python-2-only "base64" str codec;
        # decode to text so json can serialise it.
        file64 = base64.b64encode(source.read()).decode('ascii')

    print('writing JSON with base64 encoded file to temp file {}'.format(tmp_file_name))
    with open(tmp_file_name, 'w') as f:
        json.dump({'file': file64, 'title': fname}, f)  # dump json to tmp file
def createIndexIfDoesntExist():
    """PUT the attachment mapping when ``HOST/INDEX/TYPE`` answers 404.

    Sends a HEAD request to ``HOST/INDEX/TYPE``. If that succeeds nothing
    else happens. On an HTTP 404 the mapping is PUT to
    ``HOST/INDEX/TYPE/_mapping`` with curl. Any other HTTP error status is
    printed and otherwise ignored. Non-HTTP errors raised by ``urlopen``
    are not caught here.
    """
    import json
    import subprocess
    import urllib2

    class HeadRequest(urllib2.Request):
        """Request whose HTTP method is HEAD."""

        def get_method(self):
            return "HEAD"

    type_url = '{}/{}/{}'.format(HOST, INDEX, TYPE)

    # check if type exists by sending HEAD request to index
    try:
        response = urllib2.urlopen(HeadRequest(type_url))
    except urllib2.HTTPError as e:
        if e.code != 404:
            print('Failed to retrieve index with error code - %s.' % e.code)
            return
    else:
        # The URL exists; release the connection and leave it untouched.
        response.close()
        return

    print('Index doesnt exist, creating...')
    mapping = {
        # Keyed by TYPE so the body stays in sync with the URL if the
        # constant is reconfigured.
        TYPE: {
            "properties": {
                "file": {
                    "type": "attachment",
                    "fields": {
                        "title": {"store": "yes"},
                        "file": {
                            "term_vector": "with_positions_offsets",
                            "store": "yes",
                        },
                    },
                },
            },
        },
    }
    # Argument list instead of a shell string: the JSON body no longer has
    # to survive shell quoting.
    subprocess.call(
        ['curl', '-X', 'PUT', type_url + '/_mapping', '-d', json.dumps(mapping)]
    )
# kick off the main function only when run as a script, not on import
if __name__ == '__main__':
    main()
@stevehanson Thanks for your example. I ran your script against a PDF. When I query, I see this (only showing part) in the 'file' field. Is it possible to view this in plain english? I ran it through an online base64 decoder and some of the PDF header was readable, but the rest of the text still seems to be encoded in some way. Any thoughts on what I might be doing wrong? Thanks v. much.
"file": "JVBERi0xLjYNJeLjz9MNCjY0ODMgMCBvYmoNPDwvTGluZWFyaXplZCAxL0wgNDMyNjI0L08gNjQ4\nNS9FIDkxNDY4L04gMTQvVCA0MzE4MDgvSCBbIDUwMyAzMzVdPj4NZW5kb2JqDSAgICAgICAgICAg\nDQo2NDk5IDAgb2JqDTw8L0RlY29kZVBhcm1zPDwvQ29sdW1ucyA1L1ByZWRpY3RvciAxMj4+L0Zp\nbHRlci9GbGF0ZURlY29kZS9JRFs8MjREMTc0M0E4MUFCMUY0Q0IyQjNGQjBDQ0I4QjgwMDA+PDZB\nMTIyMUNBODY2NzJDNDZBNUU1N0NGOEQxQjJENEQ4Pl0vSW5kZXhbNjQ4MyAzNl0vSW5mbyA2NDgy\nIDAgUi9MZW5ndGggODkvUHJldiA0MzE4MDkvUm9vdCA2NDg0IDAgUi9TaXplID
@Analect you ever figure this out? I have similar results.
@rboyd, @Analect: Did you install https://github.com/elastic/elasticsearch-mapper-attachments ?
@Analect, did you figure this out? It appears that elasticsearch-mapper-attachments only looks at the first 100,000 chars for indexing. It is possible you are missing parts of the PDF if it is larger than that.
@stevehanson can you please explain the that you have used..I am a bit confused.
Also made a more full-featured script for indexing directories recursively: https://gist.github.com/stevehanson/7462063