Created
December 8, 2015 07:15
-
-
Save benzkji/e6f6ab28e65560856bfb to your computer and use it in GitHub Desktop.
haystack index for django-filer PDFs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
from __future__ import unicode_literals | |
import os | |
from pdfminer.pdfdocument import PDFEncryptionError | |
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter | |
from pdfminer.converter import TextConverter | |
from pdfminer.layout import LAParams | |
from pdfminer.pdfpage import PDFPage | |
from cStringIO import StringIO | |
from haystack import indexes | |
from filer.models import File | |
class PDFIndex(indexes.SearchIndex, indexes.Indexable):
    """Haystack search index over django-filer ``File`` objects that are PDFs.

    The indexed document text is the file name, the file URL and the text
    extracted from the PDF, joined by single spaces.
    """

    text = indexes.CharField(document=True)
    title = indexes.CharField()
    url = indexes.CharField()
    # Running count of prepared documents; only used for the progress output
    # printed by ``prepare_text``.
    counter = 0

    def get_model(self):
        """Return the model class handled by this index."""
        return File

    def index_queryset(self, using=None):
        """Used when the entire index for model is updated."""
        manager = self.get_model().objects
        # Only files whose stored name ends in ".pdf" are indexed.
        return manager.filter(file__endswith='.pdf')

    def get_updated_field(self):
        """Return the name of the field Haystack should use to find updates."""
        return "modified_at"

    def prepare_text(self, object):
        """Build the main document text for ``object`` and print progress."""
        self.counter += 1
        print("--- %s ---------------------" % self.counter)
        print(object)
        print(object.url)
        extracted = convert_pdf_to_txt(object.path)
        pieces = (
            self.prepare_title(object),
            self.prepare_url(object),
            extracted,
        )
        combined = "%s %s %s" % pieces
        print("--- ok -----")
        return combined

    def prepare_title(self, object):
        """Return the stored file name of ``object`` as its title."""
        return object.file.name

    def prepare_url(self, object):
        """Return the URL of ``object``."""
        return object.url
def convert_pdf_to_txt(path):
    """Extract the text of the PDF file at ``path`` using pdfminer.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The extracted text decoded from UTF-8, or an empty string when
        ``path`` is not an existing regular file or when a
        ``UnicodeDecodeError`` / ``PDFEncryptionError`` is raised while the
        pages are read.

    The file handle, the converter device and the output buffer are always
    closed, including on the early-return error paths.
    """
    if not os.path.isfile(path):
        print("file not existing: %s" % path)
        return ''

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        with open(path, 'rb') as fp:
            try:
                # Empty page set and maxpages=0 are passed exactly as before;
                # the password is the empty string.
                pages = PDFPage.get_pages(
                    fp,
                    set(),
                    maxpages=0,
                    password="",
                    caching=True,
                    check_extractable=True,
                )
                for page in pages:
                    interpreter.process_page(page)
            except UnicodeDecodeError:
                print("ENCRYPTED PDF DETECTED (probably, but no unknown encryption)!")
                return ''
            except PDFEncryptionError:
                print("UNKNOWN ENCRYPTION DETECTED")
                return ''
        # Read the buffer before the ``finally`` clause closes it.
        return retstr.getvalue().decode('utf-8')
    finally:
        device.close()
        retstr.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment