Last active
June 20, 2019 08:02
-
-
Save benzkji/6726fb52cad9824809687fa2a824a3c5 to your computer and use it in GitHub Desktop.
filer haystack pdf integration
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# full of prints, forgive me ;-) | |
# coding: utf-8 | |
from __future__ import unicode_literals | |
import os | |
from pdfminer.pdfdocument import PDFEncryptionError | |
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter | |
from pdfminer.converter import TextConverter | |
from pdfminer.layout import LAParams | |
from pdfminer.pdfpage import PDFPage | |
from StringIO import StringIO | |
from haystack import indexes | |
from filer.models import File | |
class PDFIndex(indexes.SearchIndex, indexes.Indexable):
    """Haystack search index over the PDF files stored as filer ``File`` objects.

    The indexed document is built from the original filename, the file URL
    and the text extracted from the PDF by ``convert_pdf_to_txt``.
    """

    # Main document field; its content is assembled in ``prepare_text``.
    text = indexes.CharField(document=True)
    # Filled by ``prepare_title`` / ``prepare_type`` / ``prepare_url``.
    title = indexes.CharField()
    type = indexes.CharField()
    url = indexes.CharField()
    # Number of files processed so far; only used for the progress output
    # printed by ``prepare_text``.
    counter = 0

    def get_model(self):
        """Return the model class this index is built for."""
        return File

    def index_queryset(self, using=None):
        """Used when the entire index for model is updated.

        Only files whose stored name ends in ``.pdf`` are indexed. The match
        is case-insensitive so that uploads named ``REPORT.PDF`` are picked up
        on every database backend, not only on those whose ``LIKE`` happens to
        ignore case.
        """
        return self.get_model().objects.filter(file__iendswith='.pdf')

    def get_updated_field(self):
        """Return the name of the model field used to detect updated objects."""
        return "modified_at"

    def prepare_text(self, object):
        """Build the document text: title, URL and the extracted PDF text.

        Progress information is printed for every processed file.
        """
        self.counter += 1
        # Single-argument print calls behave identically on Python 2 and 3.
        print("-- %s ---------------------" % self.counter)
        print(object)
        print(object.url)
        pdf_text = convert_pdf_to_txt(object.path)
        document = "%s %s %s" % (
            self.prepare_title(object),
            self.prepare_url(object),
            pdf_text,
        )
        print("---")
        return document

    def prepare_title(self, object):
        """Return the file's original filename as the title."""
        return object.original_filename

    def prepare_type(self, object):
        """Return the constant type label stored for every entry of this index."""
        return 'PDF'

    def prepare_url(self, object):
        """Return the URL of the file."""
        return object.url
def convert_pdf_to_txt(path):
    """Extract the text content of the PDF file at ``path``.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The extracted text, decoded from UTF-8. An empty string is returned
        when ``path`` is not an existing file, or when reading the pages
        raises ``UnicodeDecodeError`` or ``PDFEncryptionError`` (both are
        treated as "encrypted, cannot be indexed").

    The file handle, the converter device and the output buffer are released
    on every exit path, including the error paths.
    """
    if not os.path.isfile(path):
        print("file not existing: %s" % path)
        return ''

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    try:
        # ``with`` guarantees the file is closed even when a page fails.
        with open(path, 'rb') as fp:
            pages = PDFPage.get_pages(
                fp,
                set(),  # no page filter
                maxpages=0,
                password="",
                caching=True,
                check_extractable=True,
            )
            for page in pages:
                interpreter.process_page(page)
    except UnicodeDecodeError:
        print("ENCRYPTED PDF DETECTED (probably)!")
        return ''
    except PDFEncryptionError:
        print("UNKNOWN ENCRYPTION DETECTED")
        return ''
    else:
        # Read the buffer before the ``finally`` clause closes it.
        text = retstr.getvalue()
    finally:
        device.close()
        retstr.close()

    # The converter writes UTF-8 encoded bytes; hand back text.
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    return text
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment