Created
August 16, 2020 16:54
-
-
Save yashkumaratri/222c750b1ff4cf883eca00b0b839ff0d to your computer and use it in GitHub Desktop.
PDFMINER no spaces between words
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#https://stackoverflow.com/questions/49457443/python-pdfminer-converts-pdf-file-into-one-chunk-of-string-with-no-spaces-betwee | |
# | |
# According to this thread, some PDFs mark their entire text as a figure, and by default PDFMiner does not perform layout analysis on figure text. To override this behavior, the all_texts parameter needs to be set to True.
# | |
import io | |
import pdfminer | |
from pdfminer.converter import TextConverter | |
from pdfminer.pdfinterp import PDFPageInterpreter | |
from pdfminer.pdfinterp import PDFResourceManager | |
from pdfminer.pdfpage import PDFPage | |
# Layout-analysis parameters shared by every extraction in this module.
# all_texts=True asks PDFMiner to run layout analysis on all text, including
# text that the PDF marks as part of a figure.
laparams = pdfminer.layout.LAParams(all_texts=True)
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF file.

    Each page is processed by a ``TextConverter`` that writes into an
    in-memory buffer and uses the module-level ``laparams`` for layout
    analysis.

    Args:
        pdf_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        The extracted text, or ``None`` when no text was extracted.
    """
    resource_manager = PDFResourceManager()
    # The buffer is managed by ``with`` and the converter by ``finally`` so
    # both are closed even when opening or parsing the PDF raises.
    with io.StringIO() as text_buffer:
        converter = TextConverter(
            resource_manager, text_buffer, laparams=laparams
        )
        try:
            page_interpreter = PDFPageInterpreter(resource_manager, converter)
            with open(pdf_path, 'rb') as fh:
                pages = PDFPage.get_pages(
                    fh, caching=True, check_extractable=True
                )
                for page in pages:
                    page_interpreter.process_page(page)
            # Read the buffer before it is closed on leaving the ``with``.
            text = text_buffer.getvalue()
        finally:
            converter.close()
    # Keep the original contract: an empty result is reported as None.
    return text or None
# Example usage: extract the text of 'test.pdf' and keep it in ``text``.
text = extract_text_from_pdf(pdf_path='test.pdf')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment