-
-
Save terencezl/61fe3f28c44a763dd1e9f060b8ff6f2e to your computer and use it in GitHub Desktop.
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter | |
from pdfminer.converter import TextConverter, XMLConverter, HTMLConverter | |
from pdfminer.layout import LAParams | |
from pdfminer.pdfpage import PDFPage | |
from io import BytesIO | |
def convert_pdf(path, format='text', codec='utf-8', password=''):
    """Extract the contents of a PDF file as text, HTML or XML.

    Args:
        path: Filesystem path of the PDF to read.
        format: Output flavour; one of ``'text'``, ``'html'`` or ``'xml'``.
        codec: Encoding handed to the pdfminer converter and used to decode
            the bytes it produced back into a ``str``.
        password: Password passed to ``PDFPage.get_pages`` for the document.

    Returns:
        The converted document as a single string.

    Raises:
        ValueError: If ``format`` is not one of the supported values.
    """
    # Validate the requested format first so a bad argument fails before
    # any file or converter is created.
    if format == 'text':
        converter_cls = TextConverter
    elif format == 'html':
        converter_cls = HTMLConverter
    elif format == 'xml':
        converter_cls = XMLConverter
    else:
        raise ValueError('provide format, either text, html or xml!')

    rsrcmgr = PDFResourceManager()
    laparams = LAParams()

    with BytesIO() as retstr:
        device = converter_cls(rsrcmgr, retstr, codec=codec, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            with open(path, 'rb') as fp:
                # An empty page set with maxpages=0 means "process every page".
                pages = PDFPage.get_pages(
                    fp,
                    set(),
                    maxpages=0,
                    password=password,
                    caching=True,
                    check_extractable=True,
                )
                for page in pages:
                    interpreter.process_page(page)
        finally:
            # Close the device BEFORE reading the buffer: the HTML and XML
            # converters write their closing markup in close(), so reading
            # first would return a truncated document.
            device.close()

        # Decode with the same codec the converter encoded with; fall back to
        # UTF-8 when the caller passed a falsy codec.
        return retstr.getvalue().decode(codec or 'utf-8')
init() got an unexpected keyword argument 'codec' ... ??
EDIT: class TextConverter(PDFConverter)
does not take a codec argument
init() got an unexpected keyword argument 'codec' ... ??
EDIT:
class TextConverter(PDFConverter)
does not take a codec argument
def __init__(self, rsrcmgr, outfp, pageno=1, laparams=None,
showpageno=False, imagewriter=None):
PDFConverter.__init__(self, rsrcmgr, outfp, pageno=pageno, laparams=laparams)
While running the above code, I get this error:
Traceback (most recent call last):
File "C:/Users//AppData/Roaming/JetBrains/PyCharmCE2021.1/scratches/scratch_10.py", line 41, in
if name == main():
File "C:/Users//AppData/Roaming/JetBrains/PyCharmCE2021.1/scratches/scratch_10.py", line 38, in main
out = convert_pdf(fileName, codec)
File "C:/Users//AppData/Roaming/JetBrains/PyCharmCE2021.1/scratches/scratch_10.py", line 18, in convert_pdf
device = XMLConverter(rsrcmgr, retstr, laparams=laparams)
File "C:\Users*\PycharmProjects\gcs-authoring_authoring-service\venv\lib\site-packages\pdfminer\converter.py", line 407, in init
self.write_header()
File "C:\Users*****\PycharmProjects\gcs-authoring_authoring-service\venv\lib\site-packages\pdfminer\converter.py", line 411, in write_header
self.outfp.write('\n')
TypeError: a bytes-like object is required, not 'str'
nice! thanks for that!