Skip to content

Instantly share code, notes, and snippets.

@kzinmr
Last active July 4, 2019 09:26
Show Gist options
  • Save kzinmr/aab8c61c19c7414374015531eb85c68f to your computer and use it in GitHub Desktop.
Save kzinmr/aab8c61c19c7414374015531eb85c68f to your computer and use it in GitHub Desktop.
Extract text from PDF files (exported from .ppt or .doc) using pdfminer.six (https://github.com/pdfminer/pdfminer.six)
import pickle
import subprocess
from pathlib import Path
import sys
from lxml import etree
def extract_textbox_slide(page):
    """Extract one passage per ``textbox`` child of a page element.

    Each textbox is treated as a single passage (1 textbox = 1 passage) and
    is recorded together with the largest character size found inside it.
    The passage with the largest character size is used as an approximation
    of the slide title.

    Args:
        page: An XML element with an integer-like ``id`` attribute whose
            children may include ``textbox`` elements, each made of
            text-line elements that in turn contain character elements
            carrying a ``size`` attribute.

    Returns:
        A dict with the keys ``page`` (int id), ``title`` (passage with the
        largest character size; the first one wins on ties), ``body`` (all
        passages joined by newlines) and ``passages`` (list of
        ``{'size': float, 'passage': str}`` dicts). When the page yields no
        passages, ``title`` and ``body`` are empty strings and ``passages``
        is ``''``.
    """
    page_id = int(page.attrib['id'])
    passages = []
    for textbox in page:
        # textbox ~ passage; any other child tag is ignored.
        if textbox.tag != 'textbox':
            continue
        chars = []
        sizes = []
        for textline in textbox:
            for text in textline:
                # Attribute-less elements are skipped (presumably the
                # whitespace/newline separators emitted by pdfminer --
                # confirm against the XML output).
                if not text.attrib:
                    continue
                if text.text is not None:
                    chars.append(text.text)
                size = text.attrib.get('size')
                if size is not None:
                    # Compare numerically: as strings, "9.000" > "12.000".
                    sizes.append(float(size))
        if not sizes:
            # No sized characters means there is no text to report, and
            # max() of an empty sequence would raise ValueError.
            continue
        passages.append({
            'size': max(sizes),
            'passage': ''.join(chars)
        })
    if not passages:
        # NOTE(review): 'passages' is '' here but a list otherwise; kept
        # as-is so previously written pickles and consumers stay compatible.
        return {
            'page': page_id,
            'title': '',
            'body': '',
            'passages': ''
        }
    title = max(passages, key=lambda p: p['size'])['passage']
    body = '\n'.join(p['passage'] for p in passages)
    return {
        'page': page_id,
        'title': title,
        'body': body,
        'passages': passages
    }
def convert_pdf2xml2dict(filename, outdir):
    """Convert one PDF to XML with ``pdf2txt.py`` and pickle its page data.

    Writes ``{outdir}/{stem}.xml`` (output of ``pdf2txt.py -t xml``) and
    ``{outdir}/{stem}.pkl``, a pickled dict that maps each page's ``id``
    attribute (a string) to the result of ``extract_textbox_slide`` for that
    page, extended with a ``filename`` key holding the PDF's stem.

    Args:
        filename: Path of the input PDF (``str`` or ``pathlib.Path``).
        outdir: Directory that receives the ``.xml`` and ``.pkl`` files.

    Raises:
        subprocess.CalledProcessError: If ``pdf2txt.py`` exits with a
            non-zero status.
        ValueError: If the XML parser returns no root element.
    """
    print(filename)
    stem = Path(filename).stem
    # pdf -> xml
    xmlname = f"{outdir}/{stem}.xml"
    # this command is available via `pip install pdfminer.six chardet`
    # check=True: stop here on a failed conversion instead of going on to
    # parse a missing or stale XML file.
    subprocess.run(
        ["pdf2txt.py", "-t", "xml", "-o", xmlname, str(filename)],
        check=True)
    # Read raw bytes: decoding with the platform default encoding and then
    # re-encoding as UTF-8 corrupts or rejects the text on non-UTF-8 locales.
    with open(xmlname, 'rb') as f:
        xmlstring = f.read()
    # xml -> dict(pkl)
    parser = etree.XMLParser(ns_clean=True, recover=True, encoding='utf-8')
    root = etree.fromstring(xmlstring, parser=parser)
    if root is None:
        # NOTE(review): a recovering parser may yield no root for unusable
        # input -- confirm against lxml's behaviour.
        raise ValueError(f'no XML root element could be parsed from {xmlname}')
    data_dict = {}
    for page in root:
        # Only <page> children carry the 'id' attribute that is read below.
        if page.tag != 'page':
            continue
        entry = extract_textbox_slide(page)
        entry['filename'] = stem
        data_dict[page.attrib['id']] = entry
    with open(f'{outdir}/{stem}.pkl', 'wb') as f:
        pickle.dump(data_dict, f)
if __name__ == '__main__':
    # Usage: <script> <pdf_dir> <out_dir>
    # Validate explicitly: an `assert` is stripped when Python runs with -O.
    if len(sys.argv) != 3:
        sys.exit(f'usage: {sys.argv[0]} <pdf_dir> <out_dir>')
    dirname = sys.argv[1]  # directory which contains input pdf files
    outdir = sys.argv[2]  # directory where output xml and pkl files are exported
    # Create the output directory up front so the xml/pkl files can be written.
    Path(outdir).mkdir(parents=True, exist_ok=True)
    for filename in Path(dirname).glob('*.pdf'):
        # Skip PDFs whose pickle already exists, so reruns only do new work.
        pkl_path = Path(f'{outdir}/{filename.stem}.pkl')
        if not pkl_path.exists():
            convert_pdf2xml2dict(filename, outdir)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment