Last active
July 4, 2019 09:26
-
-
Save kzinmr/aab8c61c19c7414374015531eb85c68f to your computer and use it in GitHub Desktop.
Extract text from PDF files (e.g. exported from .ppt or .doc) using pdfminer.six (https://github.com/pdfminer/pdfminer.six).
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pickle | |
import subprocess | |
from pathlib import Path | |
import sys | |
from lxml import etree | |
def extract_textbox_slide(page):
    """Extract passages from one pdfminer XML ``<page>`` element.

    Treats each ``<textbox>`` as one passage and records its maximum font
    size; the passage with the largest font size is taken as an
    approximation of the slide title.

    Args:
        page: An XML element with an integer ``id`` attribute whose
            ``<textbox>`` children contain ``<textline>`` elements, which
            in turn contain ``<text>`` elements carrying a ``size``
            attribute (pdfminer's XML output layout).

    Returns:
        dict with keys ``page`` (int), ``title`` (str), ``body`` (str,
        passages joined by newlines) and ``passages`` (list of
        ``{'size': float, 'passage': str}``; empty list for empty pages).
    """
    passages = []
    page_id = int(page.attrib['id'])
    for textbox in page:
        # Skip non-textbox children (e.g. <figure>, <layout>).
        if textbox.tag != 'textbox':
            continue
        passage = ''.join(
            ''.join(text.text for text in textline
                    if text.text is not None and text.attrib)
            for textline in textbox)
        # Compare sizes numerically: the attribute values are strings, and
        # a lexicographic max would rank '9.000' above '12.000'.
        sizes = [float(text.attrib['size'])
                 for textline in textbox
                 for text in textline
                 if 'size' in text.attrib]
        passages.append({
            # default=0.0 guards textboxes with no sized text at all
            'size': max(sizes, default=0.0),
            'passage': passage,
        })
    if not passages:
        # Keep the field types consistent with the non-empty branch
        # (passages is always a list).
        return {
            'page': page_id,
            'title': '',
            'body': '',
            'passages': []
        }
    max_passage = max(passages, key=lambda p: p['size'])['passage']
    return {
        'page': page_id,
        'title': max_passage,
        'body': '\n'.join(p['passage'] for p in passages),
        'passages': passages
    }
def convert_pdf2xml2dict(filename, outdir):
    """Convert one PDF to pdfminer XML, then pickle a per-page dict.

    Runs the ``pdf2txt.py`` CLI (installed by
    ``pip install pdfminer.six chardet``) to produce ``<outdir>/<stem>.xml``,
    parses it, extracts per-page passages via :func:`extract_textbox_slide`,
    and writes the result to ``<outdir>/<stem>.pkl``.

    Args:
        filename: Path to the input PDF (str or ``pathlib.Path``).
        outdir: Directory that receives the ``.xml`` and ``.pkl`` outputs.

    Raises:
        subprocess.CalledProcessError: If ``pdf2txt.py`` exits non-zero.
    """
    print(filename)
    stem = Path(filename).stem
    xml_path = Path(outdir) / f"{stem}.xml"
    # pdf -> xml; check=True so a conversion failure is reported here
    # instead of surfacing later as a confusing missing-file error.
    subprocess.run(
        ["pdf2txt.py", "-t", "xml", "-o", str(xml_path), str(filename)],
        check=True)
    # Read raw bytes and let the XML parser honor the declared encoding;
    # decoding with the locale default and re-encoding is lossy on
    # non-UTF-8 systems.
    xmlstring = xml_path.read_bytes()
    # recover=True tolerates the occasionally malformed XML pdfminer emits.
    parser = etree.XMLParser(ns_clean=True, recover=True, encoding='utf-8')
    # fromstring already returns the root <pages> element.
    pages = etree.fromstring(xmlstring, parser=parser)
    data_dict = {
        page.attrib['id']: extract_textbox_slide(page)
        for page in pages
    }
    # Tag every page record with its source file for later lookup.
    for page_record in data_dict.values():
        page_record['filename'] = stem
    with open(Path(outdir) / f"{stem}.pkl", 'wb') as f:
        pickle.dump(data_dict, f)
if __name__ == '__main__':
    # Usage: script.py <pdf_dir> <out_dir>
    # <pdf_dir>: directory containing input PDF files
    # <out_dir>: directory where the XML and pickle outputs are written
    if len(sys.argv) != 3:
        # Explicit exit instead of `assert`: asserts vanish under `python -O`.
        sys.exit(f"usage: {sys.argv[0]} <pdf_dir> <out_dir>")
    dirname = sys.argv[1]
    outdir = sys.argv[2]
    for pdf_path in Path(dirname).glob('*.pdf'):
        # Skip PDFs whose pickle output already exists (resumable runs).
        pkl_path = Path(outdir) / f"{pdf_path.stem}.pkl"
        if not pkl_path.exists():
            convert_pdf2xml2dict(pdf_path, outdir)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment