Last active
December 11, 2024 05:30
-
-
Save dungsaga/ea703c17c57f249c7d7d4346245f24f2 to your computer and use it in GitHub Desktop.
Salvage text from a broken pdf file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import re, zlib, sys | |
def main(argv):
    """Command-line entry point: salvage the text of a (possibly broken) PDF.

    argv[1], when present, is the path of the input PDF; otherwise the PDF is
    read from standard input.  argv[2], when present, is the path of the output
    text file; otherwise the salvaged text is written to standard output.
    """
    # The banner goes to stderr so that it never mixes with the salvaged text
    # when that text is written to stdout.
    print("Salvage text from a broken pdf file", file=sys.stderr)
    print("Usage: salvage-pdf.py [<input_file.pdf> [<output_file.txt>]]", file=sys.stderr)

    # open() accepts either a path or an integer file descriptor.
    source = argv[1] if len(argv) > 1 else sys.stdin.fileno()
    with open(source, "rb") as pdf_file:
        pdf_bytes = pdf_file.read()

    output = salvage_pdf(pdf_bytes)

    target = argv[2] if len(argv) > 2 else sys.stdout.fileno()
    with open(target, "wb") as txt_file:
        txt_file.write(output)
def salvage_pdf(input):
    """Extract readable text from the raw bytes of a PDF file.

    Every FlateDecode stream that can be inflated is collected.  A character
    map is built from all of the inflated streams, then text is extracted from
    each stream in turn.

    input: the complete PDF file content as bytes.
    Returns the non-empty per-stream texts joined by blank lines, as bytes.
    """
    stream_re = re.compile(rb'/FlateDecode.*?stream(.*?)endstream', re.S)
    streams = []
    for raw in stream_re.findall(input):
        # Remove only the end-of-line marker that follows the `stream` keyword.
        # Stripping the tail as well would eat a final compressed byte that
        # happens to be 0x0A or 0x0D; zlib.decompress() does not need the
        # trailing end-of-line removed because it stops at the end of the
        # zlib stream.
        raw = raw.lstrip(b'\r\n')
        try:
            streams.append(zlib.decompress(raw))
        except zlib.error:
            continue  # damaged or non-zlib stream: skip it, salvage the rest
    charmap = extract_charmap(streams)
    # print(charmap)
    pieces = (extract_text(data, charmap) for data in streams)
    return b"\n\n".join(piece for piece in pieces if piece)  # drop empty results
# https://stackoverflow.com/questions/40036588/in-pdf-if-encoding-and-tounicode-are-both-present-in-pdf-how-to-map-the-text-e | |
def extract_charmap(streams):
    """Build a glyph-code lookup table from the `beginbfchar` sections.

    streams: iterable of decompressed stream contents (bytes).
    Returns a dict mapping each source code to its destination code.  Keys and
    values are the hex digits found between the angle brackets, as bytes
    (e.g. {b'0003': b'0020'}).  When a source code appears more than once, the
    last occurrence wins.
    """
    charmap = {}
    # TODO: add support for beginbfrange\n(.+)\nendbfrange
    section_re = re.compile(rb"beginbfchar\n(.+?)\nendbfchar", re.DOTALL)
    pair_re = re.compile(rb'<(.+?)> *<(.+?)>')
    for data in streams:
        # A single stream may contain several beginbfchar sections; read all.
        for section in section_re.finditer(data):
            for entry in section.group(1).split(b"\n"):
                pair = pair_re.search(entry)
                if pair:  # skip blank or malformed lines instead of crashing
                    charmap[pair.group(1)] = pair.group(2)
    return charmap
# https://stackoverflow.com/questions/29467539/encoding-of-pdf-text-string | |
def extract_text(data, charmap):
    """Return the text shown by one decompressed stream, one line per operator.

    data: decompressed stream content (bytes).
    charmap: lookup table passed through to extract_line().
    Returns b'' when the stream has no `BT` marker followed by a space or a
    newline; otherwise the decoded lines joined with b"\\n".
    """
    has_text_marker = re.search(rb'BT[ \n]', data)
    if has_text_marker is None:
        return b''
    decoded = []
    for candidate in data.split(b"\n"):
        # keep only the lines ending with Tj or TJ
        if candidate.endswith((b'Tj', b'TJ')):
            decoded.append(extract_line(candidate, charmap))
    return b"\n".join(decoded)
def extract_line(line, charmap):
    """Decode one Tj/TJ line and expand its three-digit backslash escapes.

    line: one line of a content stream (bytes).
    charmap: lookup table passed through to decode_line().
    Returns the decoded text as UTF-8 bytes.
    """
    decoded = decode_line(line, charmap)

    def escape_to_utf8(match):
        # The three digits are read as an octal byte value and that byte is
        # interpreted as cp1252, e.g. \222 becomes ’ and \225 becomes •.
        raw = num2char(match.group(1), 8, 'latin-1')
        # cp1252 leaves a few byte values (e.g. 0x81) undefined; 'replace'
        # turns those into U+FFFD instead of raising UnicodeDecodeError.
        return raw.decode('cp1252', errors='replace').encode('utf-8')

    return re.sub(rb'\\([0-9]{3})', escape_to_utf8, decoded)
def decode_line(line, charmap):
    """Pull the string operands out of one Tj/TJ line.

    line: one line of a content stream (bytes) ending with Tj or TJ.
    charmap: dict mapping 4-hex-digit glyph codes (bytes) to destination hex
        codes (bytes), as built by extract_charmap().

    Hex operands (<xxxx>Tj or [<xxxx>...<yyyy>]TJ) are translated through
    charmap four hex digits at a time and then converted to UTF-8 bytes; codes
    missing from charmap are dropped.  Literal operands ((xxx)Tj or
    [(xxx)...(yyy)]TJ) are returned with the escaping of parentheses removed;
    other backslash escapes are left in place.
    """
    if re.search(rb'<.+>Tj|\[<.+>\]TJ', line):
        # remove everything except the hex digits inside brackets:
        # <xxx>Tj or [<xxx>...<yyy>]TJ
        hex_codes = re.sub(rb'^.*?<|>Tj|>\]TJ|>.*?<', b'', line)
        # The fallback must be bytes: returning a str from the callback of a
        # bytes substitution raises TypeError for every unmapped code.
        mapped = re.sub(rb'.{4}', lambda m: charmap.get(m.group()) or b'', hex_codes)
        return re.sub(rb'.{4}', lambda m: num2char(m.group()), mapped)
    # remove everything except plain text inside brackets:
    # (xxx)Tj or [(xxx)...(yyy)]TJ
    text = re.sub(rb'^.*?\(|\)Tj|\)\]TJ|\)[^)]*?\(', b'', line)
    # remove the escape in front of brackets
    return re.sub(rb'\\([\(\)])', rb'\1', text)
def num2char(hex, base=16, encoding='utf-8'):
    """Convert a textual number into the encoded bytes of that code point.

    hex: the digits to parse (str or bytes), interpreted in `base`.
    base: numeric base of the digits (16 by default).
    encoding: codec used to encode the resulting character.
    Returns the encoded character.  On any failure the offending input and the
    error are printed and b'?' is returned instead.
    """
    try:
        code_point = int(hex, base)
        encoded = chr(code_point).encode(encoding)
    except Exception as err:
        print(hex)
        print(err)
        return b'?'
    return encoded
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main(sys.argv)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment