Skip to content

Instantly share code, notes, and snippets.

@dungsaga
Last active December 11, 2024 05:30
Show Gist options
  • Save dungsaga/ea703c17c57f249c7d7d4346245f24f2 to your computer and use it in GitHub Desktop.
Save dungsaga/ea703c17c57f249c7d7d4346245f24f2 to your computer and use it in GitHub Desktop.
Salvage text from a broken pdf file
#!/usr/bin/env python3
import re, zlib, sys
def main(argv):
    """Command-line entry point: salvage text from a PDF and write it out.

    Args:
        argv: Argument vector in ``sys.argv`` form. ``argv[1]`` (optional) is
            the input PDF path; standard input is read when it is absent.
            ``argv[2]`` (optional) is the output path; standard output is
            written when it is absent.
    """
    # The banner goes to stderr so it can never mix with the salvaged text,
    # which is written to stdout by default.
    print("Salvage text from a broken pdf file", file=sys.stderr)
    print("Usage: salvage-pdf.py [<input_file.pdf> [<output_file.txt>]]", file=sys.stderr)

    if len(argv) > 1:
        # Context manager guarantees the input handle is closed.
        with open(argv[1], "rb") as source:
            data = source.read()
    else:
        data = sys.stdin.buffer.read()

    output = salvage_pdf(data)

    if len(argv) > 2:
        with open(argv[2], "wb") as target:
            target.write(output)
    else:
        sys.stdout.buffer.write(output)
        sys.stdout.buffer.flush()
def salvage_pdf(input):
    """Recover text from the Flate-compressed streams of a PDF.

    Every ``/FlateDecode ... stream ... endstream`` span is located with a
    regular expression (no PDF structure is parsed), decompressed, and then
    scanned for a character map and for text-showing lines.

    Args:
        input: Raw bytes of the (possibly damaged) PDF file.

    Returns:
        Bytes holding the text of each stream that produced any, with
        streams separated by a blank line.
    """
    stream = re.compile(rb'/FlateDecode.*?stream(.*?)endstream', re.S)
    streams = []
    for s in stream.findall(input):
        # Remove only the end-of-line marker that follows the "stream"
        # keyword. Trailing bytes are left alone: a 0x0A/0x0D at the end may
        # be part of the compressed data, and zlib.decompress ignores
        # anything that follows the end of the compressed stream anyway.
        s = s.lstrip(b'\r\n')
        try:
            streams.append(zlib.decompress(s))
        except zlib.error:
            pass  # best effort: skip streams that are damaged or not zlib data
    charmap = extract_charmap(streams)
    text = [extract_text(data, charmap) for data in streams]
    output = [line for line in text if line]  # drop streams that gave no text
    return b"\n\n".join(output)
# https://stackoverflow.com/questions/40036588/in-pdf-if-encoding-and-tounicode-are-both-present-in-pdf-how-to-map-the-text-e
def extract_charmap(streams):
    """Collect character-code mappings from ``beginbfchar`` sections.

    Every ``beginbfchar ... endbfchar`` section of every stream is scanned,
    and each ``<src> <dst>`` pair found in it is recorded. Text that does not
    form such a pair (blank lines, stray carriage returns) is ignored.

    Args:
        streams: Iterable of decompressed stream contents (bytes).

    Returns:
        Dict mapping the source hex digits to the destination hex digits,
        both as the bytes found between ``<`` and ``>``. Later definitions
        of the same source code overwrite earlier ones.
    """
    section = re.compile(rb"beginbfchar(.+?)endbfchar", re.DOTALL)
    pair = re.compile(rb"<([^<>]+)>\s*<([^<>]+)>")
    charmap = {}
    # TODO: add support for beginbfrange\n(.+)\nendbfrange
    for data in streams:
        for m in section.finditer(data):
            # findall yields (src, dst) tuples, which dict.update accepts.
            charmap.update(pair.findall(m.group(1)))
    return charmap
# https://stackoverflow.com/questions/29467539/encoding-of-pdf-text-string
def extract_text(data, charmap):
    """Return the text shown by one decompressed stream.

    Args:
        data: Decompressed stream contents (bytes).
        charmap: Character map handed on to ``extract_line``.

    Returns:
        The decoded lines joined with ``\\n``, or ``b''`` when the stream has
        no ``BT`` marker followed by a space or newline.
    """
    if re.search(rb'BT[ \n]', data) is None:
        return b''
    decoded = []
    for raw in data.split(b"\n"):
        # keep only lines that end with a text-showing operator
        if raw.endswith((b'Tj', b'TJ')):
            decoded.append(extract_line(raw, charmap))
    return b"\n".join(decoded)
def extract_line(line, charmap):
    """Decode one text-showing line and expand its octal escapes.

    Args:
        line: A content-stream line ending in ``Tj`` or ``TJ`` (bytes).
        charmap: Character map handed on to ``decode_line``.

    Returns:
        The decoded line with every backslash + three-digit escape replaced
        by the UTF-8 encoding of the matching cp1252 character
        (e.g. ``\\222`` becomes the right single quote, ``\\225`` a bullet).
    """
    l = decode_line(line, charmap)

    def octal_to_utf8(m):
        raw = num2char(m.group(1), 8, 'latin-1')
        # A few byte values (0x81, 0x8D, 0x8F, 0x90, 0x9D) are undefined in
        # cp1252; substitute U+FFFD for them instead of aborting the run.
        return raw.decode('cp1252', errors='replace').encode('utf-8')

    return re.sub(rb'\\([0-9]{3})', octal_to_utf8, l)
def decode_line(line, charmap):
    """Strip the operators from a text-showing line and return its text.

    Two operand forms are handled:

    * hex strings, ``<xxxx>Tj`` or ``[<xxxx>...<yyyy>]TJ``: the hex digits
      are taken four at a time, translated through ``charmap``, and the
      translated digits are converted to characters with ``num2char``;
    * literal strings, ``(text)Tj`` or ``[(text)...(more)]TJ``: the text
      inside the parentheses is kept and ``\\(`` / ``\\)`` are unescaped.

    Whitespace between the operand and the operator is accepted.

    Args:
        line: A content-stream line ending in ``Tj`` or ``TJ`` (bytes).
        charmap: Dict mapping 4-hex-digit codes (bytes) to hex digits (bytes).

    Returns:
        The extracted text as bytes. Hex codes missing from ``charmap`` are
        dropped.
    """
    if re.search(rb'<.+>\s*Tj|\[\s*<.+>\s*\]\s*TJ', line):
        # remove everything except the hex digits inside the angle brackets
        l = re.sub(rb'^.*?<|>\s*Tj|>\s*\]\s*TJ|>.*?<', b'', line)
        # The fallback must be bytes: returning a str from a bytes
        # substitution raises TypeError for every unmapped code.
        mapped = re.sub(rb'.{4}', lambda m: charmap.get(m.group()) or b'', l)
        string = re.sub(rb'.{4}', lambda m: num2char(m.group()), mapped)
        return string
    # remove everything except the text inside the parentheses
    l = re.sub(rb'^.*?\(|\)\s*Tj|\)\s*\]\s*TJ|\)[^)]*?\(', b'', line)
    # remove the escape in front of literal parentheses
    l = re.sub(rb'\\([\(\)])', rb'\1', l)
    return l
def num2char(hex, base=16, encoding='utf-8'):
    """Convert a textual code point to its encoded character.

    Args:
        hex: Digits of the code point (str or bytes), written in ``base``.
        base: Numeric base of ``hex``; 16 by default.
        encoding: Codec used to encode the resulting character.

    Returns:
        The encoded character, or ``b'?'`` when the digits cannot be parsed
        or the character cannot be encoded.
    """
    try:
        return chr(int(hex, base)).encode(encoding)
    except Exception as e:
        # Report on stderr: stdout may be carrying the salvaged text.
        print(hex, file=sys.stderr)
        print(e, file=sys.stderr)
        return b'?'
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main(sys.argv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment