emlyn · June 25, 2024 22:21
diff --git a/wordextract.py b/wordextract.py
 #!/usr/bin/env python3

 # Extract all embedded images from a Word document in their full resolution.
 # In Word: File, Save As, choose XML format
 # Then run the resulting Word XML file through this script.

 import sys
 import base64
 from xml.dom.minidom import parse


 def main(fname):
    """Extract the stored image parts of a Word XML file into the working directory.

    Every ``pkg:part`` element of the document is reported on stdout.  Parts
    whose ``pkg:contentType`` does not start with ``image/`` are skipped, as
    are image parts whose ``pkg:compression`` attribute is anything other
    than ``store``.  The remaining parts have the base64 text of their first
    ``pkg:binaryData`` element decoded and written to a file named after the
    part (leading ``/`` removed, remaining ``/`` replaced by ``_``).

    Args:
        fname: Path of the Word XML file to read.
    """
    dom = parse(fname)
    for number, part in enumerate(dom.getElementsByTagName('pkg:part'), 1):
        # Use separate local names so the `fname` argument is never clobbered.
        part_name = part.getAttribute('pkg:name')
        ctype = part.getAttribute('pkg:contentType')
        print(f"\nPart {number}: {part_name} ({ctype})")
        if not ctype.startswith('image/'):
            print('Skipping (not an image)')
        elif (comp := part.getAttribute('pkg:compression')) != 'store':
            # The parentheses are required: `:=` binds more loosely than `!=`,
            # so without them `comp` would hold the comparison result (True)
            # rather than the attribute value shown in the message.
            print(f'Skipping (compressed: {comp})')
        else:
            out_name = part_name.lstrip('/').replace('/', '_')
            print(f'Extracting to {out_name}')
            data = part.getElementsByTagName('pkg:binaryData')[0].firstChild.data
            with open(out_name, 'wb') as f:
                f.write(base64.b64decode(data))

                
 if __name__ == '__main__':
    # Require exactly one argument so that a wrong invocation produces a
    # usage message instead of a TypeError traceback from main().
    if len(sys.argv) != 2:
        sys.exit(f'Usage: {sys.argv[0]} WORD_XML_FILE')
    main(sys.argv[1])
	#!/usr/bin/env python3

	# Extract all embedded images from a Word document in their full resolution.
	# In Word: File, Save As, choose XML format
	# Then run the resulting Word XML file through this script.

	import sys
	import base64
	from xml.dom.minidom import parse


	def main(fname):
	    """Extract the stored image parts of a Word XML file into the working directory.

	    Every ``pkg:part`` element of the document is reported on stdout.  Parts
	    whose ``pkg:contentType`` does not start with ``image/`` are skipped, as
	    are image parts whose ``pkg:compression`` attribute is anything other
	    than ``store``.  The remaining parts have the base64 text of their first
	    ``pkg:binaryData`` element decoded and written to a file named after the
	    part (leading ``/`` removed, remaining ``/`` replaced by ``_``).

	    Args:
	        fname: Path of the Word XML file to read.
	    """
	    dom = parse(fname)
	    for number, part in enumerate(dom.getElementsByTagName('pkg:part'), 1):
	        # Use separate local names so the `fname` argument is never clobbered.
	        part_name = part.getAttribute('pkg:name')
	        ctype = part.getAttribute('pkg:contentType')
	        print(f"\nPart {number}: {part_name} ({ctype})")
	        if not ctype.startswith('image/'):
	            print('Skipping (not an image)')
	        elif (comp := part.getAttribute('pkg:compression')) != 'store':
	            # The parentheses are required: `:=` binds more loosely than `!=`,
	            # so without them `comp` would hold the comparison result (True)
	            # rather than the attribute value shown in the message.
	            print(f'Skipping (compressed: {comp})')
	        else:
	            out_name = part_name.lstrip('/').replace('/', '_')
	            print(f'Extracting to {out_name}')
	            data = part.getElementsByTagName('pkg:binaryData')[0].firstChild.data
	            with open(out_name, 'wb') as f:
	                f.write(base64.b64decode(data))


	if __name__ == '__main__':
	    # Require exactly one argument so that a wrong invocation produces a
	    # usage message instead of a TypeError traceback from main().
	    if len(sys.argv) != 2:
	        sys.exit(f'Usage: {sys.argv[0]} WORD_XML_FILE')
	    main(sys.argv[1])
No results found