Last active
August 29, 2015 14:23
-
-
Save winhamwr/9dbf16a5a73759e35c43 to your computer and use it in GitHub Desktop.
pydocx image handler for resizing images to their displayed size
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import base64 | |
import logging | |
import cgi | |
import logging | |
import os | |
import posixpath | |
import re | |
import subprocess | |
import time | |
from tempfile import NamedTemporaryFile | |
from urlparse import unquote, urlparse | |
from StringIO import StringIO | |
from PIL import Image | |
from pydocx.export import PyDocXHTMLExporter | |
from pstat.misc.get_image_from_src import get_image_from_src, is_encoded_image | |
from pstat.misc.replace_extension import replace_extension | |
# Lower-case filename extensions of images that are dropped from the output
# entirely instead of being processed.
IMAGE_EXTENSIONS_TO_SKIP = ['emf', 'wmf', 'svg']

# PIL format names that are re-encoded as GIF when the image is saved.
IMAGE_FORMATS_TO_GIF_COMPRESS = ['BMP', 'TIFF']

logger = logging.getLogger('pstat.misc.image')
class PstatDocx2Html(PyDocXHTMLExporter):
    """
    HTML exporter that pre-processes every image before delegating to the
    stock ``PyDocXHTMLExporter.image`` implementation.
    """

    def image(
            self,
            image_data,
            filename,
            x,
            y,
            uri_is_external,
            *args,
            **kwargs):
        """
        Render a single image, resizing it to its requested dimensions first.

        When ``uri_is_external`` is true, ``image_data`` and ``filename`` are
        first resolved through ``get_image_data_and_filename``. The image is
        then wrapped in a ``PstatImage``; an empty string is returned (i.e.
        the image is dropped) when its extension is one of
        ``IMAGE_EXTENSIONS_TO_SKIP`` or when either dimension is missing.
        Otherwise the image is opened, resized, renamed to match its final
        format and passed on to the parent exporter together with any extra
        positional/keyword arguments.
        """
        if uri_is_external:
            image_data, filename = get_image_data_and_filename(
                image_data,
                filename,
            )

        img = PstatImage(image_data, filename, x, y)

        # Both checks mean "do not render this image at all".
        if img.has_skipable_extension() or not img.has_height_and_width():
            return ''

        img.prime_image()
        img.resize_image()
        img.update_filename()

        parent = super(PstatDocx2Html, self)
        return parent.image(
            img.image_data,
            img.filename,
            img.x,
            img.y,
            uri_is_external,
            *args,
            **kwargs)
class PstatImage(object):
    """
    Holder for one image's data, filename and requested display size.

    Attributes:
        image_data: The image content as supplied by the caller; replaced
            with the re-encoded bytes after a successful ``resize_image``.
        filename: The image's filename (may be empty/``None``).
        x, y: Requested width and height as integers (0 when unknown).
        image_format: PIL format name of ``image_data`` once known.
        image: The opened PIL image, or ``None`` if it could not be opened.
    """

    def __init__(self, image_data, filename, x, y):
        self.image_data = image_data
        self.filename = filename
        self.x = self._get_dimension(x)
        self.y = self._get_dimension(y)
        self.image_format = None
        self.image = None

    def has_skipable_extension(self):
        """
        Return True when the filename's extension is listed in
        ``IMAGE_EXTENSIONS_TO_SKIP`` (compared case-insensitively).

        A filename without any ``.`` has no extension and is never skipped.
        """
        if not self.filename:
            return False
        _, dot, extension = self.filename.lower().rpartition('.')
        # ``dot`` is empty when the name contains no '.', in which case the
        # whole name would otherwise be mistaken for the extension.
        return bool(dot) and extension in IMAGE_EXTENSIONS_TO_SKIP

    def has_height_and_width(self):
        """Return True only when both dimensions are non-zero."""
        return bool(self.x and self.y)

    def _get_dimension(self, dim):
        """
        Convert a dimension such as ``'100px'`` (or a plain number) to an
        int.

        Empty values and values that cannot be converted yield 0; the
        latter also log a warning.
        """
        if not dim:
            return 0
        try:
            value = dim.strip('px')
        except AttributeError:
            # Not a string (e.g. already an int); convert it directly.
            value = dim
        try:
            return int(value)
        except (TypeError, ValueError):
            logger.warning('Unable to convert size: "%s"', dim)
            return 0

    def prime_image(self):
        """
        Try to open ``image_data`` with PIL and store the result in
        ``self.image``.

        Base64 data URIs are decoded first. Any failure is logged and
        leaves ``self.image`` as ``None``; ``image_data`` is never modified
        here.
        """
        image_data = self.image_data
        match = is_encoded_image(image_data)
        if match:
            try:
                image_data = base64.b64decode(match.group('image_data'))
            except (TypeError, ValueError):
                # Malformed base64 (TypeError on Python 2, binascii.Error,
                # a ValueError subclass, on Python 3).
                logger.warning('Not able to decode image')
                return
        try:
            self.image = Image.open(StringIO(image_data))
        except (IOError, SystemError):
            # PIL can't open it, keep the image_data as is.
            logger.warning('Not able to open image')

    def resize_image(self):
        """
        Shrink the opened image to ``(x, y)`` and re-encode it.

        Nothing happens for base64 data URIs or when the image was never
        opened. BMP/TIFF images (``IMAGE_FORMATS_TO_GIF_COMPRESS``) are
        saved as GIF. ``image_data`` and ``image_format`` are only updated
        to the new encoding when saving succeeds, so the two always agree.
        """
        # Let's not resize a base64 encoded image.
        if is_encoded_image(self.image_data):
            return
        if not self.image:
            return

        source_format = self.image.format
        self.image_format = source_format

        requested_size = (self.x, self.y)
        requested_area = self.x * self.y
        actual_width, actual_height = self.image.size
        actual_area = actual_width * actual_height

        # Only ever shrink: resizing must never produce more pixels than
        # the original image has.
        if requested_area < actual_area and requested_size != self.image.size:
            try:
                self.image = self.image.resize(
                    requested_size,
                    Image.ANTIALIAS,
                )
            except (IOError, SystemError):
                # Image can't be resized, such is life.
                logger.warning('Unable to resize')

        output_format = source_format
        if source_format in IMAGE_FORMATS_TO_GIF_COMPRESS:
            output_format = 'GIF'

        output = StringIO()
        try:
            self.image.save(output, output_format)
        except (IOError, SystemError):
            # PIL can't save this image; image_data still holds the
            # original bytes, so image_format must stay the source format.
            logger.warning('Unable to save image')
            return
        self.image_data = output.getvalue()
        self.image_format = output_format

    def update_filename(self):
        """
        Give ``filename`` the extension matching ``image_format``.

        Does nothing when either the format or the filename is unknown.
        """
        if not self.image_format or not self.filename:
            return
        self.filename = replace_extension(
            self.filename,
            self.image_format.lower(),
        )
def get_image_data_and_filename(image_data, filename):
    """
    Resolve an external image reference into real image data and a filename.

    For an external image, ``image_data`` is actually a link to the image
    and ``filename`` is likely garbage. The filename is therefore taken from
    the last path segment of the link and sanitized (it is ``None`` when the
    link is a base64 data URI), and the data is obtained through
    ``get_image_from_src``. If that yields ``None`` the original arguments
    are returned untouched.
    """
    url_path = urlparse(image_data).path
    basename = posixpath.basename(url_path)

    if is_encoded_image(image_data):
        clean_filename = None
    else:
        clean_filename = sanitize_filename(basename)

    fetched_data = get_image_from_src(image_data)
    if fetched_data is not None:
        return fetched_data, clean_filename
    return image_data, filename
# (timestamp)-image(image_number).(file_extension), matched against the
# whole filename so that only names we generated ourselves are rewritten.
_TIMESTAMPED_IMAGE_RE = re.compile(r'\d{10}-image\d+\.\w{3,4}\Z')


def sanitize_filename(filename):
    """
    Strip a programmatically added timestamp prefix and URL-unquote the name.

    When we create attachments from pydocx we usually add a timestamp
    followed by a dash (-) to make the image unique for round-tripping. In
    an effort to prevent a bunch of timestamps preceding the image name (in
    the event a document is round-tripped several times), strip off the
    timestamp and dash. When images come from docx they are always
    ``image<number>``. We only want to strip off the timestamp and dash if
    they were programmatically added, so the entire filename has to match
    that shape.

    >>> sanitize_filename('1409764011-image1.gif')
    'image1.gif'
    >>> sanitize_filename('409764011-image1.gif')
    '409764011-image1.gif'
    >>> sanitize_filename('1409764011-image.gif')
    '1409764011-image.gif'
    >>> sanitize_filename('1409764011-image1.gif-extra.png')
    '1409764011-image1.gif-extra.png'
    >>> sanitize_filename('image%20%232014.gif')
    'image #2014.gif'
    """
    if _TIMESTAMPED_IMAGE_RE.match(filename):
        # Drop everything up to and including the first dash (the timestamp).
        _, _, filename = filename.partition('-')
    return unquote(filename)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
import re | |
import requests | |
from requests.exceptions import InvalidSchema | |
# Matches a base64 ``data:image/...`` URI at the start of a string. The
# ``extension`` group is the declared image subtype and ``image_data`` is the
# base64 payload.
data_uri_regex = re.compile(
    r'data:image/(?P<extension>\w+);base64,(?P<image_data>.+)',
)


def is_encoded_image(image_data):
    """
    Return the regex match if ``image_data`` is a base64 image data URI.

    The returned match object exposes the ``extension`` and ``image_data``
    groups. ``None`` is returned when the value is not a data URI, including
    when it is empty or ``None``.
    """
    if not image_data:
        # Nothing to inspect; avoids a TypeError from matching against None.
        return None
    return data_uri_regex.match(image_data)
def get_image_from_src(src, timeout=30):
    """
    Take a src attribute from an image tag and return the image data
    associated with that image. At the minimum we handle http(s):// links
    and base64 encoded images.

    Args:
        src: The value of the image's ``src`` attribute.
        timeout: Seconds to wait for the remote server, passed straight to
            ``requests.get`` so a stalled server cannot hang the caller.

    Returns:
        The response body for a link that was fetched successfully;
        ``src`` itself when it is a base64 data URI or something that
        cannot be requested at all (unknown/missing scheme, invalid URL);
        ``None`` when ``src`` is empty or the server answered with an error
        status, so callers can fall back to what they already have.

    Raises:
        requests.exceptions.RequestException: For network-level failures
            (connection errors, timeouts).
    """
    if not src:
        return None

    # Base64 encoded images already carry their data; no request needed.
    if is_encoded_image(src):
        return src

    try:
        response = requests.get(src, timeout=timeout)
    except (
        InvalidSchema,
        requests.exceptions.MissingSchema,
        requests.exceptions.InvalidURL,
    ):
        # Not something requests can fetch. Not really sure what is going
        # on here, punt for now.
        return src

    if not response.ok:
        # An error page is not image data.
        return None
    return response.content
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
import os | |
def replace_extension(file_path, new_ext):
    """
    Return ``file_path`` with its extension replaced by ``new_ext``.

    ``new_ext`` may be given with or without the leading separator. Only
    the extension of the final path component is considered (as determined
    by ``os.path.splitext``); when there is none, ``new_ext`` is appended.

    >>> replace_extension('one/two/three.four.doc', '.html')
    'one/two/three.four.html'
    >>> replace_extension('one/two/three.four.DOC', '.html')
    'one/two/three.four.html'
    >>> replace_extension('one/two/three.four.DOC', 'html')
    'one/two/three.four.html'
    >>> replace_extension('README', 'html')
    'README.html'
    >>> replace_extension('one.two/three', 'html')
    'one.two/three.html'
    """
    if not new_ext.startswith(os.extsep):
        new_ext = os.extsep + new_ext
    # splitext leaves the path intact when there is no extension, unlike
    # slicing at rfind() which would chop off the last character.
    root, _ = os.path.splitext(file_path)
    return root + new_ext
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment