Last active
May 4, 2016 09:38
-
-
Save mitchellrj/8472092 to your computer and use it in GitHub Desktop.
CSS inliner
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
Take an HTML document and replace all CSS with inline styles, accounting
for all precedence rules. Requires cssutils, cssselect and lxml. Does not
work with pseudo-elements, @font-face, @page and CSS variables as these
cannot be represented by inline CSS.

Usage::

    inline_css(html_string, page_url, medium)

Where ``medium`` is of the type::

    {
        'type': 'screen',
        'width': '1024',
        'height': '768',
        'orientation': 'landscape',
        'device-width': '2048',
        'device-height': '1024',
        'aspect-ratio': '4/3',
        'device-aspect-ratio': '2/1',
        'color': '24',
        'color-index': '256',
        'monochrome': '0',
        'grid': False,
        'scan': '',
        'resolution': '300dpi',
        'pixel-ratio': '1'
    }
"""
from collections import OrderedDict | |
import fractions | |
import logging | |
import re | |
import urlparse | |
try: | |
from bs4 import BeautifulSoup | |
except ImportError: | |
from BeautifulSoup import BeautifulSoup | |
import cssutils | |
from lxml import etree | |
from lxml.cssselect import CSSSelector | |
from lxml.html import soupparser | |
logger = logging.getLogger(__name__) | |
def _compare_sheets_with_meta(sheet1, sheet2): | |
if sheet1[1]['embedded'] is True and sheet2[1]['embedded'] is False: | |
return 1 | |
elif sheet1[1]['embedded'] is False and sheet2[1]['embedded'] is True: | |
return -1 | |
return cmp(sheet1[1]['order'], sheet2[1]['order']) | |
def full_path(url, doc_url):
    """Resolve ``url`` (as found in the document) against ``doc_url``.

    :param url: an absolute, protocol-relative, root-relative or relative
        URL taken from a ``href`` / ``src`` attribute.
    :param doc_url: the URL of the document the reference appears in.
    :returns: ``url`` unchanged when it is already absolute (has both a
        scheme and a host), otherwise the absolute URL obtained by joining
        it onto ``doc_url``.
    """
    # Imported locally so the function works on both Python 2 and Python 3.
    try:
        from urllib.parse import urljoin, urlsplit
    except ImportError:  # Python 2
        from urlparse import urljoin, urlsplit

    parts = urlsplit(url)
    if parts.scheme and parts.netloc:
        return url
    # ``urljoin`` already discards the last path segment of the base, so the
    # document URL must be passed as-is: trimming it first would resolve
    # relative references one directory too high.  Protocol-relative URLs
    # (``//host/path``) pick up the document's scheme here as well.
    return urljoin(doc_url, url)
def _add_sheet(sheet, sheets_with_meta, n, embedded):
    """Register ``sheet`` and everything it ``@import``s, in cascade order.

    Imported sheets are registered (recursively) before the importing sheet
    so that they receive lower ``'order'`` numbers.

    :param sheet: the style sheet to register.
    :param sheets_with_meta: dict mapping each sheet to its
        ``{'order': int, 'embedded': bool}`` metadata; updated in place.
    :param n: the next free order number.
    :param embedded: whether ``sheet`` itself came from a ``<style>``
        element; imported sheets are always registered as not embedded.
    :returns: the next free order number after registration.
    """
    import_rules = sheet.cssRules.rulesOfType(cssutils.css.CSSRule.IMPORT_RULE)
    for rule in import_rules:
        subsheet = rule.styleSheet
        if subsheet is None:
            # NOTE(review): cssutils appears to leave ``styleSheet`` unset
            # when the imported resource could not be fetched or parsed;
            # skip it rather than crash on ``None.cssRules``.
            continue
        n = _add_sheet(subsheet, sheets_with_meta, n, False)
    sheets_with_meta[sheet] = {'order': n, 'embedded': embedded}
    return n + 1
def selectors_without_pseudo(style_rule):
    """Return the selectors of ``style_rule`` whose text contains no ``:``.

    Any selector with a colon in its text is left out; the remaining
    selectors are returned in their original order.
    """
    return [
        candidate
        for candidate in style_rule.selectorList
        if ':' not in candidate.selectorText
    ]
def parse_media_query(media_query):
    """Parse a CSS media query list into a tree that can be evaluated.

    :param media_query: the media query text, e.g.
        ``'only screen and (min-width: 800px), print'``.
    :returns: the root node of the parsed tree.  Call
        ``root.matches(test_medium)`` to evaluate it, where ``test_medium``
        is a mapping with keys such as ``'type'``, ``'width'`` and
        ``'orientation'``; ``str(root)`` renders the query back as text.
    :raises ValueError: if the query is malformed (a ``:`` that does not
        follow a media feature, a value where none is expected, or a
        missing value).

    Grammar handled here: comma-separated queries are alternatives; within
    one query all expressions are and-ed together; a leading ``not``
    negates the whole query, a ``not`` elsewhere negates the expression
    that follows it; ``only`` is ignored.  Unknown features are logged and
    treated as always matching.
    """

    class Grouping(object):
        """A parenthesised group; also the base class of And / Or / Not."""

        def __init__(self, parent, *args):
            self.parent = parent
            self.children = list(args)

        def add(self, arg):
            # ``children`` is a list, so this has to be ``append``.
            self.children.append(arg)

        def matches(self, test_medium):
            # An empty group places no constraint on the medium.
            if not self.children:
                return True
            return self.children[0].matches(test_medium)

        def __str__(self):
            if not self.children:
                return ''
            return '({child})'.format(child=self.children[0])

    class MediaExpression(object):
        """Base class of the leaf nodes (media types and media features)."""

    class MediaType(MediaExpression):
        """A media type keyword such as ``screen`` or ``print``."""

        def __init__(self, parent, value):
            self.parent = parent
            self.value = value

        def matches(self, test_medium):
            return self.value == 'all' or self.value == test_medium['type']

        def __str__(self):
            return self.value

    class MediaFeature(MediaExpression):
        """A ``(name)`` or ``(name: value)`` expression with integer values."""

        def __init__(self, parent, name, value=None):
            self.parent = parent
            self.name = name
            self.set_value(value)

        def set_value(self, value):
            if value is None:
                self.value = None
                self.units = None
            else:
                self.value = self.convert(value)
                self.units = self.get_units(value)

        def get_units(self, value):
            return re.sub(r'[0-9]', '', value.lower().strip())

        def convert(self, value):
            return int(re.sub(r'[^0-9]', '', value.lower().strip()))

        def matches(self, test_medium):
            if self.value is None:
                # Boolean context, e.g. ``(color)``: true when the medium's
                # value is non-zero.  Values arrive as strings, so '0' has
                # to be recognised as false explicitly.
                actual = test_medium[self.name]
                try:
                    return int(actual) != 0
                except (TypeError, ValueError):
                    return bool(actual)
            return self.value == self.convert(test_medium[self.name])

        def __str__(self):
            if self.value is None:
                return self.name
            return '{name}: {value}{units}'.format(
                name=self.name,
                value=self.value,
                units=(self.units or '')
            )

    class OrientationFeature(MediaFeature):
        """``(orientation: portrait|landscape)``."""

        # The inherited constructor is used so that ``name`` is set and the
        # value starts out empty until the ``: value`` part is parsed.

        def set_value(self, value):
            self.units = None
            if value is not None and value.lower() in ('portrait', 'landscape'):
                self.value = value.lower()
            else:
                self.value = None

        def matches(self, test_medium):
            if self.value is None:
                # No (valid) value to compare against: no constraint.
                return True
            actual = test_medium['orientation']
            if actual is None:
                return False
            return self.value == actual.lower()

    class ScanMediaFeature(MediaFeature):
        """``(scan: ...)``; values are compared as plain strings."""

        def get_units(self, value):
            return None

        def convert(self, value):
            return value

    class MinMaxMediaFeature(MediaFeature):
        """A feature that also accepts ``min-`` / ``max-`` prefixed names."""

        def matches(self, test_medium):
            if self.name.startswith('min-'):
                return self.value <= self.convert(test_medium[self.name[4:]])
            if self.name.startswith('max-'):
                return self.value >= self.convert(test_medium[self.name[4:]])
            # Start the lookup after *this* class so that
            # MediaFeature.matches is the method that gets called.
            return super(MinMaxMediaFeature, self).matches(test_medium)

    class UnknownMediaFeature(MediaFeature):
        """An unrecognised feature; kept verbatim and always matching."""

        def convert(self, value):
            return value

        def set_value(self, value):
            self.value = value
            self.units = None

        def matches(self, test_medium):
            return True

    class AspectRatioMediaFeature(MinMaxMediaFeature):
        """``(aspect-ratio: W/H)`` and its min-/max-/device- variants."""

        def convert(self, value):
            width, height = value.strip().split('/', 1)
            # An exact fraction: plain ``/`` truncates on Python 2.
            return fractions.Fraction(int(width), int(height))

        def get_units(self, value):
            return None

        def __str__(self):
            if self.value is None:
                return self.name
            return '{name}: {value}'.format(
                name=self.name,
                value=fractions.Fraction(self.value),
            )

    class ResolutionMediaFeature(MinMaxMediaFeature):
        """``(resolution: Ndpi|Ndpcm)``; dpcm values are converted to dpi."""

        CM_IN_INCH = 2.54

        def convert(self, value):
            int_value = super(ResolutionMediaFeature, self).convert(value)
            if self.get_units(value) == 'dpcm':
                int_value = int_value * self.CM_IN_INCH
            return int_value

    class Whitespace(object):
        """Marker for tokens that are skipped (``only``)."""

        def __init__(self, whitespace):
            pass

    class Value(object):
        """Marker for the value part of ``(feature: value)``."""

        def __init__(self, value):
            pass

    class And(Grouping):
        def matches(self, test_medium):
            for child in self.children:
                if not child.matches(test_medium):
                    return False
            return True

        def __str__(self):
            return ' and '.join(map(str, self.children))

    class Or(Grouping):
        def matches(self, test_medium):
            for child in self.children:
                if child.matches(test_medium):
                    return True
            return False

        def __str__(self):
            return ', '.join(map(str, self.children))

    class Not(Grouping):
        def matches(self, test_medium):
            return not self.children[0].matches(test_medium)

        def __str__(self):
            return 'not {query}'.format(query=self.children[0])

    class Assignment(object):
        """Marker for the ``:`` between a feature name and its value."""

        def __init__(self, assignment):
            pass

    tokens = {
        'only': Whitespace,
        'not': Not,
        ',': Or,
        'and': And,
        '(': Grouping,
        ')': Grouping,
        ':': Assignment,
        'all': MediaType,
        'aural': MediaType,
        'braille': MediaType,
        'handheld': MediaType,
        'print': MediaType,
        'projection': MediaType,
        'screen': MediaType,
        'tty': MediaType,
        'tv': MediaType,
        'embossed': MediaType,
        'width': MinMaxMediaFeature,
        'min-width': MinMaxMediaFeature,
        'max-width': MinMaxMediaFeature,
        'height': MinMaxMediaFeature,
        'min-height': MinMaxMediaFeature,
        'max-height': MinMaxMediaFeature,
        'device-width': MinMaxMediaFeature,
        'min-device-width': MinMaxMediaFeature,
        'max-device-width': MinMaxMediaFeature,
        'device-height': MinMaxMediaFeature,
        'min-device-height': MinMaxMediaFeature,
        'max-device-height': MinMaxMediaFeature,
        'aspect-ratio': AspectRatioMediaFeature,
        'min-aspect-ratio': AspectRatioMediaFeature,
        'max-aspect-ratio': AspectRatioMediaFeature,
        'device-aspect-ratio': AspectRatioMediaFeature,
        'min-device-aspect-ratio': AspectRatioMediaFeature,
        'max-device-aspect-ratio': AspectRatioMediaFeature,
        'color': MinMaxMediaFeature,
        'min-color': MinMaxMediaFeature,
        'max-color': MinMaxMediaFeature,
        'color-index': MinMaxMediaFeature,
        'min-color-index': MinMaxMediaFeature,
        'max-color-index': MinMaxMediaFeature,
        'monochrome': MinMaxMediaFeature,
        'min-monochrome': MinMaxMediaFeature,
        'max-monochrome': MinMaxMediaFeature,
        'resolution': ResolutionMediaFeature,
        'min-resolution': ResolutionMediaFeature,
        'max-resolution': ResolutionMediaFeature,
        'scan': ScanMediaFeature,
        'grid': MediaFeature,
        'orientation': OrientationFeature,
        'landscape': Value,
        'portrait': Value,
    }

    # Keywords must match whole identifiers only (so ``color`` does not fire
    # inside ``color-index`` or ``prefers-color-scheme``); longest first so
    # the most specific keyword wins when several start at the same place.
    words = sorted(
        (re.escape(key) for key in tokens if key[0].isalpha()),
        key=len,
        reverse=True,
    )
    token_string = (
        r'(\s+|[(),:]|(?<![\w-])(?:' + '|'.join(words) + r')(?![\w-]))'
    )

    def attach(container, node, negate):
        """Append ``node`` to ``container``, wrapped in ``Not`` if asked."""
        if negate:
            wrapper = Not(container, node)
            node.parent = wrapper
            node = wrapper
        container.children.append(node)

    # Tree shape: root -> Or (comma-separated queries) -> And (one query).
    root = Grouping(None)
    alternatives = Or(root)
    root.children.append(alternatives)
    query = And(alternatives)
    alternatives.children.append(query)
    stack = [query]        # innermost open conjunction is stack[-1]
    feature = None         # the expression a following ':' / value applies to
    negate_next = False    # a non-leading ``not`` is pending
    expect_value = False   # a ':' was seen; the next token is a value

    for token in re.split(token_string, media_query, flags=re.I):
        token = token.strip().lower()
        if not token:
            continue
        if token in tokens:
            cls = tokens[token]
        elif expect_value:
            cls = Value
        else:
            # unknown rule
            cls = UnknownMediaFeature
            logger.warning('Unsupported media rule: %r', token)

        if expect_value and cls not in (Whitespace, Value):
            raise ValueError(u'Expected value instead of {} in {}'.format(
                repr(token), repr(media_query)))

        if cls is Whitespace:
            continue
        if cls is Or:
            # A comma starts a fresh, independent query.
            query = And(alternatives)
            alternatives.children.append(query)
            stack = [query]
            feature = None
            negate_next = False
        elif cls is And:
            # Conjunction is implicit: the next expression is simply
            # appended to the innermost open conjunction.
            feature = None
        elif cls is Not:
            leading = (
                len(stack) == 1
                and not query.children
                and query.parent is alternatives
            )
            if leading:
                # ``not`` at the start of a query negates the whole query.
                negation = Not(alternatives, query)
                alternatives.children[-1] = negation
                query.parent = negation
            else:
                negate_next = True
        elif cls is Grouping:
            feature = None
            if token == '(':
                group = Grouping(stack[-1])
                inner = And(group)
                group.children.append(inner)
                attach(stack[-1], group, negate_next)
                negate_next = False
                stack.append(inner)
            elif len(stack) > 1:
                # An unmatched ')' is tolerated and ignored.
                stack.pop()
        elif issubclass(cls, MediaExpression):
            feature = cls(stack[-1], token)
            attach(stack[-1], feature, negate_next)
            negate_next = False
        elif cls is Assignment:
            if not isinstance(feature, MediaFeature):
                raise ValueError(
                    'Malformed query {}'.format(repr(media_query)))
            expect_value = True
        elif cls is Value:
            if not expect_value:
                raise ValueError(u'Unexpected %r in %r' % (token, media_query))
            feature.set_value(token)
            expect_value = False
    return root
def media_query_matches(media_query, display_medium):
    """Return whether ``media_query`` (text) applies to ``display_medium``.

    The query text is parsed with ``parse_media_query`` and the resulting
    tree is evaluated against the medium description.
    """
    return parse_media_query(media_query).matches(display_medium)
def parse_rules(rules, display_medium):
    """Flatten a cssutils rule list into ``(selectors, properties)`` pairs.

    :param rules: an iterable rule list (``sheet.cssRules`` or the
        ``cssRules`` of an ``@media`` rule).
    :param display_medium: the medium description used to decide which
        ``@media`` blocks apply.
    :returns: a list of ``(xpaths_and_specificity, properties)`` tuples,
        one per style rule, where ``xpaths_and_specificity`` is a list of
        ``(xpath, specificity)`` tuples for the rule's usable selectors.

    Rules are emitted in source order.  The order matters: when two
    declarations have equal priority and specificity, the one applied last
    wins, so hoisting every ``@media`` block ahead of the plain rules would
    let an earlier plain rule override a later media rule.
    """
    result = []
    for rule in rules:
        if rule.type == cssutils.css.CSSRule.MEDIA_RULE:
            if media_query_matches(rule.media.mediaText, display_medium):
                result.extend(parse_rules(rule.cssRules, display_medium))
        elif rule.type == cssutils.css.CSSRule.STYLE_RULE:
            xpaths_and_specificity = [
                (
                    '{selector}'.format(
                        selector=CSSSelector(selector.selectorText).path
                    ),
                    selector.specificity,
                )
                for selector in selectors_without_pseudo(rule)
            ]
            result.append(
                (xpaths_and_specificity, rule.style.getProperties())
            )
    return result
def parse_sheets(doc, doc_url, display_medium):
    """Collect the style rules of every style sheet that applies to ``doc``.

    :param doc: the parsed HTML document (an lxml element).
    :param doc_url: URL of the document, used to resolve sheet URLs and as
        the ``href`` of embedded sheets.
    :param display_medium: medium description; sheets whose ``media``
        attribute does not match it are skipped.
    :returns: the rules of all applicable sheets, as produced by
        ``parse_rules``, with non-embedded sheets first and embedded
        (``<style>``) sheets last, each group in registration order.
    """
    sheets_with_meta = {}
    n = 0
    for style in doc.getroottree().xpath('//link|//style'):
        if style.tag == 'link' and style.get('rel', '').lower() != 'stylesheet':
            continue
        if (
            style.tag == 'style'
            and style.get('type')
            and style.get('type') != 'text/css'
        ):
            continue
        media = style.get('media')
        title = style.get('title')
        if media and not media_query_matches(media, display_medium):
            continue

        embedded = False
        if style.tag == 'style':
            if style.get('src'):
                sheet = cssutils.parseUrl(
                    full_path(style.get('src'), doc_url),
                    media=media,
                    title=title)
            else:
                # An empty <style></style> has ``text`` set to None.
                sheet = cssutils.parseString(
                    style.text or '',
                    href=doc_url,
                    media=media,
                    title=title)
                embedded = True
        else:
            href = style.get('href')
            if not href:
                # A stylesheet link without a target has nothing to load.
                continue
            sheet = cssutils.parseUrl(
                full_path(href, doc_url),
                encoding=style.get('charset'),
                media=media,
                title=title)
        n = _add_sheet(sheet, sheets_with_meta, n, embedded)

    # Same ordering as ``_compare_sheets_with_meta`` (non-embedded before
    # embedded, then by order), expressed as a sort key so it runs on
    # Python 3 too.  The sorted list is iterated directly: converting it
    # back into a dict would throw the ordering away again.
    ordered = sorted(
        sheets_with_meta.items(),
        key=lambda item: (item[1]['embedded'], item[1]['order']),
    )
    rules = []
    for sheet, _meta in ordered:
        rules.extend(parse_rules(sheet.cssRules, display_medium))
    return rules
def apply_property_to_node(node, specificity, prop, current_style_state):
    """Record ``prop`` as a candidate declaration in ``current_style_state``.

    ``current_style_state`` maps a property name to a dict of candidates
    keyed by ``(is_important,) + specificity``; a candidate recorded later
    under the same key replaces the earlier one.  ``node`` is accepted for
    symmetry with the other helpers and is not used here.
    """
    rank = (int(bool(prop.priority)),) + specificity
    candidates = current_style_state.get(prop.name, None)
    if not candidates:
        candidates = {}
        current_style_state[prop.name] = candidates
    candidates[rank] = prop
def apply_properties_to_node(node, specificity, props, node_rules):
    """Record every declaration in ``props`` for ``node``.

    Each declaration is passed, in order, to ``apply_property_to_node``
    together with the shared ``specificity`` and the node's rule state.
    """
    for declaration in props:
        apply_property_to_node(node, specificity, declaration, node_rules)
# Sentinel used to tell "attribute is missing" apart from "attribute is None".
_marker = object()


def _property_source(prop):
    """Return the file name of the sheet ``prop`` belongs to, or 'inline'.

    Walks up from the property through ``parentStyleSheet`` / ``parent`` /
    ``parentRule`` until a ``CSSStyleSheet`` (or a falsy value) is reached.
    """
    source_element = prop
    while source_element and not isinstance(source_element, cssutils.css.CSSStyleSheet):
        if getattr(source_element, 'parentStyleSheet', _marker) is not _marker:
            source_element = source_element.parentStyleSheet
        elif getattr(source_element, 'parent', _marker) is not _marker:
            source_element = source_element.parent
        else:
            source_element = source_element.parentRule
    # A sheet parsed without an ``href`` has nothing to report either.
    if source_element and source_element.href:
        return source_element.href.rsplit('/', 1)[-1]
    return 'inline'


def finalize_style(node, node_rule_map, include_source=False):
    """Write the winning declarations into ``style`` attributes, recursively.

    :param node: the element to finalise; its descendants are processed too.
    :param node_rule_map: dict mapping elements to their candidate
        declarations (an ``OrderedDict`` of property name to
        ``{specificity_key: property}``).  Missing entries are created.
    :param include_source: when true, each declaration is followed by a
        ``/* source */`` comment naming the sheet it came from.

    An existing ``style`` attribute is merged in with specificity
    ``(1, 0, 0, 0)``.  For every property the candidate with the highest
    specificity key wins.
    """
    node_rules = node_rule_map.setdefault(node, OrderedDict())
    inline_text = node.get('style')
    if inline_text:
        original_inline_style = cssutils.parseStyle(inline_text)
        del node.attrib['style']
        for prop in original_inline_style.getProperties():
            apply_property_to_node(node, (1, 0, 0, 0), prop, node_rules)

    winners = [
        candidates[max(candidates)]
        for candidates in node_rules.values()
    ]
    if winners:
        pieces = []
        for prop in winners:
            # Only resolve the originating sheet when it will be printed.
            if include_source:
                annotation = '/* {} */'.format(_property_source(prop))
            else:
                annotation = ''
            pieces.append('\n{css};{source}'.format(
                css=prop.cssText,
                source=annotation
            ))
        node.attrib['style'] = ''.join(pieces)

    for child in node:
        # Pass the flag down so descendants are annotated as well.
        finalize_style(child, node_rule_map, include_source)
def apply_rules_to_node(node, rules):
    """Match every rule against the tree of ``node`` and collect candidates.

    :param node: any element of the document; rules are matched against
        its whole tree.
    :param rules: list of ``(xpaths_and_specificity, properties)`` tuples.
    :returns: dict mapping each matched element to an ``OrderedDict`` of
        candidate declarations, filled via ``apply_properties_to_node``.
    """
    styles_by_element = {}
    document = node.getroottree()
    for selector_group, declarations in rules:
        for expression, weight in selector_group:
            for element in document.xpath(expression):
                bucket = styles_by_element.setdefault(element, OrderedDict())
                apply_properties_to_node(element, weight, declarations, bucket)
    return styles_by_element
def remove_sheets(doc):
    """Remove ``<style>`` elements and stylesheet ``<link>``s from ``doc``.

    ``<link>`` elements whose ``rel`` is not ``stylesheet`` (icons,
    canonical URLs, feeds, ...) are left in place.  Text that follows a
    removed element is kept.
    """
    for style in doc.getroottree().xpath('//style|//link'):
        if style.tag == 'link' and style.get('rel', '').lower() != 'stylesheet':
            continue
        parent = style.getparent()
        if parent is None:
            continue
        # lxml stores the text after an element on the element itself
        # (``tail``); removing the element would drop that text, so move it
        # to the preceding sibling or to the parent first.
        if style.tail:
            previous = style.getprevious()
            if previous is not None:
                previous.tail = (previous.tail or '') + style.tail
            else:
                parent.text = (parent.text or '') + style.tail
        parent.remove(style)
def inline_css(html, doc_url, medium=None):
    """Return ``html`` with its style sheets folded into ``style`` attributes.

    :param html: the HTML source text.
    :param doc_url: URL of the document, used to resolve style sheet URLs.
    :param medium: medium description used to evaluate media queries.
    :returns: the prettified HTML produced by BeautifulSoup.

    Steps: parse the document, gather the applicable rules, drop the sheet
    elements, match the rules against the tree, write the winning
    declarations inline and serialise the result.
    """
    document = soupparser.fromstring(html)
    collected_rules = parse_sheets(document, doc_url, medium)
    remove_sheets(document)
    finalize_style(document, apply_rules_to_node(document, collected_rules))
    serialized = etree.tostring(document)
    return BeautifulSoup(serialized, 'lxml').prettify()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment