Skip to content

Instantly share code, notes, and snippets.

@mitchellrj
Last active May 4, 2016 09:38
Show Gist options
  • Save mitchellrj/8472092 to your computer and use it in GitHub Desktop.
Save mitchellrj/8472092 to your computer and use it in GitHub Desktop.
CSS inliner
"""
Take an HTML document and replace all CSS with inline styles, accounting
for all precedence rules. Requires cssutils, cssselect, lxml and
BeautifulSoup. Does not work with pseudo-elements, @font-face, @page and
CSS variables, as these cannot be represented by inline CSS.
Usage::
inline_css(html_string, page_url, medium)
Where ``medium`` is of the type::
{
'type': 'screen',
'width': '1024',
'height': '768',
'orientation': 'landscape',
'device-width': '2048',
'device-height': '1024',
'aspect-ratio': '4/3',
'device-aspect-ratio': '2/1',
'color': '24',
'color-index': '256',
'monochrome': '0',
'grid': False,
'scan': '',
'resolution': '300dpi',
'pixel-ratio': '1'
}
"""
from collections import OrderedDict
import fractions
import functools
import logging
import re
import urlparse

try:
    from bs4 import BeautifulSoup
except ImportError:
    from BeautifulSoup import BeautifulSoup
import cssutils
from lxml import etree
from lxml.cssselect import CSSSelector
from lxml.html import soupparser
logger = logging.getLogger(__name__)
def _compare_sheets_with_meta(sheet1, sheet2):
if sheet1[1]['embedded'] is True and sheet2[1]['embedded'] is False:
return 1
elif sheet1[1]['embedded'] is False and sheet2[1]['embedded'] is True:
return -1
return cmp(sheet1[1]['order'], sheet2[1]['order'])
def full_path(url, doc_url):
    """Resolve ``url`` against the URL of the document that references it.

    :param url: stylesheet reference as written in the document; may be
        relative, root-relative, protocol-relative or absolute.
    :param doc_url: URL of the referencing document.
    :returns: the absolute URL.
    """
    # ``urljoin`` already resolves relative to the *directory* of its base
    # URL, so the document URL must be passed unmodified: stripping its last
    # segment first made relative references resolve one level too high.
    # It also returns absolute URLs unchanged and gives protocol-relative
    # ones (``//host/x.css``) the scheme of the document.
    return urlparse.urljoin(doc_url, url)
def _add_sheet(sheet, sheets_with_meta, n, embedded):
    """Record ``sheet`` and, before it, every sheet it pulls in via ``@import``.

    :param sheet: parsed stylesheet exposing ``cssRules.rulesOfType``.
    :param sheets_with_meta: dict updated in place; maps each sheet to
        ``{'order': <int>, 'embedded': <bool>}``.
    :param n: first free order number.
    :param embedded: flag stored for ``sheet`` itself; imported sheets are
        always stored with ``False``.
    :returns: the next free order number.
    """
    next_order = n
    imported = sheet.cssRules.rulesOfType(cssutils.css.CSSRule.IMPORT_RULE)
    for import_rule in imported:
        # Depth first: imported sheets receive lower order numbers than the
        # sheet that imports them.
        next_order = _add_sheet(
            import_rule.styleSheet, sheets_with_meta, next_order, False
        )
    sheets_with_meta[sheet] = {'order': next_order, 'embedded': embedded}
    return next_order + 1
def selectors_without_pseudo(style_rule):
    """Return the selectors of ``style_rule`` that use no pseudo selector.

    A selector counts as "pseudo" when a colon remains in its text after
    quoted strings and backslash escapes have been removed, so a colon
    inside an attribute value (``a[href^="http://example.com"]``) no longer
    disqualifies the selector.

    :param style_rule: object whose ``selectorList`` yields selectors with
        a ``selectorText`` attribute.
    :returns: list of the selector objects that were kept, in order.
    """
    quoted_or_escaped = r'"(?:[^"\\]|\\.)*"|\'(?:[^\'\\]|\\.)*\'|\\.'
    non_pseudo_selectors = []
    for selector in style_rule.selectorList:
        bare_text = re.sub(quoted_or_escaped, '', selector.selectorText)
        if ':' not in bare_text:
            non_pseudo_selectors.append(selector)
    return non_pseudo_selectors
def parse_media_query(media_query):
    """Parse a CSS media query list into a tree that can be matched.

    The grammar handled is the CSS3 one: a comma separated list of queries,
    each an optional ``only`` / ``not`` prefix followed by media types and
    parenthesised ``(feature)`` / ``(feature: value)`` expressions joined
    with ``and``. A leading ``not`` negates the whole query it starts;
    ``only`` is ignored. Unknown words become features that always match
    (a warning is logged for each).

    :param media_query: query text, for example
        ``'screen and (min-width: 800px), print'``.
    :returns: the root node. ``root.matches(medium)`` evaluates the query
        against a medium dict (keys are the un-prefixed feature names plus
        ``'type'``); ``str(root)`` renders the parsed tree. An empty query
        matches every medium.
    :raises ValueError: if the query is malformed (missing value, or a
        stray ``:``, ``)`` or ``not``).
    """
    # Leading number followed by an optional unit, e.g. "1.5dppx".
    number_pattern = re.compile(
        r'\s*([-+]?(?:\d+(?:\.\d*)?|\.\d+))\s*(.*?)\s*$', re.S
    )
    # Whitespace runs, single punctuation characters, or whole words. Words
    # are looked up afterwards, so a short name can never be matched inside
    # a longer one ("color" inside "color-index", "and" inside a value).
    token_pattern = re.compile(r'\s+|[(),:]|[^\s(),:]+')

    def is_set(raw):
        """Evaluate a medium value in boolean context, e.g. ``(color)``."""
        if not hasattr(raw, 'strip'):
            return bool(raw)
        match = number_pattern.match(raw)
        if match is None:
            return bool(raw.strip())
        # A numeric zero such as '0' means "feature not available".
        return float(match.group(1)) != 0

    class Grouping(object):
        """Container node; the plain class is the root or a ``( ... )``."""

        def __init__(self, parent, *args):
            self.parent = parent
            self.children = list(args)

        def add(self, arg):
            self.children.append(arg)

        def matches(self, test_medium):
            # An empty group places no constraint on the medium.
            return all(child.matches(test_medium) for child in self.children)

        def __str__(self):
            if not self.children:
                return ''
            return '({child})'.format(
                child=' and '.join(map(str, self.children))
            )

    class MediaExpression(object):
        """Marker base class for leaf nodes (media types and features)."""

    class MediaType(MediaExpression):

        def __init__(self, parent, value):
            self.parent = parent
            self.value = value

        def matches(self, test_medium):
            return self.value == 'all' or self.value == test_medium['type']

        def __str__(self):
            return self.value

    class MediaFeature(MediaExpression):
        """Feature compared for equality against the medium value."""

        def __init__(self, parent, name, value=None):
            self.parent = parent
            self.name = name
            self.set_value(value)

        def set_value(self, value):
            if value is None:
                self.value = None
                self.units = None
            else:
                self.value = self.convert(value)
                self.units = self.get_units(value)

        def get_units(self, value):
            match = number_pattern.match(value.lower())
            if match is None:
                return value.lower().strip()
            return match.group(2)

        def convert(self, value):
            # Medium dicts may carry real booleans or numbers.
            if not hasattr(value, 'lower'):
                return value
            match = number_pattern.match(value.lower())
            if match is None:
                raise ValueError(u'Invalid value {} for {}'.format(
                    repr(value), repr(self.name)))
            number = match.group(1)
            return float(number) if '.' in number else int(number)

        def matches(self, test_medium):
            actual = test_medium[self.name]
            if self.value is None:
                return is_set(actual)
            return self.value == self.convert(actual)

        def __str__(self):
            if self.value is None:
                return self.name
            return '{name}: {value}{units}'.format(
                name=self.name,
                value=self.value,
                units=(self.units or '')
            )

    class OrientationFeature(MediaFeature):

        def set_value(self, value):
            self.units = None
            if value is not None and value.lower() in ('portrait',
                                                       'landscape'):
                self.value = value.lower()
            else:
                self.value = None

        def matches(self, test_medium):
            actual = test_medium['orientation']
            if self.value is None:
                # No recognised value: the feature places no constraint.
                return True
            if actual is None:
                return False
            return self.value == actual.lower()

    class ScanMediaFeature(MediaFeature):

        def get_units(self, value):
            return None

        def convert(self, value):
            return value

    class MinMaxMediaFeature(MediaFeature):
        """Feature that also exists in ``min-`` and ``max-`` variants."""

        def matches(self, test_medium):
            prefix = self.name[:4]
            ranged = prefix in ('min-', 'max-')
            # The medium is keyed by the un-prefixed feature name.
            actual = test_medium[self.name[4:] if ranged else self.name]
            if self.value is None:
                return is_set(actual)
            actual = self.convert(actual)
            if prefix == 'min-':
                return self.value <= actual
            if prefix == 'max-':
                return self.value >= actual
            return self.value == actual

    class UnknownMediaFeature(MediaFeature):

        def convert(self, value):
            return value

        def set_value(self, value):
            self.value = value
            self.units = None

        def matches(self, test_medium):
            return True

    class AspectRatioMediaFeature(MinMaxMediaFeature):

        def convert(self, value):
            # Exact rational arithmetic; plain ``int / int`` truncates on
            # Python 2 and made e.g. 4/3 and 5/4 compare equal.
            width, _, height = value.strip().partition('/')
            return (fractions.Fraction(width.strip())
                    / fractions.Fraction(height.strip() or 1))

        def get_units(self, value):
            return None

        def __str__(self):
            if self.value is None:
                return self.name
            return '{name}: {value}'.format(
                name=self.name,
                value='{}/{}'.format(self.value.numerator,
                                     self.value.denominator),
            )

    class ResolutionMediaFeature(MinMaxMediaFeature):
        # Values are normalised to dots per inch before comparison.
        CM_IN_INCH = 2.54
        DPI_PER_DPPX = 96

        def convert(self, value):
            number = super(ResolutionMediaFeature, self).convert(value)
            if not hasattr(value, 'lower'):
                return number
            units = self.get_units(value)
            if units == 'dpcm':
                number = number * self.CM_IN_INCH
            elif units == 'dppx':
                number = number * self.DPI_PER_DPPX
            return number

    class And(Grouping):

        def matches(self, test_medium):
            return all(child.matches(test_medium) for child in self.children)

        def __str__(self):
            return ' and '.join(map(str, self.children))

    class Or(Grouping):

        def matches(self, test_medium):
            return any(child.matches(test_medium) for child in self.children)

        def __str__(self):
            return ', '.join(map(str, self.children))

    class Not(Grouping):

        def matches(self, test_medium):
            return not self.children[0].matches(test_medium)

        def __str__(self):
            return 'not {query}'.format(query=self.children[0])

    media_types = frozenset((
        'all', 'aural', 'braille', 'handheld', 'print', 'projection',
        'screen', 'tty', 'tv', 'embossed',
    ))
    ranged_features = {
        'width': MinMaxMediaFeature,
        'height': MinMaxMediaFeature,
        'device-width': MinMaxMediaFeature,
        'device-height': MinMaxMediaFeature,
        'aspect-ratio': AspectRatioMediaFeature,
        'device-aspect-ratio': AspectRatioMediaFeature,
        'color': MinMaxMediaFeature,
        'color-index': MinMaxMediaFeature,
        'monochrome': MinMaxMediaFeature,
        'resolution': ResolutionMediaFeature,
    }
    features = {
        'scan': ScanMediaFeature,
        'grid': MediaFeature,
        'orientation': OrientationFeature,
    }
    # Generate the min-/max- variants instead of spelling them out, which
    # is how "max-color-index" went missing from the hand-written table.
    for base_name, feature_cls in ranged_features.items():
        for prefix in ('', 'min-', 'max-'):
            features[prefix + base_name] = feature_cls

    root = Grouping(None)
    queries = Or(root)    # the comma separated alternatives
    container = None      # node receiving the next term (And or Grouping)
    feature = None        # feature created by the previous token, if any
    pending = None        # feature waiting for the value after ':'
    value_parts = []
    negated = False       # a leading 'not' was seen for the coming query

    for token in token_pattern.findall(media_query.lower()):
        if token.isspace():
            continue

        if pending is not None:
            if token not in ('(', ')', ',', ':', 'and'):
                # Values may span several tokens ("16 / 9").
                value_parts.append(token)
                continue
            if not value_parts:
                raise ValueError(u'Expected value instead of {} in {}'.format(
                    repr(token), repr(media_query)))
            pending.set_value(''.join(value_parts))
            pending, value_parts = None, []
            # Fall through: the terminating token is handled below.

        previous, feature = feature, None
        if token == ':':
            if previous is None:
                raise ValueError(
                    'Malformed query {}'.format(repr(media_query)))
            pending = previous
        elif token == ',':
            container, negated = None, False
        elif token in ('only', 'and'):
            # 'only' carries no meaning here and terms inside one query are
            # combined with And regardless, so both are separators.
            pass
        elif token == 'not':
            if container is not None:
                raise ValueError(
                    u'Unexpected %r in %r' % (token, media_query))
            negated = True
        elif token == ')':
            if container is None or container.__class__ is not Grouping:
                raise ValueError(
                    u'Unexpected %r in %r' % (token, media_query))
            container = container.parent
        else:
            if container is None:
                # First term of a query: open its And node, wrapped in a
                # Not when the query started with 'not'.
                container = And(queries)
                if negated:
                    queries.add(Not(queries, container))
                    container.parent = queries.children[-1]
                else:
                    queries.add(container)
                negated = False
            if token == '(':
                group = Grouping(container)
                container.add(group)
                container = group
            elif token in media_types:
                container.add(MediaType(container, token))
            else:
                cls = features.get(token)
                if cls is None:
                    # unknown rule
                    cls = UnknownMediaFeature
                    logger.warning(
                        'Unsupported media rule: {}'.format(repr(token)))
                feature = cls(container, token)
                container.add(feature)

    if pending is not None:
        if not value_parts:
            raise ValueError(u'Expected value instead of {} in {}'.format(
                repr(''), repr(media_query)))
        pending.set_value(''.join(value_parts))

    if queries.children:
        root.add(queries)
    return root
def media_query_matches(media_query, display_medium):
    """Tell whether ``media_query`` applies to ``display_medium``.

    :param media_query: media query text, handed to
        :func:`parse_media_query`.
    :param display_medium: medium description passed to the parsed
        query's ``matches`` method.
    :returns: whatever ``matches`` returns for the parsed query.
    """
    return parse_media_query(media_query).matches(display_medium)
def parse_rules(rules, display_medium):
    """Flatten a cssutils rule list into XPath / declaration pairs.

    Rules are visited in source order. The contents of an ``@media`` rule
    are included, recursively and in place, when its query matches
    ``display_medium``. Handling all ``@media`` rules first (as an earlier
    version did) moved them ahead of plain rules that precede them in the
    sheet, so an override of equal specificity inside a later ``@media``
    block could never win.

    :param rules: iterable of cssutils rules (e.g. ``sheet.cssRules``).
    :param display_medium: medium dict used to evaluate ``@media`` queries.
    :returns: list of ``(xpaths_and_specificity, properties)`` tuples, one
        per style rule, where ``xpaths_and_specificity`` is a list of
        ``(xpath, specificity)`` pairs for the rule's selectors that pass
        :func:`selectors_without_pseudo`.
    """
    result = []
    for rule in rules:
        if rule.type == cssutils.css.CSSRule.MEDIA_RULE:
            if media_query_matches(rule.media.mediaText, display_medium):
                result.extend(parse_rules(rule.cssRules, display_medium))
        elif rule.type == cssutils.css.CSSRule.STYLE_RULE:
            xpaths_and_specificity = [
                (CSSSelector(selector.selectorText).path,
                 selector.specificity)
                for selector in selectors_without_pseudo(rule)
            ]
            result.append(
                (xpaths_and_specificity, rule.style.getProperties())
            )
        # Every other rule type (comments, @import, @font-face, ...) is
        # ignored here.
    return result
def parse_sheets(doc, doc_url, display_medium):
    """Collect the style rules of every stylesheet that applies to ``doc``.

    ``<link rel="stylesheet">`` and ``<style>`` elements are parsed with
    cssutils (following ``@import``), skipping those whose ``media``
    attribute does not match ``display_medium``. The sheets are then
    ordered with :func:`_compare_sheets_with_meta` and their rules
    flattened with :func:`parse_rules`.

    :param doc: lxml element of the parsed document.
    :param doc_url: URL of the document; used to resolve stylesheet URLs.
    :param display_medium: medium dict used to evaluate media queries.
    :returns: list of rule tuples as produced by :func:`parse_rules`.
    """
    sheets_with_meta = {}
    n = 0
    for style in doc.getroottree().xpath('//link|//style'):
        if style.tag == 'link' and style.get('rel', '').lower() != 'stylesheet':
            continue
        if (
            style.tag == 'style'
            and style.get('type')
            and style.get('type') != 'text/css'
        ):
            continue
        media = style.get('media')
        title = style.get('title')
        if media and not media_query_matches(media, display_medium):
            continue

        embedded = False
        if style.tag == 'style' and style.get('src'):
            sheet = cssutils.parseUrl(full_path(style.get('src'), doc_url),
                                      media=media,
                                      title=title)
        elif style.tag == 'style':
            # ``text`` is None for an empty <style></style> element.
            sheet = cssutils.parseString(style.text or '',
                                         href=doc_url,
                                         media=media,
                                         title=title)
            embedded = True
        else:
            href = style.get('href')
            if not href:
                # A <link> without a target has nothing to load.
                continue
            sheet = cssutils.parseUrl(full_path(href, doc_url),
                                      encoding=style.get('charset'),
                                      media=media,
                                      title=title)
        if sheet is None:
            # NOTE(review): cssutils.parseUrl appears to return None when
            # the URL cannot be read -- confirm against the cssutils docs.
            logger.warning('Could not load stylesheet for <%s> element',
                           style.tag)
            continue
        n = _add_sheet(sheet, sheets_with_meta, n, embedded)

    # Keep the sorted sequence as a list: funnelling it through ``dict()``
    # discarded the ordering that had just been established.
    ordered = sorted(sheets_with_meta.items(),
                     key=functools.cmp_to_key(_compare_sheets_with_meta))
    rules = []
    for sheet, _meta in ordered:
        rules.extend(parse_rules(sheet.cssRules, display_medium))
    return rules
def apply_property_to_node(node, specificity, prop, current_style_state):
    """Record one candidate declaration for a node.

    ``current_style_state`` maps a property name to a dict of candidates
    keyed by ``(important,) + specificity``, where ``important`` is ``1``
    when ``prop.priority`` is truthy and ``0`` otherwise. A candidate with
    a key that is already present replaces the earlier one.

    :param node: the node being styled (not used by this function).
    :param specificity: tuple giving the specificity of the selector.
    :param prop: object with ``name`` and ``priority`` attributes.
    :param current_style_state: mapping updated in place.
    """
    importance = 1 if prop.priority else 0
    candidate_key = (importance,) + specificity
    candidates = current_style_state.get(prop.name, None)
    if not candidates:
        current_style_state[prop.name] = {candidate_key: prop}
        return
    candidates[candidate_key] = prop
def apply_properties_to_node(node, specificity, props, node_rules):
    """Record every declaration in ``props`` as a candidate for ``node``.

    Each declaration is handed, in order, to
    :func:`apply_property_to_node` together with the shared
    ``specificity`` and the ``node_rules`` mapping, which is updated in
    place.
    """
    for declaration in props:
        apply_property_to_node(node, specificity, declaration, node_rules)
# Sentinel telling "attribute missing" apart from "attribute is None".
_marker = object()


def finalize_style(node, node_rule_map, include_source=False):
    """Write the winning declarations into ``style`` attributes, recursively.

    For ``node`` and each of its descendants, the existing inline style (if
    any) is added to the candidates with specificity ``(1, 0, 0, 0)``; then,
    per property, the candidate with the greatest key wins and the winners
    are serialised into the node's ``style`` attribute.

    :param node: element to process; its children are processed as well.
    :param node_rule_map: mapping of element to candidate declarations, as
        built by :func:`apply_rules_to_node`. Missing elements get an
        empty entry.
    :param include_source: when true, each declaration is followed by a
        ``/* source */`` comment naming the last path segment of the
        stylesheet it came from, or ``inline``.
    """
    node_rules = node_rule_map.setdefault(node, OrderedDict())
    if node.get('style'):
        original_inline_style = cssutils.parseStyle(node.get('style'))
        del node.attrib['style']
        for prop in original_inline_style.getProperties():
            apply_property_to_node(node, (1, 0, 0, 0), prop, node_rules)

    props = [candidates[max(candidates)] for candidates in node_rules.values()]
    if props:
        declarations = []
        for prop in props:
            # Climb from the declaration towards the stylesheet that owns
            # it; the climb ends at None for parsed inline styles.
            source_element = prop
            while (source_element is not None
                   and not isinstance(source_element,
                                      cssutils.css.CSSStyleSheet)):
                if getattr(source_element, 'parentStyleSheet', _marker) is not _marker:
                    source_element = source_element.parentStyleSheet
                elif getattr(source_element, 'parent', _marker) is not _marker:
                    source_element = source_element.parent
                else:
                    source_element = source_element.parentRule
            source = 'inline'
            if source_element is not None and source_element.href:
                # A sheet without an href is reported as 'inline' too.
                source = source_element.href.rsplit('/', 1)[-1]
            declarations.append('\n{css};{source}'.format(
                css=prop.cssText,
                source=('/* {} */'.format(source) if include_source else '')
            ))
        node.attrib['style'] = ''.join(declarations)

    for child in node:
        # Pass the flag on; it used to be dropped here, so only the root
        # element ever received source comments.
        finalize_style(child, node_rule_map, include_source)
def apply_rules_to_node(node, rules):
    """Match every rule against the document and collect candidates.

    :param node: any element of the document; XPath expressions are
        evaluated on its root tree.
    :param rules: iterable of ``(xpaths_and_specificity, properties)``
        tuples, where the first item is a list of ``(xpath, specificity)``
        pairs.
    :returns: dict mapping each matched element to an ``OrderedDict`` of
        its candidate declarations.
    """
    root_tree = node.getroottree()
    rules_by_element = {}
    for selector_info, declarations in rules:
        for expression, specificity in selector_info:
            for element in root_tree.xpath(expression):
                element_rules = rules_by_element.setdefault(
                    element, OrderedDict()
                )
                apply_properties_to_node(
                    element, specificity, declarations, element_rules
                )
    return rules_by_element
def remove_sheets(doc):
    """Remove ``<style>`` elements and stylesheet ``<link>``s from ``doc``.

    Only links whose ``rel`` is ``stylesheet`` (compared case-insensitively,
    the same test :func:`parse_sheets` uses) are removed; other ``<link>``
    elements are left in place. Text that follows a removed element is
    kept.

    :param doc: lxml element of the parsed document; modified in place.
    """
    for element in doc.getroottree().xpath('//style|//link'):
        if (element.tag == 'link'
                and element.get('rel', '').lower() != 'stylesheet'):
            continue
        parent = element.getparent()
        if parent is None:
            continue
        if element.tail:
            # Removing an lxml element also drops its tail text, so hand
            # the tail over to whatever precedes the element first.
            previous = element.getprevious()
            if previous is not None:
                previous.tail = (previous.tail or '') + element.tail
            else:
                parent.text = (parent.text or '') + element.tail
        parent.remove(element)
def inline_css(html, doc_url, medium=None):
    """Return ``html`` with its stylesheets folded into ``style`` attributes.

    The document is parsed, its stylesheets are collected for ``medium``
    and removed, the rules are matched against the tree and the winning
    declarations are written to each element.

    :param html: HTML source of the document.
    :param doc_url: URL of the document, used to resolve stylesheet URLs.
    :param medium: medium dict forwarded unchanged to
        :func:`parse_sheets` for evaluating media queries (see the module
        docstring for its shape).
    :returns: the prettified markup produced by BeautifulSoup.
    """
    document = soupparser.fromstring(html)
    stylesheet_rules = parse_sheets(document, doc_url, medium)
    remove_sheets(document)
    finalize_style(document, apply_rules_to_node(document, stylesheet_rules))
    serialized = etree.tostring(document)
    return BeautifulSoup(serialized, 'lxml').prettify()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment