Skip to content

Instantly share code, notes, and snippets.

@akx
Created February 3, 2012 08:55

Revisions

  1. akx created this gist Feb 3, 2012.
    101 changes: 101 additions & 0 deletions squeeze.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,101 @@
    # -- encoding: utf-8 --
    """ A fairly advanced HTML/HTML5 compressor. """

    from __future__ import with_statement
    import re
    from itertools import chain

    HTML5_BLOCKS = set(
    'head body title link '
    'article aside nav section '
    'h1 h2 h3 h4 h5 h6 hgroup p'.strip().split()
    )
    HTML5_BLOCKS_RE_FRAG = "(%s)" % ("|".join(re.escape(b) for b in sorted(HTML5_BLOCKS)))
    STRAY_LEAD_WS_RE = re.compile(r"^\s+", re.MULTILINE)
    STRAY_TRAIL_WS_RE = re.compile(r"\s+$", re.MULTILINE)
    QUOTED_PARAM_RE = re.compile(r'([a-z]+)=\"([-._:/a-z0-9]+)\"', re.I)
    XHTML_CLOSE_RE = re.compile(r'\s+/>')
    WS_BEFORE_BLOCK_START = re.compile(r'\s+(<%s>)' % HTML5_BLOCKS_RE_FRAG, re.I)
    WS_AFTER_BLOCK_START = re.compile(r'(<%s>)\s+' % HTML5_BLOCKS_RE_FRAG, re.I)
    WS_BEFORE_BLOCK_END = re.compile(r'\s+(</%s>)' % HTML5_BLOCKS_RE_FRAG, re.I)
    WS_AFTER_BLOCK_END = re.compile(r'(</%s>)\s+' % HTML5_BLOCKS_RE_FRAG, re.I)
    NEWLINE_RE = re.compile(r'[\r\n]+')
    EMPTY_PARAM_RE = re.compile(r'(rel|id|class)=\"\s*\"')
    NEWLINE_SEP_PARAM_RE = re.compile(r'[\r\n]+(\w+=[\"\'])')
    CLEAN_SIMPLE_TAG_RE = re.compile(r'<\s*(\w+)\s*>')
    SIMPLE_TEXT_TAG_RE = re.compile(r'<(?P<tag>a)(?P<params>.*)>(?P<content>[\w\s&;\n]+)</(?P=tag)>', re.UNICODE)


    def unquote(match):
    """ Internal: Unquote an HTML attribute. In cases such as defer="defer", turn them into just defer. """
    name = match.group(1)
    val = match.group(2)
    if name == val:
    return name
    return "%s=%s" % (name, val)


    # List of conservative replacements.
    conservativeReps = (
    ( # Clean line-starting and line-ending whitespaces.
    (STRAY_TRAIL_WS_RE, STRAY_LEAD_WS_RE),
    ""
    ),
    ( # Clean whitespaces before and after block starts and ends.
    (WS_BEFORE_BLOCK_START, WS_BEFORE_BLOCK_END, WS_AFTER_BLOCK_END, WS_AFTER_BLOCK_START),
    r'\1'
    ),
    )

    # List of drastic replacements.
    drasticReps = (
    ( # Turn XHTML closing tags (/>) into HTML closing tags (>)
    (XHTML_CLOSE_RE, ),
    '>'
    ),
    ( # Remove empty parameters
    (EMPTY_PARAM_RE, ),
    ""
    ),
    ( # ensure parameters are delimited by spaces
    (NEWLINE_SEP_PARAM_RE, ),
    r" \1"
    ),
    ( # Unquote HTML parameters that are safe to unquote.
    (QUOTED_PARAM_RE, ),
    unquote
    ),
    ( # ensure simple tags (after empty-params) don't have spaces in them
    (CLEAN_SIMPLE_TAG_RE, ),
    r"<\1>"
    ),
    ( # ensure <a href="ccc"> a </a> turns into something saner
    (SIMPLE_TEXT_TAG_RE, ),
    lambda m: "<%s%s>%s</%s>" % (m.group("tag"), m.group("params"), m.group("content").strip(), m.group("tag"))
    ),

    )


    def squeeze_html(html, conservative=False):
    """ Squeeze every single spare byte out of the given HTML. If 'conservative' is set, attempt to retain XHTML compliance. """
    reps = (conservativeReps if conservative else chain(conservativeReps, drasticReps))
    for rs, rep in reps:
    for r in rs:
    html = r.sub(rep, html)
    html = NEWLINE_RE.sub("\n", html).strip()
    return html

    def cmdline():
    import argparse
    ap = argparse.ArgumentParser("squeeze")
    ap.add_argument("files", nargs="+")
    ap.add_argument("--conservative", "-c", default = False, action="store_true")
    opts = ap.parse_args()
    for filename in opts.files:
    with file(filename, "rb") as in_file:
    print squeeze_html(in_file.read(), bool(opts.conservative))


    if __name__ == '__main__':
    cmdline()