#!/usr/bin/env python # # Downloader for Google Web Fonts # # For usage information run with "--help" # # Works on Python 2.6 and later, 3 and later # Requires tinycss (and argparse for Python 2.6) from pip # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. # # Copyright 2012 Kevin Locke <kevin@kevinlocke.name> from __future__ import with_statement import argparse import collections import contextlib import errno import gzip import io import itertools import logging import os import re import shutil import sys import tinycss try: from httplib import HTTPConnection, HTTPSConnection except ImportError: from http.client import HTTPConnection, HTTPSConnection try: import urlparse except ImportError: import urllib.parse as urlparse VERSION=(0,1,0) # ADT for font positional command-line arguments FontArgument = collections.namedtuple("FontArgument", ["family", "variants"]) # ADT for download information DownloadInfo = collections.namedtuple("DownloadItem", ["url", "filename"]) # Default HTTP User-Agent string default_user_agent = \ "DL4GoogleWebFonts/" + ".".join(str(x) for x in VERSION) # Mapping from font format to file extension fontfmt_extensions = { "embedded-opentype": "eot", "opentype": "ttf", "svg": "svg", "truetype": "ttf", "woff": "woff", } # Mapping from font format to User-Agent string required to get the format fontfmt_user_agent = { # EOT is served to IE 8- # IE 8 on Windows 7 "embedded-opentype": "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)", # SVG is served to Safari Mobile 3-4 # Safari 3 on iPhone "svg": "Mozilla/5.0 (iPhone; U; CPU like Mac OS X) AppleWebKit/420+ (KHTML, like Gecko) Version/3.0 Mobile/1A543 Safari/419.3", # TTF is served to Android 4, Opera 11.01-, Safari Mobile 5+, non-Mobile Safari # Safari 6 on iPad "truetype": "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10", # WOFF is served to Chrome, Firefox, Opera 11.10+ # Firefox 15 on Ubuntu "woff": "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:15.0) Gecko/20100101 Firefox/15.0.1", } # Font formats which require separate requests for each variant # FIXME: This is probably UA-dependent. Switch to non-separate where possible fontfmt_serialize = frozenset(["embedded-opentype", "svg"]) def setup_logging(): """Initialize the global logger variable to a root logger for the console""" global logger formatter = logging.Formatter("%(levelname)s: %(message)s") handler = logging.StreamHandler() handler.setFormatter(formatter) logger = logging.getLogger() logger.addHandler(handler) setup_logging() # TODO: Replace with urllib3/Request (want to limit dependencies...) class ConnectionPool(object): """A very simple and naive connection pool for HTTP/HTTPS""" _http_connections = {} _https_connections = {} def close(self, proto, host): if proto == "http": conn = self._http_connections.pop(host, None) elif proto == "https": conn = self._https_connections.pop(host, None) else: raise ValueError("Unsupported protocol") if conn: conn.close() def close_all(self): conns = itertools.chain( self._http_connections.values(), self._https_connections.values() ) for conn in conns: conn.close() self._http_connections.clear() self._https_connections.clear() def get(self, proto, host): if proto == "http": if host not in self._http_connections: self._http_connections[host] = HTTPConnection(host) return self._http_connections[host] elif proto == "https": if host not in self._https_connections: self._https_connections[host] = HTTPSConnection(host) return self._https_connections[host] else: raise ValueError("Unsupported protocol") # Shared global connection pool connection_pool = ConnectionPool() class FontFaceRule(object): """A parsed at-rule for declaring a font-face.""" def __init__(self, at_keyword, declarations, line, column): self.at_keyword = at_keyword self.declarations = declarations self.line = line self.column = column class CSSFontFace3Parser(tinycss.css21.CSS21Parser): """A CSS parser which recognizes @font-face rules.""" def parse_at_rule(self, rule, previous_rules, errors, context): if rule.at_keyword == "@font-face": if rule.head: raise tinycss.css21.ParseError(rule.head[0], "Unexpected token {0} in {1} rule header".format( rule.head[0].type, rule.at_keyword)) declarations, body_errors = self.parse_declaration_list(rule.body) errors.extend(body_errors) return FontFaceRule(rule.at_keyword, declarations, rule.line, rule.column) return super(CSSFontFace3Parser, self).parse_at_rule(rule, previous_rules, errors, context) # FIXME: Should return HTTPResponse wrapper which handles decoding def decode_response(response): """Returns a file-like object of the content data in an HTTPResponse""" encoding = response.getheader("Content-Encoding") if encoding == "gzip": if sys.version_info < (3,2): gzipdata = io.BytesIO(response.read()) responsedata = gzip.GzipFile(fileobj=gzipdata) else: responsedata = gzip.GzipFile(fileobj=response) elif encoding == "identity" or not encoding: responsedata = response else: raise RuntimeError("Server used unsupported content encoding '{0}'".format(encoding)) return responsedata def download_file(url, filename): """ Downloads a given URL and save it with a given filename if that file does not exist """ logger.info("Downloading '{0}' as '{1}'".format(url, filename)) urlparts = urlparse.urlsplit(url) urlpath = urlparts.path if urlparts.query: urlpath += "?" + urlparts.query conn = connection_pool.get(urlparts.scheme, urlparts.netloc) headers = { "Accept-Encoding": "gzip", "Connection": "keep-alive", "User-Agent": default_user_agent, } conn.request(method="GET", url=urlpath, headers=headers) response = conn.getresponse() if response.status != 200: logger.error("Server returned status {0} ({1}) for {2}".format( response.status, response.reason, url)) return False else: logger.debug("Server returned status {0} ({1}) for {2}".format( response.status, response.reason, url)) responsedata = decode_response(response) try: fd = os.open(filename, os.O_WRONLY | os.O_CREAT | os.O_EXCL, 0666) except OSError as e: if e.errno == errno.EEXIST: logger.warn("File '{0}' already exists, skipping".format(filename)) else: logger.error("Unable to open output file '{0}': {1}".format(filename, str(e)), exc_info=True) return False try: with os.fdopen(fd, "wb") as outfile: shutil.copyfileobj(responsedata, outfile) except Exception as e: logger.error("Error downloading {0}: {1}".format(url, str(e)), exc_info=True) return False return True def choose_font_name(names): """Choose the "best" filename for a font from a given set""" safe = [n for n in names if "/" not in n] remaining = [n for n in names if " " not in n] if len(remaining) == 0: # Try other heuristics remaining = safe if len(remaining) > 1: # Return the longest choice = reduce(lambda m,n: m if len(m) > len(n) else n, remaining) else: choice = remaining[0] logger.debug("Chose name '{0}' from {1}".format(choice, names)) return choice def extract_font_names(srctokens): """Returns any local font names from a list of CSS tokens""" names = [] for token in srctokens: if token.type == "FUNCTION" and token.function_name == "local": # Content can be quoted or unquoted if len(token.content) == 1 and token.content[0].type == "STRING": names.append(token.content[0].value) else: names.append("".join([c.as_css() for c in token.content])) return names def extract_font_urls(fontfmt, srctokens): """Returns any URLs matching a given format in a given list of CSS tokens""" url = None # Last URL token parsed (cleared by non-S token) urls = [] # All URLs parsed for token in srctokens: if token.type == "URI": url = token.value elif token.type == "FUNCTION" and token.function_name == "format": if not url: logger.warn("CSS warning: Ignoring format() without associated url()") else: # CSS3 spec says format can be list of format strings # FIXME: Should warn about non-STRING tokens and 0 STRINGs urlfontfmts = [t.value for t in token.content if t.type == "STRING"] if fontfmt in urlfontfmts: urls.append(url) else: logger.debug("Ignoring URL {0} with format({1}) while fetching format({2})".format( url, "".join([c.as_css() for c in token.content]), fontfmt)) url = None elif token.type == "FUNCTION" and token.function_name == "local": # Ignore local name here pass elif token.type == "S": # Ignore space pass else: if url: logger.debug("Ignoring URL without format(): {0}".format(url)) url = None if token.type != "DELIM" or token.value != ",": logger.warn("CSS warning: Ignoring unexpected token {0}".format(token)) return urls def extract_font_downloads(fontfmt, rule): """Returns any font downloads for the specified format in the given CSS rule""" names = [] urls = [] for declaration in rule.declarations: if declaration.name == "src": names.extend(extract_font_names(declaration.value)) urls.extend(extract_font_urls(fontfmt, declaration.value)) if urls: if not names: name = urls[0].rsplit("/", 1)[-1].rsplit(".", 1)[0] logger.warn("No name found for {0}, using name from URL".format(urls[0])) else: name = choose_font_name(names) # Ensure urls are unique urls = set(urls) if len(urls) > 1: logger.warn("Ignoring additional URLs for same format: {0}".format(urls)) fontfmt_ext = fontfmt_extensions[fontfmt] url = urls.pop() ext = urlparse.urlsplit(url).path.rsplit(".", 1)[-1] if "/" in ext: logger.debug("No extension for '{0}', using '{1}' from format".format(url, fontfmt_ext)) ext = fontfmt_ext elif ext != fontfmt_ext: logger.warn("URL extension '{0}' does not match format extension '{1}'".format(ext, fontfmt_ext)) downloads = [ DownloadInfo(url=url,filename=name+"."+ext) ] else: logger.warn("Ignoring @font-face without src") downloads = [] return downloads def fetch_fonts_from_css(fontfmt, stylesheet): """Downloads any fonts for the given format in the given CSS stylesheet""" downloads=[] haveff = False for rule in stylesheet.rules: if rule.at_keyword == "@font-face": haveff = True downloads.extend(extract_font_downloads(fontfmt, rule)) downloadcnt = 0 if not haveff: logger.warn("No @font-face rules found in stylesheet") else: for download in downloads: if download_file(download.url, download.filename): downloadcnt += 1 return downloadcnt def make_css_path(subsets, fonts): """Returns the path to a CSS file for the given subsets and fonts""" url = "/css?family=" families = [] for font in fonts: family = font.family.replace(" ", "+") if font.variants: family += ":" + ",".join(font.variants) families.append(family) url += "|".join(families) if subsets: url += "&subset=" + ",".join(subsets) return url def fetch_css(fontfmt, subsets, fonts): """Downloads CSS files with the given formats, subsets, and fonts""" path = make_css_path(subsets, fonts) user_agent = fontfmt_user_agent[fontfmt] headers = { "Accept": "text/css", "Accept-Encoding": "gzip", "Connection": "keep-alive", "User-Agent": user_agent, } logger.info("Downloading {0} for {1} format".format(path, fontfmt)) conn = connection_pool.get("http", "fonts.googleapis.com") conn.request(method="GET", url=path, headers=headers) return conn.getresponse() def parse_css(response): """Converts a CSS HTTPResponse into a tinycss stylesheet""" content_type = response.getheader("Content-Type", "text/css") css_charset_re = "text/css\s*;\s*charset\s*=\s*([^\s;]+)\s*(?:;|$)" css_charset_match = re.match(css_charset_re, content_type, re.I) charset = css_charset_match.group(1) if css_charset_match else None parser = tinycss.make_parser(CSSFontFace3Parser) cssdata = decode_response(response) logger.debug("Parsing CSS response with charset '{0}'".format(charset)) return parser.parse_stylesheet_bytes(cssdata.read(), charset) def fetch_fonts_format(fontfmt, subsets, fonts): """Downloads font files for a given format, subsets, and fonts""" response = fetch_css(fontfmt, subsets, fonts) if response.status != 200: logger.error("Server returned status {0} ({1}) for CSS file. Incorrect font name?".format( response.status, response.reason)) # Need to empty response before sending next request response.read() return 0 stylesheet = parse_css(response) return fetch_fonts_from_css(fontfmt, stylesheet) def fetch_fonts(fontfmts, subsets, fonts): """Downloads font files for the given formats, subsets, and fonts""" downloadcnt = 0 for fontfmt in fontfmts: if fontfmt in fontfmt_serialize: for i in itertools.count(): # A list of fonts with variant i of their list fonts1v = [] for font in fonts: # Note: Include empty variant on first pass, if empty if len(font.variants) > i or (i == 0 and len(font.variants) == 0): fonts1v.append(font._replace(variants=font.variants[i:i+1])) if not fonts1v: # All variants of all fonts have been fetched break downloadcnt += fetch_fonts_format(fontfmt, subsets, fonts1v) else: downloadcnt += fetch_fonts_format(fontfmt, subsets, fonts) return downloadcnt def parse_font_arg(arg): """Parses a command-line argument into a FontArgument""" if ":" in arg: family, variants = arg.split(":", 1) if "," in variants: variants = variants.split(",") else: variants = [ variants ] else: family = arg variants = [] return FontArgument(family=family, variants=variants) def main(*argv): parser = argparse.ArgumentParser(description="Download Google Web Fonts") parser.add_argument( '-f', '--format', action="append", help="Format to download (may appear multiple times)", choices=sorted(fontfmt_user_agent.keys())) parser.add_argument( '-q', '--quiet', action="count", help="Decrease verbosity (make quieter)") parser.add_argument( '-s', '--subset', action="append", help="Subset to download (may appear multiple times)") parser.add_argument( '-v', '--verbose', action="count", help="Increase verbosity") parser.add_argument( '-V', '--version', action="version", version="%(prog)s " + ".".join(str(x) for x in VERSION)) parser.add_argument( 'font', nargs="+", help="Font to download (in same format as CSS URL)", type=parse_font_arg) args = parser.parse_args(args=argv[1:]) # By default, download all formats if not args.format: args.format = fontfmt_user_agent.keys() # Set log level based on verbosity requested (default of INFO) verbosity = (args.quiet or 0) - (args.verbose or 0) logger.setLevel(logging.INFO + verbosity * 10) try: fetched = fetch_fonts(frozenset(args.format), args.subset, args.font) logger.info("Finished downloading {0} font files".format(fetched)) return 0 except Exception as e: logger.error("Unexpected internal error: {0}".format(str(e)), exc_info=True) return 1 finally: try: connection_pool.close_all() except Exception as e: pass if __name__ == "__main__": sys.exit(main(*sys.argv))