Created
September 20, 2016 18:33
-
-
Save englehardt/8a80ba904c0d314dbf61d3cd07a2ce5a to your computer and use it in GitHub Desktop.
BlockListParser Utilities
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
This file contains a collection of utilities for working with BlockListParser | |
using http data, such as that collected by OpenWPM (https://github.com/citp/OpenWPM). | |
publicsuffix (https://pypi.python.org/pypi/publicsuffix/) is required | |
Example usage: | |
from publicsuffix import PublicSuffixList | |
from BlockListParser import BlockListParser | |
psl = PublicSuffixList() | |
easylist = BlockListParser('easylist.txt') | |
# Sample data | |
url = 'http://www.advertiser.com/ads/ad.js' | |
top_url = 'http://www.example.com' | |
content_type = 'application/javascript' | |
options = get_option_dict(url, top_url, | |
is_js(url, content_type), | |
is_image(url, content_type), | |
psl) | |
if easylist.should_block(url, options): | |
print "URL %s would have been blocked by easylist" % url | |
""" | |
from urlparse import urlparse | |
def _has_any(mime, *fragments):
    """Return True if at least one of ``fragments`` is a substring of ``mime``."""
    return any(fragment in mime for fragment in fragments)


# Hand-built table, derived from the response content types observed on the
# top 1 million homepages in March 2016.
#
# Each key is a "top level" type and each value is a predicate that takes a
# content type string and reports whether it belongs to that type. The tests
# are plain substring checks, so several predicates can accept the same
# string (e.g. 'binary/octet-stream' satisfies both 'stream' and 'binary').
content_type_map = {
    'script': lambda x: (
        _has_any(x, 'javascript', 'ecmascript') or x.endswith('text/js')
    ),
    'image': lambda x: _has_any(
        x, 'image', 'img', 'jpg', 'jpeg', 'gif', 'png', 'ico'
    ),
    # Flash video ('flv') is deliberately left to the 'flash' entry.
    'video': lambda x: (
        'flv' not in x and _has_any(x, 'video', 'movie', 'mp4', 'webm')
    ),
    'css': lambda x: 'css' in x,
    'html': lambda x: 'html' in x,
    'plain': lambda x: 'html' not in x and 'plain' in x,
    'font': lambda x: _has_any(x, 'font', 'woff'),
    'json': lambda x: 'json' in x,
    # 'image/svg+xml' and friends are images, not xml documents.
    'xml': lambda x: 'image' not in x and 'xml' in x,
    'flash': lambda x: _has_any(x, 'flash', 'flv', 'swf'),
    'audio': lambda x: 'audio' in x,
    'stream': lambda x: 'octet-stream' in x,
    'form': lambda x: 'form' in x,
    'binary': lambda x: 'image' not in x and 'binary' in x,
}
# Lower-case URL path extensions that is_img() accepts as images when the
# content type alone does not identify the response as an image.
IMAGE_TYPES = {
    'tif', 'tiff',                              # TIFF
    'gif',                                      # GIF
    'jpeg', 'jpg', 'jif', 'jfif',               # JPEG
    'jp2', 'jpx', 'j2k', 'j2c',                 # JPEG 2000
    'fpx',
    'pcd',
    'png',                                      # PNG
}
def get_top_level_type(content_type):
    """Returns a "top level" type for a given mimetype string.

    This uses a manually compiled mapping of mime types (`content_type_map`).
    The top level types returned are approximately mapped to request context
    types in Firefox.

    Any parameters following the first ';' (e.g. ``; charset=utf-8``) are
    dropped, and the remaining media type is stripped of surrounding
    whitespace and lower-cased before it is matched. The predicates in
    `content_type_map` are tried in the mapping's iteration order and the key
    of the first one that accepts the media type is returned.

    Parameters
    ----------
    content_type : str or None
        content type string from the http response. May be None or empty
        when the response carried no Content-Type header.

    Returns
    -------
    str or None
        "top level" content type, e.g. 'image' or 'script'. None if the
        content type is missing, empty, or not matched by any predicate.
    """
    # Responses without a Content-Type header have nothing to classify.
    if not content_type:
        return None

    # Normalise once, up front, instead of once per predicate.
    media_type = content_type.split(';', 1)[0].strip().lower()
    if not media_type:
        return None

    # .items() (rather than .iteritems()) works on both Python 2 and 3.
    for top_level, matches in content_type_map.items():
        if matches(media_type):
            return top_level
    return None
def is_passive(content_type):
    """Checks if content is likely considered passive content.

    Browsers decide what to block from the *request* context, not from the
    response. For example, the request generated from a <script> element is
    classified as active content regardless of what the server sends back.
    Only the response content type is available here, so the classification
    is a best guess based on a custom mapping and will be imperfect.

    Passive content is taken to be images, audio and video, following (and
    ignoring <object> subresources):
    https://developer.mozilla.org/en-US/docs/Security/Mixed_content

    Parameters
    ----------
    content_type : str
        content type string from the http response.

    Returns
    -------
    bool
        True if the content_type indicates passive content, false otherwise.
    """
    passive_types = ('image', 'audio', 'video')
    top_level_type = get_top_level_type(content_type)
    return top_level_type in passive_types
def is_active(content_type):
    """Checks if content is likely considered active content.

    Browsers decide what to block from the *request* context, not from the
    response. For example, the request generated from a <script> element is
    classified as active content regardless of what the server sends back.
    Only the response content type is available here, so the classification
    is a best guess based on a custom mapping and will be imperfect.

    Active content is defined as everything `is_passive` rejects, so a
    content type that cannot be classified at all is reported as active.

    Parameters
    ----------
    content_type : str
        content type string from the http response.

    Returns
    -------
    bool
        True if the content_type indicates active content, false otherwise.
    """
    passive = is_passive(content_type)
    return not passive
def is_img(url, content_type):
    """Determine if a request url is an image.

    Preference is given to the content type, but will fall back to the
    extension of the url if necessary (including when the response carried
    no content type at all).

    Parameters
    ----------
    url : str
        request url
    content_type : str or None
        content type header of the http response to the request. None or
        an empty string means the header was absent.

    Returns
    -------
    bool
        True if the request is an image, false otherwise.
    """
    # Only consult the content type when there is one; a missing header must
    # fall through to the extension check rather than raise.
    if content_type and get_top_level_type(content_type) == 'image':
        return True

    # Text after the last '.' of the url path (query string excluded).
    extension = urlparse(url).path.split('.')[-1]
    return extension.lower() in IMAGE_TYPES
def is_js(url, content_type):
    """Determine if a request url is javascript.

    Preference is given to the content type, but will fall back to the
    extension of the url if necessary (including when the response carried
    no content type at all).

    Parameters
    ----------
    url : str
        request url
    content_type : str or None
        content type header of the http response to the request. None or
        an empty string means the header was absent.

    Returns
    -------
    bool
        True if the request is a JS file, false otherwise.
    """
    # Only consult the content type when there is one; a missing header must
    # fall through to the extension check rather than raise.
    if content_type and get_top_level_type(content_type) == 'script':
        return True

    # Text after the last '.' of the url path (query string excluded).
    extension = urlparse(url).path.split('.')[-1]
    return extension.lower() == 'js'
def get_option_dict(url, top_url, is_js, is_image, public_suffix_list):
    """Build an options dict for BlockListParser

    The request is flagged as third-party when the value returned by
    ``public_suffix_list.get_public_suffix`` for the request hostname differs
    from the one returned for the top-level page hostname.

    Parameters
    ----------
    url : str
        request url to be checked by BlockListParser
    top_url : str
        url of the top-level page the request is occurring on
    is_js : bool
        indicates if this request is js
    is_image : bool
        indicates if this request is an image
    public_suffix_list : PublicSuffixList
        An instance of PublicSuffixList()

    Returns
    -------
    dict
        An "options" dictionary for use with BlockListParser, with the keys
        "image", "script", "third-party", "domain" and "top_url".
    """
    page_host = urlparse(top_url).hostname
    request_host = urlparse(url).hostname

    page_domain = public_suffix_list.get_public_suffix(page_host)
    request_domain = public_suffix_list.get_public_suffix(request_host)

    return {
        "image": is_image,
        "script": is_js,
        "third-party": page_domain != request_domain,
        # NOTE(review): indentation was lost in the copy this was restored
        # from; "domain" is assumed to be the top-level page's hostname for
        # every request, not only third-party ones -- confirm.
        "domain": page_host,
        "top_url": top_url,
    }
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment