@erikaris
Last active February 20, 2017

PhantomJS vs PyQt4 Crawler

PhantomJS usage

phantomjs crawl.js [URI] [OutDir]
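
For example (with a hypothetical URI and output directory):

phantomjs crawl.js http://example.com/ output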

PyQt4 usage

python crawl.py [URI] [OutDir]
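
For example (same hypothetical arguments):

python crawl.py http://example.com/ output

Both crawlers write the same layout under [OutDir]: a screenshot in screenshot/, the HTML source in html/, and line-delimited JSON logs of the network resources, images, and stylesheets in log/, each file named after the MD5 hash of the URI.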

var fs = require('fs');
var page = require('webpage').create();
var networkResources = {};
// Import md5 from CryptoJS to hash the URI
phantom.injectJs('md5.js');
// Import underscore.js to help keep the resource lists unique
phantom.injectJs('underscore.js');
// Set start time
var starttime = Date.now();
// If the number of arguments after crawl.js is not 2, show a usage message and exit PhantomJS
if (phantom.args.length != 2) {
    console.log('Usage: phantomjs crawl.js <URI> <outputDir>');
    phantom.exit(1);
}
// Otherwise, continue and open the URI
else {
    // Use the 1st argument after crawl.js as the input URL and the 2nd as the output directory
    url = phantom.args[0];
    outputDir = phantom.args[1];
    // Set the timeout on fetching resources to 10 seconds (can be changed)
    page.settings.resourceTimeout = 10000;
    page.onResourceTimeout = function(e) {
        console.log('Resource', e.url, 'timeout.', e.errorCode, e.errorString);
    };
    // Use a 1024x768 viewport (also used for the screenshot)
    page.viewportSize = { width: 1024, height: 768 };
    // Resources here are the same as those listed in developer tools -> Network tab after a refresh
    page.onResourceReceived = function (res) {
        var resUrl = res.url;
        console.log('Resource received', resUrl);
        // Save every network resource into networkResources.
        // Resource events are sometimes duplicated, so only add an entry if its URL
        // is not already a key of networkResources (checked with underscore.js).
        var headers = {};
        res.headers.forEach(function(header) {
            headers[header['name']] = header['value'];
        });
        var resource = {
            'url' : resUrl,
            'status_code' : res.status,
            'content_type' : res.contentType,
            'headers' : headers,
        };
        var networkResourcesKeys = Object.keys(networkResources);
        if(! _.contains(networkResourcesKeys, resUrl)) {
            networkResources[resUrl] = resource;
        }
    };
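    // Illustrative shape of one entry appended later to <outputDir>/log/<md5(url)>.log
    // (the values below are made-up examples, not real output):
    // {"url": "http://example.com/style.css", "status_code": 200,
    //  "content_type": "text/css", "headers": {"Content-Type": "text/css"}}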
    // Open the URI
    page.open(url, function (status) {
        if (status !== 'success') {
            console.log('Unable to load the url', url);
            phantom.exit(1);
        } else {
            // After the page has opened, process it.
            // Use setTimeout to delay processing; the timeout is in ms, i.e. 200 ms here.
            window.setTimeout(function () {
                processPage(url, outputDir);
                // Set finish time
                var finishtime = Date.now();
                // Report that the crawl finished and how long it took
                console.log('Crawl finished in', (finishtime - starttime), 'milliseconds');
                phantom.exit();
            }, 200);
        }
    });
}
function processPage(url, outputDir) {
    var hashedUrl = md5(url);
    // Save a screenshot
    var screenshotFile = outputDir + '/screenshot/' + hashedUrl + '.png';
    page.render(screenshotFile);
    console.log('Screenshot is saved in', screenshotFile);
    // Save the HTML source using fs.write.
    // DOM selection or modification must always be done inside page.evaluate.
    var htmlFile = outputDir + '/html/' + hashedUrl + '.html';
    var html = page.evaluate(function() {
        return document.body.parentElement.outerHTML;
    });
    fs.write(htmlFile, html, "w");
    console.log('HTML source of page is saved in', htmlFile);
    // Save all resources.
    // networkResources entries are sometimes duplicated; see
    // http://stackoverflow.com/questions/1960473/unique-values-in-an-array for the filtering approach.
    var resourceFile = outputDir + '/log/' + hashedUrl + '.log';
    var networkResourcesKeys = Object.keys(networkResources);
    for(r=0; r<networkResourcesKeys.length; r++) {
        var value = networkResources[networkResourcesKeys[r]];
        fs.write(resourceFile, JSON.stringify(value) + '\n', "a");
    }
    console.log('Network resources are saved in', resourceFile);
    processImages(url, outputDir);
    processCsses(url, outputDir);
}
function processImages(url, outputDir) {
    var hashedUrl = md5(url);
    // Get images via document.images
    // (document.images can also be executed in the browser console)
    var images = page.evaluate(function () {
        var documentImages = document.images;
        var allImages = [];
        for(var i=0; i<documentImages.length; i++) {
            var docImage = documentImages[i];
            // Calculate the top-left position by walking up the offsetParent chain
            var obj = docImage;
            var curleft = 0, curtop = 0;
            if (obj.offsetParent) {
                do {
                    curleft += obj.offsetLeft;
                    curtop += obj.offsetTop;
                } while (obj = obj.offsetParent);
            }
            // Create JSON containing the URL and the bounding rectangle
            var jsonImage = {
                'url' : docImage['src'],
                'rectangle' : {
                    'width' : docImage['width'],
                    'height' : docImage['height'],
                    'top' : curtop,
                    'left' : curleft,
                },
            };
            // Append to the list of all images
            allImages.push(jsonImage);
        }
        return allImages;
    });
    // If an image URL matches a resource URL, append its position to that resource
    var networkImages = [];
    var networkResourcesKeys = Object.keys(networkResources);
    for(var i=0; i<images.length; i++) {
        var image = images[i];
        idx = _.indexOf(networkResourcesKeys, image['url']);
        if(idx >= 0) {
            var networkImage = networkResources[networkResourcesKeys[idx]];
            // Initialize the rectangles list on first match, then append this rectangle
            if(!('rectangles' in networkImage)) {
                networkImage['rectangles'] = [];
            }
            networkImage['rectangles'].push(image['rectangle']);
            networkImages.push(networkImage);
        }
    }
    // Save all image resources
    var resourceImageFile = outputDir + '/log/' + hashedUrl + '.img.log';
    for(r=0; r<networkImages.length; r++) {
        fs.write(resourceImageFile, JSON.stringify(networkImages[r]) + '\n', "a");
    }
    console.log('Network resource images are saved in', resourceImageFile);
}
function processCsses(url, outputDir) {
    var hashedUrl = md5(url);
    var csses = page.evaluate(function () {
        // Get all stylesheets (document.styleSheets can also be run in the browser console)
        var documentCsses = document.styleSheets;
        var allCsses = [];
        for(var c=0; c<documentCsses.length; c++) {
            var docCss = documentCsses[c];
            // For each stylesheet, get its rules
            var rules = docCss.cssRules || [];
            // For each rule, get its selectorText
            var rules_tag = [];
            for(var r=0; r<rules.length; r++) {
                var rule = rules[r].selectorText;
                rules_tag.push(rule);
            }
            // Create JSON containing the URL and the rules
            var jsonCss = {
                'url' : docCss['href'] || '[INTERNAL]',
                'rules_tag' : rules_tag,
            };
            allCsses.push(jsonCss);
        }
        return allCsses;
    });
    // If a stylesheet URL matches a resource URL, merge the resource data into it
    var networkCsses = [];
    var networkResourcesKeys = Object.keys(networkResources);
    for(var i=0; i<csses.length; i++) {
        var css = csses[i];
        idx = _.indexOf(networkResourcesKeys, css['url']);
        if(idx >= 0) {
            var networkCss = networkResources[networkResourcesKeys[idx]];
            css = _.extend(css, networkCss);
        }
        if('rules_tag' in css) {
            // Importance is the total number of elements matched by the stylesheet's selectors
            var importance = 0;
            for(var r=0; r<css['rules_tag'].length; r++) {
                var rule = css['rules_tag'][r];
                importance += calculateImportance(rule);
            }
            css['importance'] = importance;
            networkCsses.push(css);
        }
    }
    // Save all stylesheet resources
    var resourceCssFile = outputDir + '/log/' + hashedUrl + '.css.log';
    for(r=0; r<networkCsses.length; r++) {
        fs.write(resourceCssFile, JSON.stringify(networkCsses[r]) + '\n', "a");
    }
    console.log('Network resource stylesheets are saved in', resourceCssFile);
}
function calculateImportance(rule) {
    var importance = 0;
    if(rule == undefined) {
        // Skip undefined selectors
    } else if(rule.match(/^\..*/i)) {
        // Class selector, e.g. ".foo"
        importance += page.evaluate(getNumElementsByClass, rule);
    } else if(rule.match(/^#.*/i)) {
        // Leading ID selector, e.g. "#foo bar": count the element with that ID
        var theArr = rule.split('#');
        var theArr2 = theArr[1].split(' ');
        var theGuy = theArr2[0];
        importance += page.evaluate(getNumElementByID, theGuy);
    } else if(rule.match(/.*#.*/i)) {
        // Selector containing an ID somewhere else
        importance += page.evaluate(getNumElementByID, rule);
    } else if(rule.match(/[a-zA-Z]*\..*/g)) {
        // Tag plus class selector, e.g. "div.foo"
        var theArr = rule.split('.');
        importance += page.evaluate(getNumElementsByTagAndClass, theArr[0], theArr[1]);
    } else if(!(rule.match(/\./ig))) {
        // Plain tag selector, e.g. "div"
        importance += page.evaluate(getNumElementsByTag, rule);
    } else {
        // Anything else is ignored
    }
    return importance;
}
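// Examples of how selectors are dispatched above (hypothetical selectors):
//   '.nav'       -> getNumElementsByClass('.nav')
//   '#header p'  -> getNumElementByID('header')
//   'div.item'   -> getNumElementsByTagAndClass('div', 'item')
//   'p'          -> getNumElementsByTag('p')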
function getNumElementsByClass(className) {
    var counter = 0;
    var elems = document.getElementsByTagName('*');
    for (var i = 0; i < elems.length; i++) {
        if((' ' + elems[i].className + ' ').indexOf(' ' + className + ' ') > -1) {
            counter++;
        }
    }
    return counter;
}
function getNumElementByID(id) {
    var theThing = document.getElementById(id);
    if(theThing == null)
        return 0;
    return 1;
}
function getNumElementsByTagAndClass(tagName, className) {
    var counter = 0;
    var elems = document.getElementsByTagName(tagName);
    for (var i = 0; i < elems.length; i++) {
        if((' ' + elems[i].className + ' ').indexOf(' ' + className + ' ') > -1) {
            counter++;
        }
    }
    return counter;
}
function getNumElementsByTag(tagName) {
    return document.getElementsByTagName(tagName).length;
}
import errno
import json
import os
from datetime import datetime
from functools import partial
from urlparse import urlparse
from PyQt4.QtCore import QObject, SIGNAL, QUrl, QVariant
from PyQt4.QtGui import QImage, QPainter
from PyQt4.QtNetwork import QNetworkAccessManager, QNetworkRequest
from PyQt4.QtWebKit import QWebView, QWebPage, QWebSettings

'''
Section 'Crawl' ==============================================================
'''
def variant_to_json(variant):
    if variant.type() == QVariant.Map:
        obj = {}
        for k, v in variant.toMap().items():
            obj[unicode(k)] = variant_to_json(v)
        return obj
    if variant.type() == QVariant.List:
        lst = []
        for v in variant.toList():
            lst.append(variant_to_json(v))
        return lst
    if variant.type() == QVariant.String:
        return str(variant.toString())
    if variant.type() == QVariant.Int:
        return int(variant.toString())
    if variant.type() == QVariant.Double:
        return float(variant.toString())
    if variant.type() == QVariant.Bool:
        return bool(variant.toBool())
    return unicode(variant.toString())
class CrawlNetwork(QNetworkAccessManager):
    contentTypeHeader = QNetworkRequest.ContentTypeHeader

    def __init__(self, web, logger):
        QNetworkAccessManager.__init__(self)
        QObject.connect(self, SIGNAL("finished(QNetworkReply *)"),
                        self.finished)
        self.web = web
        self.logger = logger

    def finished(self, response):
        url = unicode(response.request().url().toString())
        base_url = unicode(self.web.page().mainFrame().baseUrl().toString())
        blocked = False
        for bl in self.web.blacklists:
            in_bl = str(bl) in str(url)
            blocked = blocked or in_bl
        headers = {}
        for header in response.rawHeaderList():
            headers[unicode(header.data())] = response.rawHeader(header).data()
        url_domain = '{uri.scheme}://{uri.netloc}/'.format(uri=urlparse(url))
        base_url_domain = '{uri.scheme}://{uri.netloc}/'.format(
            uri=urlparse(base_url))
        resource = {
            'url' : url,
            'content_type' : unicode(response.header(
                self.contentTypeHeader).toString()),
            'headers' : headers,
            'error_code' : response.error(),
            'is_local' : url_domain == base_url_domain,
            'is_blocked' : blocked
        }
        self.logger(resource)
class CrawlBrowser(QWebView):
    resources = []

    def __init__(self):
        QWebView.__init__(self)
        self.settings().setAttribute(QWebSettings.JavascriptEnabled, True)

    def get_resources(self, output_file=None):
        if output_file:
            with open(output_file, 'wb') as f:
                print 'saving resources log to', output_file
                f.write('\n'.join([json.dumps(stat) for stat in
                                   self.resources]))
        return self.resources

    def make_directory_recursive(self, file):
        dir = os.path.dirname(os.path.realpath(file))
        try:
            os.makedirs(dir)
        except OSError, e:
            if e.errno != errno.EEXIST:
                raise

    def take_screenshot(self, output_file):
        # set viewport to the full webpage size
        self.page().setViewportSize(self.page().mainFrame().contentsSize())
        # render the page into an image
        image = QImage(self.page().viewportSize(), QImage.Format_ARGB32)
        painter = QPainter(image)
        self.page().mainFrame().render(painter)
        painter.end()
        # create the directory of the file
        self.make_directory_recursive(output_file)
        # save the image
        print 'saving screenshot to', output_file
        image.save(output_file)

    def get_html(self, output_file=None):
        html = self.page().mainFrame().toHtml()
        html = unicode(html).encode('utf-8')
        if output_file:
            # create the directory of the file
            self.make_directory_recursive(output_file)
            print 'saving html source to', output_file
            with open(output_file, 'wb') as f:
                f.write(html)
        return html

    def get_images(self, output_file=None):
        imgs = {}
        res_urls = [res['url'] for res in self.resources]
        document = self.page().mainFrame().documentElement()
        for img in document.findAll('img'):
            for idx, res_url in enumerate(res_urls):
                if res_url.endswith(str(img.attribute('src'))):
                    if res_url not in imgs:
                        imgs[res_url] = self.resources[idx]
                    imgs[res_url].setdefault('rectangles', [])
                    imgs[res_url]['rectangles'].append({
                        'left' : img.geometry().x(),
                        'top' : img.geometry().y(),
                        'width' : img.geometry().width(),
                        'height' : img.geometry().height()
                    })
        if output_file:
            # create the directory of the file
            self.make_directory_recursive(output_file)
            with open(output_file, 'wb') as f:
                print 'saving images log to', output_file
                f.write('\n'.join([json.dumps(img) for url, img in
                                   imgs.items()]))
        return imgs.values()
    def get_stylesheets(self, output_file=None):
        jsFn = """
        function getNumElementsByClass(className) {
            var counter = 0;
            var elems = document.getElementsByTagName('*');
            for (var i = 0; i < elems.length; i++) {
                if((' ' + elems[i].className + ' ').indexOf(' ' + className + ' ') > -1) {
                    counter++;
                }
            }
            return counter;
        }
        function getNumElementByID(id) {
            var theThing = document.getElementById(id);
            if(theThing == null)
                return 0;
            return 1;
        }
        function getNumElementsByTagAndClass(tagName, className) {
            var counter = 0;
            var elems = document.getElementsByTagName(tagName);
            for (var i = 0; i < elems.length; i++) {
                if((' ' + elems[i].className + ' ').indexOf(' ' + className + ' ') > -1) {
                    counter++;
                }
            }
            return counter;
        }
        function getNumElementsByTag(tagName) {
            return document.getElementsByTagName(tagName).length;
        }
        function cssRules() {
            var styles = document.styleSheets;
            var allRules = {};
            for(var s=0; s<styles.length; s++) {
                var style = styles[s];
                var rules = [];
                var selectors = [];
                if(style.cssRules) {
                    rules = style.cssRules;
                } else if(style.rules) {
                    rules = style.rules;
                }
                for(var i=0; i < rules.length; i++) {
                    selectors.push(rules[i].selectorText);
                }
                if(style.href != null)
                    allRules[style.href] = selectors;
                else allRules['[INTERNAL]'] = selectors;
            }
            var allRulesImportance = {};
            for(url in allRules) {
                var props = allRules[url];
                var importance = 0;
                for(var p=0; p<props.length; p++) {
                    var prop = props[p];
                    if(prop == undefined) {
                        continue;
                    } else if(prop.match(/^\..*/i)) {
                        importance += getNumElementsByClass(prop);
                    } else if(prop.match(/^#.*/i)) {
                        var theArr = prop.split('#');
                        var theArr2 = theArr[1].split(' ');
                        var theGuy = theArr2[0];
                        importance += getNumElementByID(theGuy);
                    } else if(prop.match(/.*#.*/i)) {
                        importance += getNumElementByID(prop);
                    } else if(prop.match(/[a-zA-Z]*\..*/g)) {
                        var theArr = prop.split('.');
                        importance += getNumElementsByTagAndClass(theArr[0], theArr[1]);
                    } else if(!(prop.match(/\./ig))) {
                        importance += getNumElementsByTag(prop);
                    } else {
                    }
                }
                allRulesImportance[url] = {
                    //'src' : url,
                    'rules_tag' : props,
                    'importance' : importance
                }
                if(url=='[INTERNAL]')
                    allRulesImportance[url]['url'] = url
            }
            return allRulesImportance;
        }
        cssRules();
        """
        res_urls = [res['url'] for res in self.resources]
        stylesheets_importance = self.page().mainFrame().evaluateJavaScript(jsFn)
        stylesheets_importance = variant_to_json(stylesheets_importance)
        if output_file:
            # create the directory of the file
            self.make_directory_recursive(output_file)
            with open(output_file, 'wb') as f:
                for url, css in stylesheets_importance.items():
                    if '[INTERNAL]' in url:
                        f.write(json.dumps(css) + '\n')
                    csses = []
                    for idx, res_url in enumerate(res_urls):
                        if res_url.endswith(url):
                            css.update(self.resources[idx])
                            csses.append(css)
                    print 'saving stylesheets log to', output_file
                    f.write('\n'.join([json.dumps(css) for css in csses]))
        return stylesheets_importance.values()
class CrawlListener:
    def on_start(self, id, browser):
        pass

    def on_loaded(self, id, browser):
        pass

    def on_resource_received(self, log, id, *browser):
        pass

    def on_finished(self, sessions):
        pass
class Crawler(object):
    sessions = dict()
    browsers = dict()
    blacklists = []

    def __init__(self, app, listener=CrawlListener()):
        self.app = app
        self.listener = listener

    def set_blacklists(self, blacklists):
        self.blacklists = blacklists

    def add_blacklist(self, blacklist):
        self.blacklists.append(blacklist)

    def browser_started(self, id, sess_id):
        browser, url, status = self.browsers[id]
        self.listener.on_start(id, browser)

    def browser_loaded(self, id, sess_id, is_loaded):
        if is_loaded:
            browser, url, status = self.browsers[id]
            if not status:
                self.browsers[id] = (browser, url, True)
                self.listener.on_loaded(id, browser)
            if all([status for browser, url, status in self.browsers.values()]):
                print('Session {} finished'.format(sess_id))
                urls = [url for browser, url, status in self.browsers.values()]
                self.sessions[sess_id] = (sess_id, urls, True)
            if all([status for id, urls, status in self.sessions.values()]):
                print('All sessions finished')
                self.listener.on_finished(self.sessions.values())
                self.app.quit()
            if all([status for browser, url, status in self.browsers.values()]):
                try:
                    self.start_session(self.sessions[sess_id+1])
                except:
                    pass

    def browser_resources_received(self, browser_id, session_id, log):
        self.browsers[browser_id][0].resources.append(log)
        browser, url, status = self.browsers[browser_id]
        self.listener.on_resource_received(log, browser_id, browser)
        # if not log['is_blocked']:
        #     self.resources.append(log)

    def start_session(self, session):
        sess_id, urls, is_processed = session
        print('Session {} started'.format(sess_id))
        for id, url in enumerate(urls):
            browser = CrawlBrowser()
            browser.blacklists = self.blacklists
            browser.setPage(QWebPage())
            network_fn = partial(self.browser_resources_received, id, sess_id)
            browser.page().setNetworkAccessManager(
                CrawlNetwork(browser, network_fn))
            started_fn = partial(self.browser_started, id, sess_id)
            browser.loadStarted.connect(started_fn)
            loaded_fn = partial(self.browser_loaded, id, sess_id)
            browser.loadFinished.connect(loaded_fn)
            self.browsers[id] = (browser, url, False)
        for id, (browser, url, status) in self.browsers.items():
            browser.load(QUrl(url))

    def start(self, urls):
        if isinstance(urls, basestring) or isinstance(urls, unicode):
            urls = [urls, ]
        urls = list(set(urls))
        # Crawl at most 10 URLs per session
        sessions_urls = [urls[x:x+10] for x in xrange(0, len(urls), 10)]
        for id, session_urls in enumerate(sessions_urls):
            self.sessions[id] = (id, session_urls, False)
        print('Processing {} urls in {} sessions'.format(len(urls),
                                                         len(self.sessions)))
        self.start_session(self.sessions[0])
'''
Section 'Damage' =============================================================
'''
class SiteDamage:
    image_importance = 0

    def __init__(self):
        pass

    def calculate_images_importance(self, str_images_log, page_size=(0, 0),
                                    size_weight=0.5, centrality_weight=0.5):
        page_width, page_height = page_size
        for log in str_images_log:
            log = json.loads(log)
            for image_rect in log['rectangles']:
                location_importance = 0
                # Based on measureMemento.pl line 703
                if image_rect['left'] + image_rect['width'] > page_width/2 \
                        and image_rect['left'] < page_width/2:
                    location_importance += centrality_weight/2
                # Based on measureMemento.pl line 715
                if image_rect['top'] + image_rect['height'] > page_height/2 \
                        and image_rect['top'] < page_height/2:
                    location_importance += centrality_weight/2
                # Based on measureMemento.pl line 729
                # (cast to float so Python 2 integer division does not truncate to 0)
                img_size_to_page_size_prop = float(image_rect['width'] *
                                                   image_rect['height']) / \
                                             (page_width * page_height)
                size_importance = img_size_to_page_size_prop * size_weight
                self.image_importance = size_importance + location_importance
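# Worked example for calculate_images_importance (hypothetical numbers):
# on a 1024x768 page, an image rectangle with left=400, top=300, width=300,
# height=200, size_weight=0.5 and centrality_weight=0.5 gives
#   centrality: straddles the horizontal middle (400 < 512 < 700) -> +0.25
#               straddles the vertical middle   (300 < 384 < 500) -> +0.25
#   size:       (300*200)/(1024*768) ~= 0.076, weighted by 0.5    -> +0.038
# so image_importance ~= 0.538.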
if __name__ == "__main__":
    import sys
    import os
    from hashlib import md5
    from PyQt4.QtGui import QApplication

    if len(sys.argv) != 3:
        print('Usage :')
        print('python crawl.py <url> <output_dir>')
        exit()

    url = sys.argv[1]
    output_dir = sys.argv[2]

    class CustomCrawlListener(CrawlListener):
        def on_start(self, id, browser):
            print('Browser {} is starting to crawl {}\n\n'
                  .format(id, browser.page().mainFrame().requestedUrl()))
            self.timestart = datetime.now()

        def on_loaded(self, id, browser):
            url = str(browser.page().mainFrame().requestedUrl().toString())
            hashed_url = md5(url).hexdigest()
            browser.get_html('{}.html'.format(
                os.path.join(output_dir, 'html', hashed_url)))
            browser.take_screenshot('{}.png'.format(
                os.path.join(output_dir, 'screenshot', hashed_url)))
            browser.get_images('{}.img.log'.format(
                os.path.join(output_dir, 'log', hashed_url)))
            browser.get_stylesheets('{}.css.log'.format(
                os.path.join(output_dir, 'log', hashed_url)))
            browser.get_resources('{}.log'.format(
                os.path.join(output_dir, 'log', hashed_url)))
            print('Browser {} finished crawling {}\n\n'.format(id, url))

        def on_resource_received(self, log, id, *browser):
            print('Browser {} received resource {}\n\n'.format(id, log['url']))

        def on_finished(self, sessions):
            self.timefinish = datetime.now()
            process_time = self.timefinish - self.timestart
            urls = [len(urls) for id, urls, status in sessions]
            print('All Finished\n\n')
            print('{} URIs were crawled in {} sessions in {}\n\n'.format(
                sum(urls), len(urls), process_time))

    app = QApplication([])
    crawler = Crawler(app, CustomCrawlListener())
    crawler.add_blacklist('http://web.archive.org/static')
    crawler.start(url)
    app.exec_()