phantomjs crawl.js [URI] [OutDir]
python crawl.py [URI] [OutDir]
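For example, using a hypothetical URI and output directory (the Python crawler creates the output subdirectories itself via make_directory_recursive; the PhantomJS script writes into the screenshot/, html/, and log/ subdirectories of OutDir, which may need to be created beforehand):

phantomjs crawl.js http://example.com/ ./output
python crawl.py http://example.com/ ./output

Both crawlers name their output files after the MD5 hash of the URI: a screenshot (screenshot/<md5>.png), the rendered HTML source (html/<md5>.html), and JSON resource logs (log/<md5>.log, log/<md5>.img.log, log/<md5>.css.log).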
var fs = require('fs');
var page = require('webpage').create();
var networkResources = {};
// Import md5 from CryptoJS to hash the URI
phantom.injectJs('md5.js');
// Import underscore.js for array/collection helpers
phantom.injectJs('underscore.js');
// Set start time
var starttime = Date.now();
// If the number of arguments after crawl.js is not 2, show the usage message and exit PhantomJS
if (phantom.args.length != 2) {
    console.log('Usage: phantomjs crawl.js <URI> <outputDir>');
    phantom.exit(1);
}
// Otherwise, continue opening the URI
else {
    // Use the 1st param after crawl.js as the URL input and the 2nd param as the output directory
    var url = phantom.args[0];
    var outputDir = phantom.args[1];
    // Set the timeout on fetching resources to 10 seconds (can be changed)
    page.settings.resourceTimeout = 10000;
    page.onResourceTimeout = function(e) {
        console.log('Resource', e.url, 'timeout.', e.errorCode, e.errorString);
    };
    // Use browser size 1024x768 (also used for the screenshot)
    page.viewportSize = { width: 1024, height: 768 };
    // Resources are similar to what is listed in developer tools -> network tab -> refresh
    page.onResourceReceived = function (res) {
        var resUrl = res.url;
        console.log('Resource received', resUrl);
        // Save all network resources to a variable.
        // Responses are sometimes duplicated, so only add an entry if the URL
        // is not already a key of networkResources (checked with underscore.js).
        var headers = {};
        res.headers.forEach(function(header) {
            headers[header['name']] = header['value'];
        });
        var resource = {
            'url' : resUrl,
            'status_code' : res.status,
            'content_type' : res.contentType,
            'headers' : headers,
        };
        var networkResourcesKeys = Object.keys(networkResources);
        if (! _.contains(networkResourcesKeys, resUrl)) {
            networkResources[resUrl] = resource;
        }
    };
    // Open the URI
    page.open(url, function (status) {
        if (status !== 'success') {
            console.log('Unable to load the url', url);
            phantom.exit(1);
        } else {
            // After the page is opened, process it.
            // Use setTimeout to delay processing; the timeout is in milliseconds (here 200 ms).
            window.setTimeout(function () {
                processPage(url, outputDir);
                // Set finish time
                var finishtime = Date.now();
                // Show a message that the crawl finished and report the execution time
                console.log('Crawl finished in', (finishtime - starttime), 'milliseconds');
                phantom.exit();
            }, 200);
        }
    });
}
function processPage(url, outputDir) {
    var hashedUrl = md5(url);
    // Save screenshot
    var screenshotFile = outputDir + '/screenshot/' + hashedUrl + '.png';
    page.render(screenshotFile);
    console.log('Screenshot is saved in', screenshotFile);
    // Save HTML using fs.write
    // DOM selection or modification must always be done inside page.evaluate
    var htmlFile = outputDir + '/html/' + hashedUrl + '.html';
    var html = page.evaluate(function() {
        return document.body.parentElement.outerHTML;
    });
    fs.write(htmlFile, html, "w");
    console.log('HTML source of page is saved in', htmlFile);
    // Save all network resources, one JSON object per line.
    // Duplicates were already filtered out in onResourceReceived
    // (cf. http://stackoverflow.com/questions/1960473/unique-values-in-an-array)
    var resourceFile = outputDir + '/log/' + hashedUrl + '.log';
    var networkResourcesKeys = Object.keys(networkResources);
    for (var r = 0; r < networkResourcesKeys.length; r++) {
        var value = networkResources[networkResourcesKeys[r]];
        fs.write(resourceFile, JSON.stringify(value) + '\n', "a");
    }
    console.log('Network resources are saved in', resourceFile);
    processImages(url, outputDir);
    processCsses(url, outputDir);
}
function processImages(url, outputDir) {
    var hashedUrl = md5(url);
    // Get images using document.images
    // (document.images can also be executed in the browser console)
    var images = page.evaluate(function () {
        var documentImages = document.images;
        var allImages = [];
        for (var i = 0; i < documentImages.length; i++) {
            var docImage = documentImages[i];
            // Calculate the top-left position of the image
            var obj = docImage;
            var curleft = 0, curtop = 0;
            if (obj.offsetParent) {
                do {
                    curleft += obj.offsetLeft;
                    curtop += obj.offsetTop;
                } while (obj = obj.offsetParent);
            }
            // Create a JSON object containing the URL and rectangle
            var jsonImage = {
                'url' : docImage['src'],
                'rectangle' : {
                    'width' : docImage['width'],
                    'height' : docImage['height'],
                    'top' : curtop,
                    'left' : curleft,
                },
            };
            // Append to all images
            allImages.push(jsonImage);
        }
        return allImages;
    });
    // If an image URL matches a resource URL, attach its rectangle to that resource
    var networkImages = [];
    var networkResourcesKeys = Object.keys(networkResources);
    for (var i = 0; i < images.length; i++) {
        var image = images[i];
        var idx = _.indexOf(networkResourcesKeys, image['url']);
        if (idx >= 0) {
            var networkImage = networkResources[networkResourcesKeys[idx]];
            if ('rectangles' in networkImage) {
                networkImage['rectangles'].push(image['rectangle']);
            } else {
                networkImage['rectangles'] = [image['rectangle']];
            }
            networkImages.push(networkImage);
        }
    }
    // Save all image resources, one JSON object per line
    var resourceImageFile = outputDir + '/log/' + hashedUrl + '.img.log';
    for (var r = 0; r < networkImages.length; r++) {
        fs.write(resourceImageFile, JSON.stringify(networkImages[r]) + '\n', "a");
    }
    console.log('Network resource images are saved in', resourceImageFile);
}
function processCsses(url, outputDir) {
    var hashedUrl = md5(url);
    var csses = page.evaluate(function () {
        // Get all stylesheets (document.styleSheets can also be run in the browser console)
        var documentCsses = document.styleSheets;
        var allCsses = [];
        for (var c = 0; c < documentCsses.length; c++) {
            var docCss = documentCsses[c];
            // For each stylesheet, get its rules
            var rules = docCss.cssRules || [];
            // For each rule, get its selectorText
            var rules_tag = [];
            for (var r = 0; r < rules.length; r++) {
                var rule = rules[r].selectorText;
                rules_tag.push(rule);
            }
            // Create a JSON object containing the URL and the rules
            var jsonCss = {
                'url' : docCss['href'] || '[INTERNAL]',
                'rules_tag' : rules_tag,
            };
            allCsses.push(jsonCss);
        }
        return allCsses;
    });
    // If a stylesheet URL matches a resource URL, merge the network resource info into it
    var networkCsses = [];
    var networkResourcesKeys = Object.keys(networkResources);
    for (var i = 0; i < csses.length; i++) {
        var css = csses[i];
        var idx = _.indexOf(networkResourcesKeys, css['url']);
        if (idx >= 0) {
            var networkCss = networkResources[networkResourcesKeys[idx]];
            css = _.extend(css, networkCss);
        }
        if ('rules_tag' in css) {
            // Sum the importance of every selector in this stylesheet
            var importance = 0;
            for (var r = 0; r < css['rules_tag'].length; r++) {
                var rule = css['rules_tag'][r];
                importance += calculateImportance(rule);
            }
            css['importance'] = importance;
            networkCsses.push(css);
        }
    }
    // Save all stylesheet resources, one JSON object per line
    var resourceCssFile = outputDir + '/log/' + hashedUrl + '.css.log';
    for (var r = 0; r < networkCsses.length; r++) {
        fs.write(resourceCssFile, JSON.stringify(networkCsses[r]) + '\n', "a");
    }
    console.log('Network resource stylesheets are saved in', resourceCssFile);
}
function calculateImportance(rule) {
    // Estimate how "important" a CSS selector is by (approximately) counting
    // how many elements on the page it matches.
    var importance = 0;
    if (rule == undefined) {
        // Skip rules without selectorText (e.g. @media or @font-face rules)
    } else if (rule.match(/^\..*/i)) {
        // Class selector, e.g. ".menu"
        importance += page.evaluate(getNumElementsByClass, rule);
    } else if (rule.match(/^#.*/i)) {
        // ID selector, e.g. "#header" or "#header .title"
        var theArr = rule.split('#');
        var theArr2 = theArr[1].split(' ');
        var theGuy = theArr2[0];
        importance += page.evaluate(getNumElementByID, theGuy);
    } else if (rule.match(/.*#.*/i)) {
        // Selector containing an ID elsewhere in the rule
        importance += page.evaluate(getNumElementByID, rule);
    } else if (rule.match(/[a-zA-Z]*\..*/g)) {
        // Tag plus class selector, e.g. "div.item"
        var theArr = rule.split('.');
        importance += page.evaluate(getNumElementsByTagAndClass, theArr[0], theArr[1]);
    } else if (!(rule.match(/\./ig))) {
        // Plain tag selector, e.g. "p"
        importance += page.evaluate(getNumElementsByTag, rule);
    } else {
        // Anything else is ignored
    }
    return importance;
}
function getNumElementsByClass(className) {
    var counter = 0;
    var elems = document.getElementsByTagName('*');
    for (var i = 0; i < elems.length; i++) {
        if ((' ' + elems[i].className + ' ').indexOf(' ' + className + ' ') > -1) {
            counter++;
        }
    }
    return counter;
}
function getNumElementByID(id) {
    var theThing = document.getElementById(id);
    if (theThing == null)
        return 0;
    return 1;
}
function getNumElementsByTagAndClass(tagName, className) {
    var counter = 0;
    var elems = document.getElementsByTagName(tagName);
    for (var i = 0; i < elems.length; i++) {
        if ((' ' + elems[i].className + ' ').indexOf(' ' + className + ' ') > -1) {
            counter++;
        }
    }
    return counter;
}
function getNumElementsByTag(tagName) {
    return document.getElementsByTagName(tagName).length;
}
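Each of the log files written above contains one JSON object per line. An illustrative (not captured) entry from log/<md5>.log, built from the fields collected in onResourceReceived, would look like:

{"url": "http://example.com/style.css", "status_code": 200, "content_type": "text/css", "headers": {"Content-Type": "text/css"}}

The .img.log entries add a rectangles list from processImages, and the .css.log entries add rules_tag and importance from processCsses. The Python crawler below produces the same kind of per-URI logs, with a few extra fields (error_code, is_local, is_blocked) recorded by its network access manager.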
import errno
import json
import os
from datetime import datetime
from functools import partial
from urlparse import urlparse

from PyQt4.QtCore import QObject, SIGNAL, QUrl, QVariant
from PyQt4.QtGui import QImage, QPainter
from PyQt4.QtNetwork import QNetworkAccessManager, QNetworkRequest
from PyQt4.QtWebKit import QWebView, QWebPage, QWebSettings

'''
Section 'Crawl' ==============================================================
'''
def variant_to_json(variant):
    if variant.type() == QVariant.Map:
        obj = {}
        for k, v in variant.toMap().items():
            obj[unicode(k)] = variant_to_json(v)
        return obj
    if variant.type() == QVariant.List:
        lst = []
        for v in variant.toList():
            lst.append(variant_to_json(v))
        return lst
    if variant.type() == QVariant.String:
        return str(variant.toString())
    if variant.type() == QVariant.Int:
        return int(variant.toString())
    if variant.type() == QVariant.Double:
        return float(variant.toString())
    if variant.type() == QVariant.Bool:
        return bool(variant.toBool())
    return unicode(variant.toString())

class CrawlNetwork(QNetworkAccessManager):
    contentTypeHeader = QNetworkRequest.ContentTypeHeader

    def __init__(self, web, logger):
        QNetworkAccessManager.__init__(self)
        QObject.connect(self, SIGNAL("finished(QNetworkReply *)"),
                        self.finished)
        self.web = web
        self.logger = logger

    def finished(self, response):
        url = unicode(response.request().url().toString())
        base_url = unicode(self.web.page().mainFrame().baseUrl().toString())
        blocked = False
        for bl in self.web.blacklists:
            in_bl = str(bl) in str(url)
            blocked = blocked or in_bl
        headers = {}
        for header in response.rawHeaderList():
            headers[unicode(header.data())] = response.rawHeader(header).data()
        url_domain = '{uri.scheme}://{uri.netloc}/'.format(uri=urlparse(url))
        base_url_domain = '{uri.scheme}://{uri.netloc}/'.format(
            uri=urlparse(base_url))
        resource = {
            'url' : url,
            'content_type' : unicode(response.header(
                self.contentTypeHeader).toString()),
            'headers' : headers,
            'error_code' : response.error(),
            'is_local' : url_domain == base_url_domain,
            'is_blocked' : blocked
        }
        self.logger(resource)
class CrawlBrowser(QWebView):
    def __init__(self):
        QWebView.__init__(self)
        self.settings().setAttribute(QWebSettings.JavascriptEnabled, True)
        # keep the received resources per browser instance
        self.resources = []

    def get_resources(self, output_file=None):
        if output_file:
            # create directory of file
            self.make_directory_recursive(output_file)
            with open(output_file, 'wb') as f:
                print 'saving resources log to', output_file
                f.write('\n'.join([json.dumps(stat) for stat in
                                   self.resources]))
        return self.resources

    def make_directory_recursive(self, path):
        dir_name = os.path.dirname(os.path.realpath(path))
        try:
            os.makedirs(dir_name)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise

    def take_screenshot(self, output_file):
        # set the viewport to the full webpage size
        self.page().setViewportSize(self.page().mainFrame().contentsSize())
        # render image
        image = QImage(self.page().viewportSize(), QImage.Format_ARGB32)
        painter = QPainter(image)
        self.page().mainFrame().render(painter)
        painter.end()
        # create directory of file
        self.make_directory_recursive(output_file)
        # save image
        print 'saving screenshot to', output_file
        image.save(output_file)

    def get_html(self, output_file=None):
        html = self.page().mainFrame().toHtml()
        html = unicode(html).encode('utf-8')
        if output_file:
            # create directory of file
            self.make_directory_recursive(output_file)
            print 'saving html source to', output_file
            with open(output_file, 'wb') as f:
                f.write(html)
        return html

    def get_images(self, output_file=None):
        imgs = {}
        res_urls = [res['url'] for res in self.resources]
        document = self.page().mainFrame().documentElement()
        for img in document.findAll('img'):
            for idx, res_url in enumerate(res_urls):
                if res_url.endswith(str(img.attribute('src'))):
                    if res_url not in imgs:
                        imgs[res_url] = self.resources[idx]
                    imgs[res_url].setdefault('rectangles', [])
                    imgs[res_url]['rectangles'].append({
                        'left' : img.geometry().x(),
                        'top' : img.geometry().y(),
                        'width' : img.geometry().width(),
                        'height' : img.geometry().height()
                    })
        if output_file:
            # create directory of file
            self.make_directory_recursive(output_file)
            with open(output_file, 'wb') as f:
                print 'saving images log to', output_file
                f.write('\n'.join([json.dumps(img) for url, img in
                                   imgs.items()]))
        return imgs.values()
    def get_stylesheets(self, output_file=None):
        jsFn = """
            function getNumElementsByClass(className) {
                var counter = 0;
                var elems = document.getElementsByTagName('*');
                for (var i = 0; i < elems.length; i++) {
                    if((' ' + elems[i].className + ' ').indexOf(' ' + className + ' ') > -1) {
                        counter++;
                    }
                }
                return counter;
            }
            function getNumElementByID(id) {
                var theThing = document.getElementById(id);
                if(theThing == null)
                    return 0;
                return 1;
            }
            function getNumElementsByTagAndClass(tagName, className) {
                var counter = 0;
                var elems = document.getElementsByTagName(tagName);
                for (var i = 0; i < elems.length; i++) {
                    if((' ' + elems[i].className + ' ').indexOf(' ' + className + ' ') > -1) {
                        counter++;
                    }
                }
                return counter;
            }
            function getNumElementsByTag(tagName) {
                return document.getElementsByTagName(tagName).length;
            }
            function cssRules() {
                var styles = document.styleSheets;
                var allRules = {};
                for(var s=0; s<styles.length; s++) {
                    var style = styles[s];
                    var rules = [];
                    var selectors = [];
                    if(style.cssRules) {
                        rules = style.cssRules;
                    } else if(style.rules) {
                        rules = style.rules;
                    }
                    for(var i=0; i < rules.length; i++) {
                        selectors.push(rules[i].selectorText);
                    }
                    if(style.href != null)
                        allRules[style.href] = selectors;
                    else allRules['[INTERNAL]'] = selectors;
                }
                var allRulesImportance = {};
                for(var url in allRules) {
                    var props = allRules[url];
                    var importance = 0;
                    for(var p=0; p<props.length; p++) {
                        var prop = props[p];
                        if(prop == undefined) {
                            continue;
                        } else if(prop.match(/^\..*/i)) {
                            importance += getNumElementsByClass(prop);
                        } else if(prop.match(/^#.*/i)) {
                            var theArr = prop.split('#');
                            var theArr2 = theArr[1].split(' ');
                            var theGuy = theArr2[0];
                            importance += getNumElementByID(theGuy);
                        } else if(prop.match(/.*#.*/i)) {
                            importance += getNumElementByID(prop);
                        } else if(prop.match(/[a-zA-Z]*\..*/g)) {
                            var theArr = prop.split('.');
                            importance += getNumElementsByTagAndClass(theArr[0], theArr[1]);
                        } else if(!(prop.match(/\./ig))) {
                            importance += getNumElementsByTag(prop);
                        } else {
                        }
                    }
                    allRulesImportance[url] = {
                        //'src' : url,
                        'rules_tag' : props,
                        'importance' : importance
                    };
                    if(url=='[INTERNAL]')
                        allRulesImportance[url]['url'] = url;
                }
                return allRulesImportance;
            }
            cssRules();
        """
        res_urls = [res['url'] for res in self.resources]
        stylesheets_importance = self.page().mainFrame().evaluateJavaScript(jsFn)
        stylesheets_importance = variant_to_json(stylesheets_importance)
        if output_file:
            # create directory of file
            self.make_directory_recursive(output_file)
            print 'saving stylesheets log to', output_file
            with open(output_file, 'wb') as f:
                for url, css in stylesheets_importance.items():
                    if '[INTERNAL]' in url:
                        f.write(json.dumps(css) + '\n')
                    csses = []
                    for idx, res_url in enumerate(res_urls):
                        if res_url.endswith(url):
                            css.update(self.resources[idx])
                            csses.append(css)
                    # one JSON object per line, matching the other log files
                    for matched_css in csses:
                        f.write(json.dumps(matched_css) + '\n')
        return stylesheets_importance.values()
class CrawlListener:
    def on_start(self, id, browser):
        pass

    def on_loaded(self, id, browser):
        pass

    def on_resource_received(self, log, id, *browser):
        pass

    def on_finished(self, sessions):
        pass

class Crawler(object):
    sessions = dict()
    browsers = dict()
    blacklists = []

    def __init__(self, app, listener=CrawlListener()):
        self.app = app
        self.listener = listener

    def set_blacklists(self, blacklists):
        self.blacklists = blacklists

    def add_blacklist(self, blacklist):
        self.blacklists.append(blacklist)

    def browser_started(self, id, sess_id):
        browser, url, status = self.browsers[id]
        self.listener.on_start(id, browser)

    def browser_loaded(self, id, sess_id, is_loaded):
        if is_loaded:
            browser, url, status = self.browsers[id]
            if not status:
                self.browsers[id] = (browser, url, True)
                self.listener.on_loaded(id, browser)
            if all([status for browser, url, status in self.browsers.values()]):
                print('Session {} finished'.format(sess_id))
                urls = [url for browser, url, status in self.browsers.values()]
                self.sessions[sess_id] = (sess_id, urls, True)
                if all([status for id, urls, status in self.sessions.values()]):
                    print('All sessions finished')
                    self.listener.on_finished(self.sessions.values())
                    self.app.quit()
                try:
                    self.start_session(self.sessions[sess_id + 1])
                except KeyError:
                    # no further sessions to start
                    pass

    def browser_resources_received(self, browser_id, session_id, log):
        self.browsers[browser_id][0].resources.append(log)
        browser, url, status = self.browsers[browser_id]
        self.listener.on_resource_received(log, browser_id, browser)
        # if not log['is_blocked']:
        #     self.resources.append(log)

    def start_session(self, session):
        sess_id, urls, is_processed = session
        print('Session {} started'.format(sess_id))
        for id, url in enumerate(urls):
            browser = CrawlBrowser()
            browser.blacklists = self.blacklists
            browser.setPage(QWebPage())
            network_fn = partial(self.browser_resources_received, id, sess_id)
            browser.page().setNetworkAccessManager(
                CrawlNetwork(browser, network_fn))
            started_fn = partial(self.browser_started, id, sess_id)
            browser.loadStarted.connect(started_fn)
            loaded_fn = partial(self.browser_loaded, id, sess_id)
            browser.loadFinished.connect(loaded_fn)
            self.browsers[id] = (browser, url, False)
        for id, (browser, url, status) in self.browsers.items():
            browser.load(QUrl(url))

    def start(self, urls):
        if isinstance(urls, basestring):
            urls = [urls, ]
        urls = list(set(urls))
        sessions_urls = [urls[x:x + 10] for x in xrange(0, len(urls), 10)]
        for id, session_urls in enumerate(sessions_urls):
            self.sessions[id] = (id, session_urls, False)
        print('Processing {} urls in {} sessions'.format(len(urls),
                                                         len(self.sessions)))
        self.start_session(self.sessions[0])

'''
Section 'Damage' =============================================================
'''
class SiteDamage:
    image_importance = 0

    def __init__(self):
        pass

    def calculate_images_importance(self, str_images_log, page_size=(0, 0),
                                    size_weight=0.5, centrality_weight=0.5):
        page_width, page_height = page_size
        for log in str_images_log:
            log = json.loads(log)
            for image_rect in log['rectangles']:
                location_importance = 0
                # Based on measureMemento.pl line 703
                if image_rect['left'] + image_rect['width'] > page_width / 2 \
                        and image_rect['left'] < page_width / 2:
                    location_importance += centrality_weight / 2
                # Based on measureMemento.pl line 715
                if image_rect['top'] + image_rect['height'] > page_height / 2 \
                        and image_rect['top'] < page_height / 2:
                    location_importance += centrality_weight / 2
                # Based on measureMemento.pl line 729
                # (use float() so the proportion is not truncated by
                #  integer division under Python 2)
                img_size_to_page_size_prop = float(image_rect['width'] *
                                                   image_rect['height']) / \
                                             (page_width * page_height)
                size_importance = img_size_to_page_size_prop * size_weight
                self.image_importance = size_importance + location_importance
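# Worked example of calculate_images_importance (illustrative numbers, not
# taken from a real crawl): with page_size=(1024, 768) and the default
# weights size_weight=0.5 and centrality_weight=0.5, an image rectangle of
# width=200, height=100 placed at left=400, top=300 straddles both the
# horizontal centre (400 < 512 < 400+200) and the vertical centre
# (300 < 384 < 300+100), so location_importance = 0.25 + 0.25 = 0.5.
# Its area proportion is (200*100)/(1024*768) ~= 0.0254, giving
# size_importance ~= 0.0127 and an image_importance of roughly 0.513.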
if __name__ == "__main__":
    import sys
    from hashlib import md5
    from PyQt4.QtGui import QApplication

    if len(sys.argv) != 3:
        print('Usage :')
        print('python crawl.py <url> <output_dir>')
        sys.exit(1)

    url = sys.argv[1]
    output_dir = sys.argv[2]

    class CustomCrawlListener(CrawlListener):
        def on_start(self, id, browser):
            print('Browser {} is starting to crawl {}\n\n'
                  .format(id, browser.page().mainFrame().requestedUrl()))
            self.timestart = datetime.now()

        def on_loaded(self, id, browser):
            url = str(browser.page().mainFrame().requestedUrl().toString())
            hashed_url = md5(url).hexdigest()
            browser.get_html('{}.html'.format(
                os.path.join(output_dir, 'html', hashed_url)))
            browser.take_screenshot('{}.png'.format(
                os.path.join(output_dir, 'screenshot', hashed_url)))
            browser.get_images('{}.img.log'.format(
                os.path.join(output_dir, 'log', hashed_url)))
            browser.get_stylesheets('{}.css.log'.format(
                os.path.join(output_dir, 'log', hashed_url)))
            browser.get_resources('{}.log'.format(
                os.path.join(output_dir, 'log', hashed_url)))
            print('Browser {} has finished crawling {}\n\n'.format(id, url))

        def on_resource_received(self, log, id, *browser):
            print('Browser {} received resource {}\n\n'.format(id, log['url']))

        def on_finished(self, sessions):
            self.timefinish = datetime.now()
            process_time = (self.timefinish -
                            self.timestart).total_seconds() * 1000
            urls = [len(urls) for id, urls, status in sessions]
            print('All Finished\n\n')
            print('{} URIs were crawled in {} sessions in {} '
                  'milliseconds\n\n'.format(sum(urls), len(urls),
                                            process_time))

    app = QApplication([])
    crawler = Crawler(app, CustomCrawlListener())
    crawler.add_blacklist('http://web.archive.org/static')
    crawler.start(url)
    app.exec_()
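The Crawler class can also be driven programmatically instead of through the command line above. A minimal sketch, assuming crawl.py is importable as a module named crawl, PyQt4 is installed, and the output paths, listener name, and URLs below are placeholders:

import os
from hashlib import md5

from PyQt4.QtGui import QApplication

from crawl import Crawler, CrawlListener

class SnapshotListener(CrawlListener):
    # Save only the rendered HTML and a screenshot for each finished page.
    def on_loaded(self, id, browser):
        url = str(browser.page().mainFrame().requestedUrl().toString())
        hashed_url = md5(url).hexdigest()
        browser.get_html(os.path.join('output', 'html', hashed_url + '.html'))
        browser.take_screenshot(os.path.join('output', 'screenshot', hashed_url + '.png'))

if __name__ == '__main__':
    app = QApplication([])
    crawler = Crawler(app, SnapshotListener())
    crawler.add_blacklist('http://web.archive.org/static')
    # start() accepts a single URI or a list of URIs; lists are crawled in
    # sessions of up to ten browsers at a time, and the application quits
    # once every session has finished.
    crawler.start(['http://example.com/', 'http://example.org/'])
    app.exec_()

The blacklist entry mirrors the one used in crawl.py's __main__ block; any substring added via add_blacklist flags matching resource URLs as is_blocked in the captured resource metadata.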