Created
June 19, 2017 09:06
-
-
Save mauromarano/be5c9f22731ec8e360becd43ea797c44 to your computer and use it in GitHub Desktop.
Get a screenshot of a webpage with python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# To install PyQt4 on macOS: brew install cartr/qt4/pyqt
# To remove JavaScript and style tags from HTML, see: https://stackoverflow.com/questions/8554035/remove-all-javascript-tags-and-style-tags-from-html-with-python-and-the-lxml-mod
import sys
import time

import lxml
import lxml.html
from lxml.html.clean import Cleaner
from PyQt4.QtCore import *
from PyQt4.QtGui import *
from PyQt4.QtWebKit import *
def clean_js_and_css(url):
    """Parse the page at *url* and return it serialized without JS or CSS.

    The document is parsed with ``lxml.html.parse``, passed through an
    ``lxml.html.clean.Cleaner`` and serialized with ``lxml.html.tostring``.

    :param url: location handed straight to ``lxml.html.parse``.
    :return: the cleaned document as returned by ``lxml.html.tostring``.
    """
    cleaner = Cleaner()
    cleaner.javascript = True  # activate the JavaScript filter
    # The style filter is off by default in Cleaner; without this the
    # function would strip scripts only, despite promising to clean CSS too.
    cleaner.style = True
    document = lxml.html.parse(url)
    return lxml.html.tostring(cleaner.clean_html(document))
def download_screenshot_and_html(url, name):
    """Save a screenshot and the cleaned HTML of the page at *url*.

    Two files are written next to each other: ``<name>.png`` (rendered by
    ``Screenshot.capture``) and ``<name>.html`` (the output of
    ``clean_js_and_css``).

    :param url: address of the page to fetch and render.
    :param name: output path without extension.
    """
    html = clean_js_and_css(url)
    Screenshot().capture(url, name + ".png")
    # The serialized document is a byte string, so write it in binary mode:
    # this avoids newline translation on Windows and a str/bytes mismatch
    # on Python 3.
    with open(name + ".html", 'wb') as html_file:
        html_file.write(html)
class Screenshot(QWebView):
    """A ``QWebView`` that loads a page and renders it to an image file."""

    def __init__(self):
        # Qt allows only one application object per process, so reuse an
        # existing one; this lets Screenshot() be created more than once.
        self.app = QApplication.instance() or QApplication(sys.argv)
        QWebView.__init__(self)
        self._loaded = False
        self.loadFinished.connect(self._loadFinished)

    def capture(self, url, output_file):
        """Load *url*, render the whole page and save it to *output_file*.

        :param url: address of the page to load.
        :param output_file: path of the image file to write.
        """
        self.load(QUrl(url))
        self.wait_load()

        # Grow the viewport to the full content size so the entire page is
        # rendered rather than just the default visible area.
        frame = self.page().mainFrame()
        self.page().setViewportSize(frame.contentsSize())

        # Render the frame into an off-screen image.
        image = QImage(self.page().viewportSize(), QImage.Format_ARGB32)
        # QImage(size, format) starts with uninitialized pixel data; clear
        # it to fully transparent so unpainted areas are not garbage.
        image.fill(0)
        painter = QPainter(image)
        frame.render(painter)
        painter.end()

        # Single-argument form prints the same text on Python 2 and 3.
        print('saving %s' % output_file)
        image.save(output_file)

    def wait_load(self, delay=0):
        """Block until the current page load has finished.

        :param delay: seconds to sleep between event-loop iterations.
        """
        # Pump the Qt event loop by hand until loadFinished has fired.
        while not self._loaded:
            self.app.processEvents()
            time.sleep(delay)
        # Reset the flag so the next load can be waited on as well.
        self._loaded = False

    def _loadFinished(self, result):
        """Slot for ``loadFinished``; *result* is currently not inspected."""
        self._loaded = True
# Example run: writes gazzetta.png and gazzetta.html to the working directory.
download_screenshot_and_html('http://gazzetta.it', 'gazzetta')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment