Created
February 1, 2016 04:59
-
-
Save vitorio/66b3fbd9930aeb2562e5 to your computer and use it in GitHub Desktop.
(2011) wget + moz-headless-screenshot to take screenshots of archived URLs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Assumptions:
#   moz-headless-screenshot in ./bin
#   wget 1.12 in system path
# Ubuntu 10.10 defaults to Python 2.6.6 so we provide 2.7's subprocess module ourselves
# I guess we could also install 2.7
import optparse, urlparse, tempfile, subprocess271 as subprocess, os.path
import re
import sys

# Usage: pass the URL to archive as the first positional argument.
# The script mirrors the page (plus its requisites) into a fresh temp
# directory with wget, renders the saved page to a PNG with
# moz-headless-screenshot, and prints two lines on stdout: the saved page
# name as reported by wget, then the path of the PNG.
# Written for Python 2 (urlparse module, str output from subprocess).

# Pulls the saved file name out of a `wget --no-verbose` report line, e.g.
#   2011-03-30 07:23:52 URL:http://vi.to/ [8924/8924] -> "vi.to/index.html" [1]
# Anchoring on the trailing `-> "name" [N]` keeps names that contain spaces
# intact, which a whitespace split would truncate.
WGET_SAVED_LINE = re.compile(r' -> "(.+)" \[\d+\]\s*$')

# subprocess reports a child killed by signal N as return code -N;
# 11 is SIGSEGV.
SEGFAULT_RETURNCODE = -11

parser = optparse.OptionParser()
options, args = parser.parse_args()

if args and args[0]:
    url = urlparse.urlparse(args[0]).geturl()

    if url:
        useragent = 'Mozilla/5.0 (Windows; U; Windows NT 5.2; rv:1.9.2) Gecko/20100101 Firefox/3.6'
        downloaddir = tempfile.mkdtemp()

        # The argument list is handed to wget without a shell, so the user
        # agent must NOT be wrapped in quotes: they would be sent literally
        # as part of the User-Agent header.
        output = subprocess.check_output(['wget',
                                          '--page-requisites',
                                          '--span-hosts',
                                          '--convert-links',
                                          '--wait=1',
                                          '--random-wait',
                                          '--force-directories',
                                          '--adjust-extension',
                                          '--no-verbose',
                                          '--execute', 'robots=off',
                                          '--user-agent=%s' % useragent,
                                          '--restrict-file-names=ascii',
                                          '--directory-prefix=%s' % downloaddir,
                                          url],
                                         stderr=subprocess.STDOUT)

        # The first "saved" line wget reports is taken to be the requested
        # page itself; lines that are not save reports are skipped instead
        # of being mis-parsed.
        indexfile = None
        for line in output.splitlines():
            match = WGET_SAVED_LINE.search(line)
            if match:
                indexfile = match.group(1)
                break
        if indexfile is None:
            sys.exit('wget did not report a saved file for %s:\n%s' % (url, output))

        # os.path.join returns indexfile unchanged if wget already reported
        # an absolute path.
        savedpage = os.path.join(downloaddir, indexfile)

        # mkstemp returns (open file descriptor, path); only the path is
        # needed, so the descriptor is closed straight away.
        thumbnail = tempfile.mkstemp('.png')
        os.close(thumbnail[0])

        # moz-headless-screenshot segfaults unless you run it from within bin
        previousdir = os.getcwd()
        os.chdir('bin')
        try:
            # check_output captures the tool's stdout so it cannot pollute
            # the two lines this script prints.
            subprocess.check_output(['./moz-headless-screenshot',
                                     savedpage,
                                     '1024',
                                     '768',
                                     thumbnail[1]])
        except subprocess.CalledProcessError as e:
            # The original code singled out a SIGSEGV exit as ignorable
            # (presumably the tool can crash after the image is written --
            # TODO confirm). Any other failure is now reported on stderr
            # instead of being silently dropped.
            if e.returncode != SEGFAULT_RETURNCODE:
                sys.stderr.write('moz-headless-screenshot exited with status %d\n'
                                 % e.returncode)
        finally:
            # Always restore the working directory, even on unexpected errors.
            os.chdir(previousdir)

        print(indexfile)
        print(thumbnail[1])

        # TODO: the temp download dir and the PNG are never removed; delete
        # them (e.g. with shutil) once the caller has consumed them.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment