Download all the images from an e-hentai /g/allery.
#! /usr/bin/env python
# Made by 'Mirko van der Waal'
# Distributed under terms of the MIT license.
try:
    from HTMLParser import HTMLParser
    from urllib2 import urlopen
    from re import search
    from os import getenv, mkdir
    from os.path import join as merge
    from sys import argv, exit
    from uuid import uuid4
    import getopt
except ImportError as e:
    print e
    exit(0)
URL = ''
IMAGE_URLS = []
IMAGES = []
PAGE_URLS = 40
TIMEOUT = 15
PREFIX = 'image-'
FORMAT = '.jpg'
OUTPUT_REDUCTION = 32
SILENT = False
TITLE = str(uuid4())[:6]
HOME_DIR = getenv('HOME')
OUT_DIR = merge(HOME_DIR, 'Pictures')
class HTMLParser(HTMLParser):
    def handle_starttag(self, tag, attrs):
        # For every attribute of the tag, test whether it links to an image page.
        for attr in attrs:
            try:
                # Is the attribute an 'href' and does its value match the /s/
                # pattern? The /s/ path is where the images are stored.
                # Some other paths I know of are:
                # /tag/<tagname>, /uploader/<username>, /g/<id>/<id>
                if search('/s/', attr[1]).group(0):
                    # Write the URL to an array of image pages to open later.
                    IMAGE_URLS.append(attr[1])
            # Most of the time search() returns None because there is no match,
            # so .group() raises; we simply 'pass' because those hrefs are not useful.
            except:
                pass
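# For illustration (hypothetical URL): HTMLParser calls handle_starttag() once
# per opening tag, passing the attributes as (name, value) tuples, so feeding
# markup such as '<a href="http://g.e-hentai.org/s/abc123/456-1">1</a>' would
# append that /s/ link to IMAGE_URLS, while hrefs like /tag/<tagname> or
# /uploader/<username> fail the /s/ match and are skipped.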
# The standard GETOPT module.
try:
    opts, args = getopt.getopt(argv[1:],
        'hu:n:p:x:f:o:t:i:s',
        ['url=', 'name=', 'path=', 'prefix=', 'format=',
         'output-reduction=', 'timeout=', 'image-limit=', 'silent', 'help'])
except Exception as e:
    print e
    exit(0)
for o, a in opts:
    if o in ('-u', '--url'): URL = a
    elif o in ('-n', '--name'): TITLE = a
    elif o in ('-p', '--path'): OUT_DIR = a
    elif o in ('-i', '--image-limit'): PAGE_URLS = int(a)
    elif o in ('-t', '--timeout'): TIMEOUT = int(a)
    elif o in ('-x', '--prefix'): PREFIX = a
    elif o in ('-f', '--format'): FORMAT = a
    elif o in ('-s', '--silent'): SILENT = True
    elif o in ('-o', '--output-reduction'): OUTPUT_REDUCTION = a
    elif o in ('-h', '--help'):
        print """
    -u, --url <value>
        The gallery you want to download; make sure to supply a full URL.
        This is the one and only required parameter.
        [Default: Input]
    -n, --name <value>
        The folder name to extract the content into.
        [Default: A 6-character UUID4 string.]
    -p, --path <value>
        The path to extract the images to.
        [Default: ~/Pictures|$HOME/Pictures]
    -x, --prefix <value>
        The downloaded image prefix.
        [Default: image-<index>]
    -f, --format <value>
        The image format to save as.
        [Default: .jpg]
    -o, --output-reduction <value>
        Reduce the length of the original source URL when printing progress.
        [Default: 32]
    -t, --timeout <value>
        Set the timeout value for when downloading takes too long.
        This may occur with unstable hosts.
        [Default: 15]
    -i, --image-limit <value>
        Do not change this unless you used Hath perks to change the number
        of images per page.
        [Default: 40]
    -s, --silent
        Silence all output while downloading.
        [Default: False]
    -h, --help
        This.
        """
        exit(0)
# Create the HTMLParser.
Parser = HTMLParser()
# Fallback for a missing -u parameter.
if URL == '': URL = raw_input('Gallery: ')
# The final directory where this batch of images is going to be saved.
MERGED = merge(OUT_DIR, TITLE)
# Because bigger galleries span multiple pages, we have to visit every page.
# We first read the total image count and divide it by the images per page;
# that number tells us how many pages exist, and we append ?p=<page> to the
# gallery URL to visit each one.
if not SILENT: print '[..] Obtaining pages'
PAGES = int(search('\d+ @', str(urlopen(URL).read())).group(0).replace(' ', '').replace('@', '')) // PAGE_URLS
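# For illustration: a gallery whose page reports a count like '123 @ ...'
# yields PAGES = 123 // 40 = 3 with the default limit, and the loop below
# then fetches ?p=0 through ?p=3, covering every page of the gallery.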
for i in range(PAGES + 1):
    try:
        Parser.feed(urlopen(URL + '?p=%s' % str(i)).read())
    except Exception as e:
        print e
        exit(0)
if not SILENT: print '[OK] Success'
# We now have the unique /s/ paths, but e-hentai uses an external hosting
# system where users are encouraged to host content in exchange for various
# benefits. This means we have to nest one level deeper to get the source.
if not SILENT: print '[..] Splitting images from their container'
for U in IMAGE_URLS:
    u = search('http:\/\/\d+\.\d+\.\d+\.\d+[^"]+', urlopen(U).read())
    IMAGES.append(u.group(0))
if not SILENT: print '[OK] Success'
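# For illustration (hypothetical markup): each /s/ page embeds the image on a
# host addressed by a raw IP, e.g. <img src="http://12.34.56.78/h/.../1.jpg">,
# and the IP-based regex above captures everything up to the closing quote,
# giving the direct URL that is downloaded below.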
if not SILENT: print '[..] Making directory (%s)' % TITLE
try:
    mkdir(MERGED)
except Exception as e:
    print e
    exit(0)
if not SILENT: print '[OK] Success'
for ind, image in enumerate(IMAGES):
    if not SILENT:
        print '%i/%s\t%s\t%s%s' % (ind + 1, len(IMAGES), IMAGES[ind][:int(OUTPUT_REDUCTION)],
                                   ''.join([MERGED, '/', PREFIX, str(ind + 1)]), FORMAT)
    with open(MERGED + '/' + PREFIX + str(ind + 1) + FORMAT, "wb") as output:
        try:
            output.write(urlopen(image, timeout=int(TIMEOUT)).read())
        except Exception:
            if not SILENT: print 'An unstable host caused a timeout (%i/%s).' % (ind + 1, len(IMAGES))
if not SILENT: print '[OK] Finished successfully'
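# Example usage (a sketch; the filename is hypothetical and the script targets
# Python 2, given urllib2/HTMLParser and the print statements):
#
#   python2 gallery.py -u 'http://g.e-hentai.org/g/<id>/<id>/' -n holiday -t 30
#
# This saves every image of the gallery into ~/Pictures/holiday with a
# 30-second per-image timeout; omit -u to be prompted for the gallery URL.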