Created
May 8, 2014 00:03
-
-
Save dodola/74c5d067f0fe68220ead to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
__author__ = 'dodola'
# encoding: utf-8
# NOTE: a PEP 263 coding declaration is only honoured on the first or second
# line of a file, so at this position it is purely informational.
import contextlib
import glob
import json
import mimetypes
import os
import queue
import re
import shutil
import string
import tempfile
import threading
import time
import urllib.request
from time import sleep, ctime
from urllib.error import URLError, HTTPError, ContentTooShortError
from urllib.parse import (
    urlparse, urlsplit, urljoin, unwrap, quote, unquote,
    splittype, splithost, splitport, splituser, splitpasswd,
    splitattr, splitquery, splitvalue, splittag, to_bytes, urlunparse)

from lxml.html import parse
# URL templates for huaban.com; each takes a single ``%s`` substitution.
# NOTE(review): none of these are referenced by the code visible in this
# file -- confirm whether they are still needed.
PINSURL = "http://huaban.com/pins/%s/zoom/"
BROADURL = "http://huaban.com/boards/%s/"
NURL = "http://huaban.com/boards/%s/?huwu7jsv&limit=20000&wfl=1"
DROOTURL = "http://img.hb.aicdn.com/%s"

# Load the MIME type tables once at import time.
mimetypes.init()
def myurlretrieve(url, filename=None, reporthook=None, data=None):
    """Retrieve *url* into a local file and return ``(filename, headers)``.

    Args:
        url: The URL to fetch.
        filename: Destination path. When omitted, a ``file:`` URL is not
            copied at all (its local path is returned) and any other URL is
            written to a named temporary file that is NOT deleted
            automatically.
        reporthook: Optional callable invoked as
            ``reporthook(block_number, bytes_in_block, total_size)``; it is
            called once with ``(0, 0, total_size)`` before the first read and
            then once per block. ``total_size`` is ``-1`` when the response
            carries no Content-Length header.
        data: Optional request body passed to ``urllib.request.Request``.

    Returns:
        A ``(filename, headers)`` tuple, where ``headers`` is the object
        returned by the response's ``info()``.

    Raises:
        ContentTooShortError: Fewer bytes were received than announced by
            Content-Length. The partially written file is left on disk and
            the ``(filename, headers)`` tuple is attached to the exception.
    """
    # urlsplit replaces the deprecated urllib.parse.splittype and yields the
    # path without any authority, query or fragment component.
    parts = urlsplit(url)
    # user_agent = 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'
    # headers = {'User-Agent': user_agent}
    request = urllib.request.Request(url, data)

    with contextlib.closing(urllib.request.urlopen(request)) as response:
        headers = response.info()

        # A local file needs no copy when the caller did not ask for one.
        if parts.scheme == "file" and not filename:
            return os.path.normpath(parts.path), headers

        # Pick the output file: the caller's path or a persistent temp file.
        if filename:
            target = open(filename, 'wb')
        else:
            target = tempfile.NamedTemporaryFile(delete=False)
            filename = target.name

        with target:
            result = filename, headers
            block_size = 1024 * 8
            expected = -1
            received = 0
            block_number = 0

            if "content-length" in headers:
                expected = int(headers["Content-Length"])

            if reporthook:
                reporthook(block_number, 0, expected)

            while True:
                block = response.read(block_size)
                if not block:
                    break
                received += len(block)
                target.write(block)
                block_number += 1
                if reporthook:
                    reporthook(block_number, len(block), expected)

    # Only detectable when the server announced a length up front.
    if expected >= 0 and received < expected:
        raise ContentTooShortError(
            "retrieval incomplete: got only %i out of %i bytes"
            % (received, expected), result)

    return result
def validateTitle(title):
    r"""Return *title* with every ``/ \ : * ? " < > |`` character removed.

    All other characters are kept unchanged and in order.
    NOTE(review): presumably used to turn a title into a safe file name --
    confirm against callers.
    """
    forbidden = r'[/\\:*?"<>|]'  # '/\:*?"<>|'
    return re.sub(forbidden, "", title)
# XPath of the download link on an item page:
#   //div[@class='size13']/a/@href
ROOTURL = "http://www.facets.la/view/%d/"


def download(start=100, stop=365, dest_dir="/home/dodola/facets"):
    """Download the image linked from each item page into *dest_dir*.

    For every id in ``range(start, stop)`` the page ``ROOTURL % id`` is
    parsed, the first ``//div[@class='size13']/a/@href`` link is taken and
    its target is saved as ``<dest_dir>/<id>.jpg``.

    Args:
        start: First item id to fetch (inclusive).
        stop: Item id to stop at (exclusive).
        dest_dir: Directory the images are written to; created if missing.

    Items whose output file already exists are skipped before any request is
    made. Pages without a matching link are skipped as well. Errors raised
    while parsing a page or retrieving a file propagate to the caller.
    """
    os.makedirs(dest_dir, exist_ok=True)

    for item_id in range(start, stop):
        save_path = os.path.join(dest_dir, "%d.jpg" % item_id)
        # Check first so that already-saved items cost no network round trip.
        if os.path.exists(save_path):
            continue

        page = parse(ROOTURL % item_id)
        print("下载第%d个" % item_id)
        links = page.xpath("//div[@class='size13']/a/@href")
        if not links:
            continue

        # Write to a side file and rename on success: a truncated download
        # must never be left under the final name, otherwise the existence
        # check above would skip it on every later run.
        part_path = save_path + ".part"
        try:
            myurlretrieve(links[0], part_path)
        except BaseException:
            with contextlib.suppress(OSError):
                os.remove(part_path)
            raise
        os.replace(part_path, save_path)


if __name__ == "__main__":
    download()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment