dodola · May 8, 2014 00:00
diff --git a/huaban.py b/huaban.py
 __author__ = 'dodola'
 #encoding: utf-8
 from time import sleep, ctime
 import time
 import urllib.request
 import threading
 import contextlib
 import queue
 import string
 import shutil
 import os
 import mimetypes
 import tempfile
 import json
 import glob
 from urllib.error import URLError, HTTPError, ContentTooShortError
 import re
 from urllib.parse import (
    urlparse, urlsplit, urljoin, unwrap, quote, unquote,
    splittype, splithost, splitport, splituser, splitpasswd,
    splitattr, splitquery, splitvalue, splittag, to_bytes, urlunparse)

 # URL templates for huaban.com; each "%s" is filled in with the % operator.
 # PINSURL takes a pin id (judging by its path); nothing visible here uses it.
 PINSURL = "http://huaban.com/pins/%s/zoom/"
 # BROADURL is only referenced from commented-out code in donwloadBroad.
 BROADURL="http://huaban.com/boards/%s/"
 # Board endpoint requested by donwloadBroad with XHR/JSON headers; "%s" is the
 # board id. "limit=20000" presumably caps the number of pins - TODO confirm.
 NURL="http://huaban.com/boards/%s/?huwu7jsv&limit=20000&wfl=1"
 # User endpoint requested by downloadUser; "%s" is the value passed as `id`.
 UURL="http://huaban.com/%s/?huwzcasa&limit=1000&wfl=1"
 # Image URL template; donwloadBroad fills it with pin["file"]["key"].
 DROOTURL="http://img.hb.aicdn.com/%s"
 # Initialise the mimetypes tables once; donwloadBroad later calls
 # mimetypes.guess_extension.
 mimetypes.init()


 def myurlretrieve(url, filename=None, reporthook=None, data=None):
     """Download *url* to a local file and return ``(filename, headers)``.

     Args:
         url: URL to fetch.
         filename: Destination path. When omitted, a named temporary file is
             created with ``delete=False`` and its path is returned.
         reporthook: Optional callable invoked as
             ``reporthook(blocknum, blocksize, totalsize)``: once with
             ``(0, 0, totalsize)`` before the first read, then after every
             block written with the number of bytes in that block.
             ``totalsize`` is -1 when there is no Content-Length header.
         data: Optional request body handed to ``urllib.request.Request``.

     Returns:
         A ``(filename, headers)`` tuple. For a ``file:`` URL without
         *filename*, the normalised local path is returned and nothing is
         copied.

     Raises:
         ContentTooShortError: Fewer bytes were received than the
             Content-Length header announced.
     """
     # urlsplit supersedes the deprecated urllib.parse.splittype helper.
     parts = urlsplit(url)
     req = urllib.request.Request(url, data)

     with contextlib.closing(urllib.request.urlopen(req)) as fp:
         headers = fp.info()

         # A local file needs no copy when the caller did not ask for one.
         if parts.scheme == "file" and not filename:
             return os.path.normpath(parts.path), headers

         if filename:
             tfp = open(filename, 'wb')
         else:
             # delete=False: the temporary file has to outlive this function.
             tfp = tempfile.NamedTemporaryFile(delete=False)
             filename = tfp.name

         with tfp:
             result = filename, headers
             bs = 1024 * 8
             size = -1
             read = 0
             blocknum = 0
             if "content-length" in headers:
                 size = int(headers["Content-Length"])

             if reporthook:
                 reporthook(blocknum, 0, size)

             while True:
                 block = fp.read(bs)
                 if not block:
                     break
                 read += len(block)
                 tfp.write(block)
                 blocknum += 1
                 if reporthook:
                     reporthook(blocknum, len(block), size)

     # Checked only after both files are closed, so the partial download is
     # fully flushed when the error reaches the caller.
     if size >= 0 and read < size:
         raise ContentTooShortError(
             "retrieval incomplete: got only %i out of %i bytes"
             % (read, size), result)

     return result


 def validateTitle(title):
     """Return *title* with the characters ``/ \\ : * ? " < > |`` removed.

     The result is used by donwloadBroad as a directory name.
     """
     forbidden = r"[\/\\\:\*\?\"\<\>\|]"  # '/\:*?"<>|'
     return re.sub(forbidden, "", title)


 def donwloadBroad(bid, savePath):
     """Download every pin image of board *bid* below *savePath*.

     The board is requested from ``NURL`` with XHR/JSON headers and parsed as
     JSON. Each pin is stored as
     ``<savePath>/<sanitised board title>/<pin_id><extension>``; a file that
     already exists is skipped.

     Args:
         bid: Board id substituted into ``NURL``.
         savePath: Directory under which the board folder is created.
     """
     user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1870.2 Safari/537.36'
     headers = {
         'User-Agent': user_agent,
         "Content-Type": "application/json;charset=UTF-8",
         "X-Requested-With": "XMLHttpRequest",
         "X-Request": "JSON",
     }
     req = urllib.request.Request(NURL % (bid), headers=headers)
     # Close the HTTP response deterministically instead of leaking it.
     with contextlib.closing(urllib.request.urlopen(req)) as res:
         jsonstr = str(res.read(), "utf8")
     print(jsonstr)

     board = json.loads(jsonstr)["board"]
     savePath = os.path.join(savePath, validateTitle(board["title"]))
     # exist_ok avoids the check-then-create race of exists() + makedirs().
     os.makedirs(savePath, exist_ok=True)

     for pin in board["pins"]:
         pid = str(pin["pin_id"])
         print("下载" + pid)
         # guess_extension returns None for an unknown MIME type; fall back to
         # no extension rather than writing a file named "<pid>None".
         ext = mimetypes.guess_extension(pin["file"]["type"], True) or ""
         saveFilePath = os.path.join(savePath, pid + ext)
         if os.path.exists(saveFilePath):
             continue
         myurlretrieve(DROOTURL % pin["file"]["key"], saveFilePath)



 # donwloadBroad("14405824","d:/huaban/")
 # donwloadBroad("7472188","d:/huaban/")
 # donwloadBroad("15801564","d:/huaban/")
 # donwloadBroad("15815753","d:/huaban/")
 # donwloadBroad("15816943","d:/huaban/")
 # donwloadBroad("15814198","d:/huaban/")
 # donwloadBroad("3879637","d:/huaban/")
 def downloadUser(id, savePath="L:/huaban2/"):
     """Download every board of a user via ``donwloadBroad``.

     Requests ``UURL % id`` with XHR/JSON headers, parses the reply as JSON
     and passes the ``board_id`` of each entry in ``["user"]["boards"]`` to
     ``donwloadBroad``.

     Args:
         id: Value substituted into ``UURL`` (the script passes a user name).
             The name shadows the builtin but is kept for existing callers.
         savePath: Root directory handed to ``donwloadBroad``. Defaults to the
             previously hard-coded ``"L:/huaban2/"``.
     """
     user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1870.2 Safari/537.36'
     headers = {
         'User-Agent': user_agent,
         "Content-Type": "application/json;charset=UTF-8",
         "X-Requested-With": "XMLHttpRequest",
         "X-Request": "JSON",
     }
     req = urllib.request.Request(UURL % (id), headers=headers)
     # Close the HTTP response deterministically instead of leaking it.
     with contextlib.closing(urllib.request.urlopen(req)) as res:
         jsonstr = str(res.read(), "utf8")
     print(jsonstr)

     for board in json.loads(jsonstr)["user"]["boards"]:
         donwloadBroad(board["board_id"], savePath)
 # Runs whenever this module is executed or imported: downloads every board
 # of the user "gxpgxt".
 # NOTE(review): there is no `if __name__ == "__main__":` guard here.
 downloadUser("gxpgxt")
	__author__ = 'dodola'
	#encoding: utf-8
	from time import sleep, ctime
	import time
	import urllib.request
	import threading
	import contextlib
	import queue
	import string
	import shutil
	import os
	import mimetypes
	import tempfile
	import json
	import glob
	from urllib.error import URLError, HTTPError, ContentTooShortError
	import re
	from urllib.parse import (
	urlparse, urlsplit, urljoin, unwrap, quote, unquote,
	splittype, splithost, splitport, splituser, splitpasswd,
	splitattr, splitquery, splitvalue, splittag, to_bytes, urlunparse)

	# URL templates for huaban.com; each "%s" is filled in with the % operator.
	# PINSURL takes a pin id (judging by its path); nothing visible here uses it.
	PINSURL = "http://huaban.com/pins/%s/zoom/"
	# BROADURL is only referenced from commented-out code in donwloadBroad.
	BROADURL="http://huaban.com/boards/%s/"
	# Board endpoint requested by donwloadBroad with XHR/JSON headers; "%s" is the
	# board id. "limit=20000" presumably caps the number of pins - TODO confirm.
	NURL="http://huaban.com/boards/%s/?huwu7jsv&limit=20000&wfl=1"
	# User endpoint requested by downloadUser; "%s" is the value passed as `id`.
	UURL="http://huaban.com/%s/?huwzcasa&limit=1000&wfl=1"
	# Image URL template; donwloadBroad fills it with pin["file"]["key"].
	DROOTURL="http://img.hb.aicdn.com/%s"
	# Initialise the mimetypes tables once; donwloadBroad later calls
	# mimetypes.guess_extension.
	mimetypes.init()


	def myurlretrieve(url, filename=None, reporthook=None, data=None):
	    """Download *url* to a local file and return ``(filename, headers)``.

	    Args:
	        url: URL to fetch.
	        filename: Destination path. When omitted, a named temporary file is
	            created with ``delete=False`` and its path is returned.
	        reporthook: Optional callable invoked as
	            ``reporthook(blocknum, blocksize, totalsize)``: once with
	            ``(0, 0, totalsize)`` before the first read, then after every
	            block written with the number of bytes in that block.
	            ``totalsize`` is -1 when there is no Content-Length header.
	        data: Optional request body handed to ``urllib.request.Request``.

	    Returns:
	        A ``(filename, headers)`` tuple. For a ``file:`` URL without
	        *filename*, the normalised local path is returned and nothing is
	        copied.

	    Raises:
	        ContentTooShortError: Fewer bytes were received than the
	            Content-Length header announced.
	    """
	    # urlsplit supersedes the deprecated urllib.parse.splittype helper.
	    parts = urlsplit(url)
	    req = urllib.request.Request(url, data)

	    with contextlib.closing(urllib.request.urlopen(req)) as fp:
	        headers = fp.info()

	        # A local file needs no copy when the caller did not ask for one.
	        if parts.scheme == "file" and not filename:
	            return os.path.normpath(parts.path), headers

	        if filename:
	            tfp = open(filename, 'wb')
	        else:
	            # delete=False: the temporary file has to outlive this function.
	            tfp = tempfile.NamedTemporaryFile(delete=False)
	            filename = tfp.name

	        with tfp:
	            result = filename, headers
	            bs = 1024 * 8
	            size = -1
	            read = 0
	            blocknum = 0
	            if "content-length" in headers:
	                size = int(headers["Content-Length"])

	            if reporthook:
	                reporthook(blocknum, 0, size)

	            while True:
	                block = fp.read(bs)
	                if not block:
	                    break
	                read += len(block)
	                tfp.write(block)
	                blocknum += 1
	                if reporthook:
	                    reporthook(blocknum, len(block), size)

	    # Checked only after both files are closed, so the partial download is
	    # fully flushed when the error reaches the caller.
	    if size >= 0 and read < size:
	        raise ContentTooShortError(
	            "retrieval incomplete: got only %i out of %i bytes"
	            % (read, size), result)

	    return result


	def validateTitle(title):
	    """Return *title* with the characters ``/ \\ : * ? " < > |`` removed.

	    The result is used by donwloadBroad as a directory name.
	    """
	    # The character class must contain "*"; without it a title such as
	    # "a*b" passes through unchanged.
	    rstr = r"[\/\\\:\*\?\"\<\>\|]"  # '/\:*?"<>|'
	    return re.sub(rstr, "", title)


	def donwloadBroad(bid, savePath):
	    """Download every pin image of board *bid* below *savePath*.

	    The board is requested from ``NURL`` with XHR/JSON headers and parsed as
	    JSON. Each pin is stored as
	    ``<savePath>/<sanitised board title>/<pin_id><extension>``; a file that
	    already exists is skipped.

	    Args:
	        bid: Board id substituted into ``NURL``.
	        savePath: Directory under which the board folder is created.
	    """
	    user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1870.2 Safari/537.36'
	    headers = {
	        'User-Agent': user_agent,
	        "Content-Type": "application/json;charset=UTF-8",
	        "X-Requested-With": "XMLHttpRequest",
	        "X-Request": "JSON",
	    }
	    req = urllib.request.Request(NURL % (bid), headers=headers)
	    # Close the HTTP response deterministically instead of leaking it.
	    with contextlib.closing(urllib.request.urlopen(req)) as res:
	        jsonstr = str(res.read(), "utf8")
	    print(jsonstr)

	    board = json.loads(jsonstr)["board"]
	    savePath = os.path.join(savePath, validateTitle(board["title"]))
	    # exist_ok avoids the check-then-create race of exists() + makedirs().
	    os.makedirs(savePath, exist_ok=True)

	    for pin in board["pins"]:
	        pid = str(pin["pin_id"])
	        print("下载" + pid)
	        # guess_extension returns None for an unknown MIME type; fall back to
	        # no extension rather than writing a file named "<pid>None".
	        ext = mimetypes.guess_extension(pin["file"]["type"], True) or ""
	        saveFilePath = os.path.join(savePath, pid + ext)
	        if os.path.exists(saveFilePath):
	            continue
	        myurlretrieve(DROOTURL % pin["file"]["key"], saveFilePath)



	# donwloadBroad("14405824","d:/huaban/")
	# donwloadBroad("7472188","d:/huaban/")
	# donwloadBroad("15801564","d:/huaban/")
	# donwloadBroad("15815753","d:/huaban/")
	# donwloadBroad("15816943","d:/huaban/")
	# donwloadBroad("15814198","d:/huaban/")
	# donwloadBroad("3879637","d:/huaban/")
	def downloadUser(id, savePath="L:/huaban2/"):
	    """Download every board of a user via ``donwloadBroad``.

	    Requests ``UURL % id`` with XHR/JSON headers, parses the reply as JSON
	    and passes the ``board_id`` of each entry in ``["user"]["boards"]`` to
	    ``donwloadBroad``.

	    Args:
	        id: Value substituted into ``UURL`` (the script passes a user name).
	            The name shadows the builtin but is kept for existing callers.
	        savePath: Root directory handed to ``donwloadBroad``. Defaults to the
	            previously hard-coded ``"L:/huaban2/"``.
	    """
	    user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1870.2 Safari/537.36'
	    headers = {
	        'User-Agent': user_agent,
	        "Content-Type": "application/json;charset=UTF-8",
	        "X-Requested-With": "XMLHttpRequest",
	        "X-Request": "JSON",
	    }
	    req = urllib.request.Request(UURL % (id), headers=headers)
	    # Close the HTTP response deterministically instead of leaking it.
	    with contextlib.closing(urllib.request.urlopen(req)) as res:
	        jsonstr = str(res.read(), "utf8")
	    print(jsonstr)

	    for board in json.loads(jsonstr)["user"]["boards"]:
	        donwloadBroad(board["board_id"], savePath)

	# Runs whenever this module is executed or imported: downloads every board
	# of the user "gxpgxt".
	# NOTE(review): there is no `if __name__ == "__main__":` guard here.
	downloadUser("gxpgxt")