Created
May 8, 2014 00:00
-
-
Save dodola/4d4cd82b298802026246 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
__author__ = 'dodola' | |
#encoding: utf-8 | |
from time import sleep, ctime | |
import time | |
import urllib.request | |
import threading | |
import contextlib | |
import queue | |
import string | |
import shutil | |
import os | |
import mimetypes | |
import tempfile | |
import json | |
import glob | |
from urllib.error import URLError, HTTPError, ContentTooShortError | |
import re | |
from urllib.parse import ( | |
urlparse, urlsplit, urljoin, unwrap, quote, unquote, | |
splittype, splithost, splitport, splituser, splitpasswd, | |
splitattr, splitquery, splitvalue, splittag, to_bytes, urlunparse) | |
# URL templates for huaban.com; each "%s" is filled in with an identifier.
# Pin zoom page (not referenced by the functions visible in this file).
PINSURL = "http://huaban.com/pins/%s/zoom/"
# Board page (only referenced from commented-out code).
BROADURL = "http://huaban.com/boards/%s/"
# Board listing requested by donwloadBroad and parsed as JSON.
# NOTE(review): "limit" presumably caps the number of pins returned - confirm.
NURL = "http://huaban.com/boards/%s/?huwu7jsv&limit=20000&wfl=1"
# User listing requested by downloadUser and parsed as JSON.
UURL = "http://huaban.com/%s/?huwzcasa&limit=1000&wfl=1"
# Image host prefix; "%s" is the "key" field of a pin's "file" entry.
DROOTURL = "http://img.hb.aicdn.com/%s"

# Load the MIME type tables up front; donwloadBroad uses them to pick file
# extensions.
mimetypes.init()
def myurlretrieve(url, filename=None, reporthook=None, data=None):
    """Download *url* into a local file and return ``(filename, headers)``.

    This is a trimmed-down variant of ``urllib.request.urlretrieve``.

    If the transfer fails part-way or turns out shorter than the announced
    Content-Length, the partially written file is removed before the
    exception propagates, so a later "does the file exist?" check cannot
    mistake a truncated download for a finished one.

    Args:
        url: URL to fetch.
        filename: Destination path.  When omitted, the body is written to a
            named temporary file that is not deleted automatically.  For a
            ``file:`` URL without *filename* nothing is copied and the
            normalised local path is returned instead.
        reporthook: Optional callable ``hook(blocknum, blocksize, totalsize)``.
            It is called once before the first read with a block size of 0,
            and once after every block written with that block's length.
            ``totalsize`` is -1 when the response has no Content-Length.
        data: Optional request body handed to ``urllib.request.Request``.

    Returns:
        A ``(filename, headers)`` tuple, where *headers* is the value of the
        response's ``info()``.

    Raises:
        ContentTooShortError: If fewer bytes were received than the
            Content-Length header announced.
    """
    url_type, path = splittype(url)
    req = urllib.request.Request(url, data)
    with contextlib.closing(urllib.request.urlopen(req)) as fp:
        headers = fp.info()

        # A local file with no explicit destination needs no copy at all.
        if url_type == "file" and not filename:
            return os.path.normpath(path), headers

        if filename:
            tfp = open(filename, 'wb')
        else:
            tfp = tempfile.NamedTemporaryFile(delete=False)
            filename = tfp.name

        result = filename, headers
        try:
            with tfp:
                bs = 1024 * 8
                size = -1
                bytes_read = 0
                blocknum = 0
                if "content-length" in headers:
                    size = int(headers["Content-Length"])

                if reporthook:
                    reporthook(blocknum, 0, size)

                while True:
                    block = fp.read(bs)
                    if not block:
                        break
                    bytes_read += len(block)
                    tfp.write(block)
                    blocknum += 1
                    if reporthook:
                        reporthook(blocknum, len(block), size)

            if size >= 0 and bytes_read < size:
                raise ContentTooShortError(
                    "retrieval incomplete: got only %i out of %i bytes"
                    % (bytes_read, size), result)
        except BaseException:
            # The output file is closed by now; drop the incomplete file so
            # it is never taken for a complete download.
            try:
                os.remove(filename)
            except OSError:
                pass
            raise

    return result
def validateTitle(title):
    # Remove every character from the set / \ : * ? " < > | so that the
    # title can be used as a directory name; all other characters are kept.
    forbidden = re.compile(r'[/\\:*?"<>|]')
    return forbidden.sub("", title)
def donwloadBroad(bid, savePath):
    """Download every pin of one board into a folder named after the board.

    The board description is requested from ``NURL`` and parsed as JSON.  A
    directory named after the board title (passed through ``validateTitle``)
    is created below *savePath*, and each pin's file is fetched from
    ``DROOTURL`` and stored there as ``<pin_id><extension>``.  Pins whose
    target file already exists are skipped.

    Args:
        bid: Board identifier substituted into ``NURL``.
        savePath: Directory below which the board folder is created.

    Raises:
        urllib.error.URLError: If the board listing or a pin cannot be
            fetched.
        KeyError: If the JSON response lacks one of the keys read here
            (``board``, ``pins``, ``title``, ``pin_id``, ``file``, ``type``,
            ``key``).
    """
    user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1870.2 Safari/537.36'
    headers = {
        'User-Agent': user_agent,
        "Content-Type": "application/json;charset=UTF-8",
        "X-Requested-With": "XMLHttpRequest",
        "X-Request": "JSON",
    }
    req = urllib.request.Request(NURL % (bid,), headers=headers)
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(req) as res:
        jsonstr = str(res.read(), "utf8")
    print(jsonstr)

    board = json.loads(jsonstr)["board"]
    savePath = os.path.join(savePath, validateTitle(board["title"]))
    os.makedirs(savePath, exist_ok=True)

    for pin in board["pins"]:
        pid = str(pin["pin_id"])
        print("下载" + pid)
        mime_type = pin["file"]["type"]
        # guess_extension returns None for an unknown MIME type; fall back to
        # no extension rather than a literal "None" suffix in the filename.
        ext = mimetypes.guess_extension(mime_type, True) or ""
        saveFilePath = os.path.join(savePath, pid + ext)
        if os.path.exists(saveFilePath):
            continue
        myurlretrieve(DROOTURL % pin["file"]["key"], saveFilePath)
# donwloadBroad("14405824","d:/huaban/") | |
# donwloadBroad("7472188","d:/huaban/") | |
# donwloadBroad("15801564","d:/huaban/") | |
# donwloadBroad("15815753","d:/huaban/") | |
# donwloadBroad("15816943","d:/huaban/") | |
# donwloadBroad("15814198","d:/huaban/") | |
# donwloadBroad("3879637","d:/huaban/") | |
def downloadUser(id, savePath="L:/huaban2/"):
    """Download every board listed for one user.

    The user description is requested from ``UURL`` and parsed as JSON; each
    entry of ``user.boards`` is then handed to ``donwloadBroad``.

    Args:
        id: User identifier substituted into ``UURL``.  The name shadows the
            builtin ``id`` but is kept so existing callers keep working.
        savePath: Directory below which the board folders are created.
            Defaults to the location that used to be hard-coded.

    Raises:
        urllib.error.URLError: If the user listing cannot be fetched.
        KeyError: If the JSON response lacks ``user``, ``boards`` or
            ``board_id``.
    """
    user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1870.2 Safari/537.36'
    headers = {
        'User-Agent': user_agent,
        "Content-Type": "application/json;charset=UTF-8",
        "X-Requested-With": "XMLHttpRequest",
        "X-Request": "JSON",
    }
    req = urllib.request.Request(UURL % (id,), headers=headers)
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(req) as res:
        jsonstr = str(res.read(), "utf8")
    print(jsonstr)

    jsonData = json.loads(jsonstr)
    for board in jsonData["user"]["boards"]:
        donwloadBroad(board["board_id"], savePath)
# Script entry point: runs at module level and downloads all boards of the
# hard-coded user "gxpgxt".
downloadUser("gxpgxt")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment