Last active
July 24, 2016 05:02
-
-
Save jhezjkp/4418542 to your computer and use it in GitHub Desktop.
豆瓣/花瓣相册照片下载脚本
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
#encoding=utf-8 | |
''' | |
豆瓣相册下载脚本 | |
需要requests和BeautifulSoup支持 | |
缩略图:http://img3.douban.com/view/photo/thumb/public/pXXXXXX.jpg | |
一般相册图:http://img3.douban.com/view/photo/photo/public/pXXXX.jpg | |
大图(只有部分图片有):http://img3.douban.com/view/photo/large/public/pXXXX.jpg | |
''' | |
import sys | |
import os | |
import os.path | |
import urllib | |
import requests | |
from BeautifulSoup import BeautifulSoup | |
def downloadPhoto(title, url, localPath): | |
filename = "" | |
if title: | |
filename = title.rstrip(".。") + ".jpg" | |
else: | |
filename = url[url.rfind("/") + 1:] | |
print "downloading ", filename, "..." | |
urllib.urlretrieve(url, os.path.join(localPath, filename)) | |
if __name__ == '__main__': | |
reload(sys) | |
sys.setdefaultencoding('utf-8') | |
if len(sys.argv) == 1: | |
print "Usage:", __file__, "<douban_gallery_url>" | |
sys.exit(0) | |
galleryUrl = sys.argv[1] | |
if not galleryUrl.endswith("/"): | |
galleryUrl += "/" | |
r = requests.get(galleryUrl) | |
if r.status_code == 200: | |
soup = BeautifulSoup(r.text) | |
galleryTitle = soup.html.head.title.string.strip() | |
galleryTitle = galleryTitle[galleryTitle.find('-') + 1:].rstrip("?.。") | |
totalPage = 1 | |
#检查分页情况 | |
pageTag = soup.find("div", {"class": "paginator"}) | |
if pageTag: | |
totalPage = len(pageTag.findAll("a")) | |
#创建相册目录 | |
basePath = os.path.abspath(os.path.dirname(__file__)) | |
path = os.path.join(basePath, galleryTitle) | |
if not os.path.exists(path): | |
os.mkdir(path) | |
#解析照片地址 | |
for page in range(0, totalPage): | |
r = requests.get(galleryUrl + "?start=" + str(page * 18)) | |
if r.status_code == 200: | |
soup = BeautifulSoup(r.text) | |
for tag in soup.findAll("a", {"class": "photolst_photo"}): | |
downloadPhoto(tag['title'].strip(), tag.find("img")['src'].replace('thumb', 'photo'), path) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
#encoding=utf-8 | |
''' | |
花瓣图册下载 | |
''' | |
import sys | |
import os | |
import os.path | |
import urllib | |
import json | |
import requests | |
from BeautifulSoup import BeautifulSoup | |
def download_photo(title, suffix, url, localPath): | |
filename = "" | |
if title: | |
filename = title.rstrip(".。") + "." + suffix | |
else: | |
filename = url[url.rfind("/") + 1:] | |
print "downloading ", filename, "..." | |
urllib.urlretrieve(url, os.path.join(localPath, filename)) | |
def parset_to_json(text):
    '''Extract and parse the board JSON embedded in a huaban page script.

    The page script assigns the board object inline; this slices out the
    text between 'app.page["board"] = ' and 'app._csr = true' (when the
    latter is present), trims one trailing semicolon, and returns the
    decoded object.
    '''
    start_marker = "app.page[\"board\"] = "
    text = text[text.find(start_marker) + len(start_marker):]
    end_marker = "app._csr = true"
    end = text.find(end_marker)
    if end != -1:
        text = text[:end]
    text = text.strip()
    # The assignment statement ends with ';' — remove it before decoding.
    if text.endswith(";"):
        text = text[:-1]
    return json.loads(text)
if __name__ == '__main__': | |
reload(sys) | |
sys.setdefaultencoding('utf-8') | |
if len(sys.argv) == 1: | |
print "Usage:", __file__, "<huaban_gallery_url>" | |
sys.exit(0) | |
galleryUrl = sys.argv[1] | |
if not galleryUrl.endswith("/"): | |
galleryUrl += "/" | |
r = requests.get(galleryUrl) | |
if r.status_code == 200: | |
soup = BeautifulSoup(r.text) | |
#print soup.body.script.string.strip() | |
text = soup.body.script.string.strip() | |
data = parset_to_json(text) | |
galleryTitle = data.get("title") | |
total_count = data.get("pin_count") | |
#创建相册目录 | |
basePath = os.path.abspath(os.path.dirname(__file__)) | |
path = os.path.join(basePath, galleryTitle) | |
if not os.path.exists(path): | |
os.mkdir(path) | |
print data.get("title"), data.get("pin_count") | |
pic_array = data.get("pins") | |
while total_count > len(pic_array): | |
galleryUrl_rest = galleryUrl + "?her47ugy&max=" + str(pic_array[-1].get("pin_id")) + "&limit=100&wfl=1" | |
r = requests.get(galleryUrl_rest) | |
if r.status_code == 200: | |
data = parset_to_json(r.text.strip()) | |
pic_array += data.get("pins") | |
for pic in pic_array: | |
#print pic.get("raw_text").strip(), pic.get("repin_count"), pic.get("orig_source"), pic.get("file").get("key") | |
key = pic.get("file").get("key") | |
mime_type = pic.get("file").get("type") | |
suffix = mime_type[mime_type.find("/") + 1:] | |
#print key, mime_type, suffix | |
download_photo(key, suffix, "http://img.hb.aicdn.com/" + key, path) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment