@jhezjkp
Last active July 24, 2016 05:02
Douban/Huaban album photo download scripts
#!/usr/bin/env python
#encoding=utf-8
'''
Douban album download script
Requires requests and BeautifulSoup
Thumbnail:     http://img3.douban.com/view/photo/thumb/public/pXXXXXX.jpg
Regular photo: http://img3.douban.com/view/photo/photo/public/pXXXX.jpg
Large (only some photos have one): http://img3.douban.com/view/photo/large/public/pXXXX.jpg
(see the short URL example after this script)
'''
import sys
import os
import os.path
import urllib
import requests
from BeautifulSoup import BeautifulSoup


def downloadPhoto(title, url, localPath):
    # Use the photo title as the file name when available, otherwise fall
    # back to the last path segment of the image URL.
    filename = ""
    if title:
        filename = title.rstrip(".。") + ".jpg"
    else:
        filename = url[url.rfind("/") + 1:]
    print "downloading ", filename, "..."
    urllib.urlretrieve(url, os.path.join(localPath, filename))


if __name__ == '__main__':
    reload(sys)
    sys.setdefaultencoding('utf-8')
    if len(sys.argv) == 1:
        print "Usage:", __file__, "<douban_gallery_url>"
        sys.exit(0)
    galleryUrl = sys.argv[1]
    if not galleryUrl.endswith("/"):
        galleryUrl += "/"
    r = requests.get(galleryUrl)
    if r.status_code == 200:
        soup = BeautifulSoup(r.text)
        # drop everything before the first '-' in the page title to get the album name
        galleryTitle = soup.html.head.title.string.strip()
        galleryTitle = galleryTitle[galleryTitle.find('-') + 1:].rstrip("?.。")
        totalPage = 1
        # check for pagination
        pageTag = soup.find("div", {"class": "paginator"})
        if pageTag:
            totalPage = len(pageTag.findAll("a"))
        # create the album directory next to this script
        basePath = os.path.abspath(os.path.dirname(__file__))
        path = os.path.join(basePath, galleryTitle)
        if not os.path.exists(path):
            os.mkdir(path)
        # parse the photo URLs page by page (18 photos per page)
        for page in range(0, totalPage):
            r = requests.get(galleryUrl + "?start=" + str(page * 18))
            if r.status_code == 200:
                soup = BeautifulSoup(r.text)
                for tag in soup.findAll("a", {"class": "photolst_photo"}):
                    downloadPhoto(tag['title'].strip(), tag.find("img")['src'].replace('thumb', 'photo'), path)
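
For reference, the three image sizes listed in the Douban script's docstring differ only in one path segment, which is why the script can take the thumbnail src it scrapes from the album page and swap 'thumb' for 'photo' before downloading. A minimal sketch of that substitution (the pXXXXXX photo id is a placeholder):

# thumbnail src as it appears in the album page markup (placeholder photo id)
thumb_url = "http://img3.douban.com/view/photo/thumb/public/pXXXXXX.jpg"
# regular-size variant, the one downloadPhoto() actually fetches
photo_url = thumb_url.replace("thumb", "photo")
# large variant; per the docstring it only exists for some photos
large_url = thumb_url.replace("thumb", "large")
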
#!/usr/bin/env python
#encoding=utf-8
'''
Huaban board download script
'''
import sys
import os
import os.path
import urllib
import json
import requests
from BeautifulSoup import BeautifulSoup


def download_photo(title, suffix, url, localPath):
    # Use the pin title as the file name when available, otherwise fall
    # back to the last path segment of the image URL.
    filename = ""
    if title:
        filename = title.rstrip(".。") + "." + suffix
    else:
        filename = url[url.rfind("/") + 1:]
    print "downloading ", filename, "..."
    urllib.urlretrieve(url, os.path.join(localPath, filename))


def parse_to_json(text):
    # Huaban embeds the board data as JSON inside an inline <script> block;
    # slice out the part between 'app.page["board"] = ' and 'app._csr = true',
    # drop the trailing semicolon and parse it.
    key_word = "app.page[\"board\"] = "
    index = text.find(key_word)
    text = text[index + len(key_word):]
    key_word = "app._csr = true"
    index = text.find(key_word)
    if index > -1:
        text = text[:index]
    text = text.strip()
    if text.endswith(";"):
        text = text[:len(text) - 1]
    data = json.loads(text)
    return data
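
# For context, the inline <script> that parse_to_json() slices apart looks
# roughly like the sketch below. The field names match how the JSON is used
# further down; the exact payload shape is an assumption, not something the
# Huaban page documents.
#   app.page["board"] = {"title": "...", "pin_count": 123,
#                        "pins": [{"pin_id": 1, "file": {"key": "...", "type": "image/jpeg"}}, ...]};
#   app._csr = true;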


if __name__ == '__main__':
    reload(sys)
    sys.setdefaultencoding('utf-8')
    if len(sys.argv) == 1:
        print "Usage:", __file__, "<huaban_gallery_url>"
        sys.exit(0)
    galleryUrl = sys.argv[1]
    if not galleryUrl.endswith("/"):
        galleryUrl += "/"
    r = requests.get(galleryUrl)
    if r.status_code == 200:
        soup = BeautifulSoup(r.text)
        text = soup.body.script.string.strip()
        data = parse_to_json(text)
        galleryTitle = data.get("title")
        total_count = data.get("pin_count")
        # create the board directory next to this script
        basePath = os.path.abspath(os.path.dirname(__file__))
        path = os.path.join(basePath, galleryTitle)
        if not os.path.exists(path):
            os.mkdir(path)
        print data.get("title"), data.get("pin_count")
        pic_array = data.get("pins")
        # the first response only carries part of the pins; keep requesting the
        # rest, 100 at a time, starting after the last pin_id fetched so far
        while total_count > len(pic_array):
            galleryUrl_rest = galleryUrl + "?her47ugy&max=" + str(pic_array[-1].get("pin_id")) + "&limit=100&wfl=1"
            r = requests.get(galleryUrl_rest)
            if r.status_code == 200:
                data = parse_to_json(r.text.strip())
                pic_array += data.get("pins")
        for pic in pic_array:
            # other fields available on each pin: raw_text, repin_count, orig_source
            key = pic.get("file").get("key")
            mime_type = pic.get("file").get("type")
            suffix = mime_type[mime_type.find("/") + 1:]
            download_photo(key, suffix, "http://img.hb.aicdn.com/" + key, path)
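
Both scripts take the album or board URL as their only command-line argument; a hypothetical invocation (file names and ids below are placeholders, matching the usage messages the scripts print):

python douban_album.py http://www.douban.com/photos/album/XXXXXXXX/
python huaban_board.py http://huaban.com/boards/XXXXXXXX/

Downloaded files are written into a directory named after the album or board title, created next to the script itself.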