Skip to content

Instantly share code, notes, and snippets.

@shellphon
Created January 11, 2014 02:35
Show Gist options
  • Save shellphon/8366271 to your computer and use it in GitHub Desktop.
Save shellphon/8366271 to your computer and use it in GitHub Desktop.
get pictures from douban albums by python3
# Requires BeautifulSoup (bs4) to parse the HTML text.
#python
#
#获取相册页面,分析获取分页的url
#根据抓取的url分别做处理:将url页面中的真实图片名称获取
#将上一步的名称替换到指定的图片地址格式中,开始下载图片
#
import re
import os
import urllib.request
from bs4 import BeautifulSoup
def getSoup(str_doc):
    """Build a BeautifulSoup document from HTML markup.

    Args:
        str_doc: The HTML markup to parse; the rest of this script passes
            the raw value returned by getHtml.

    Returns:
        The parsed BeautifulSoup object.
    """
    # Name the parser explicitly: without it bs4 picks whichever parser
    # happens to be installed (and warns), so results vary between machines.
    # NOTE(review): the 'gbk' encoding hint is kept from the original code;
    # confirm it still matches what the site actually serves.
    return BeautifulSoup(str_doc, 'html.parser', from_encoding='gbk')
def getHtml(url):
    """Fetch *url* and return the raw response body.

    Args:
        url: The address to open with urllib.request.urlopen.

    Returns:
        The undecoded bytes read from the response.
    """
    # The context manager closes the response even if read() raises.
    with urllib.request.urlopen(url) as response:
        return response.read()
def filterForFileName(string):
    """Turn a free-form description into text usable as a file name.

    Surrounding whitespace is stripped, embedded web links are removed,
    line breaks are dropped, and the result is passed through
    fileNameFilter to remove the characters Windows forbids.

    Args:
        string: The raw description or title text.

    Returns:
        The cleaned-up text.
    """
    # Some descriptions carry a web address, which is useless in a file
    # name: drop everything from the link to the end of its line. Both
    # http: and https: links are covered.
    string = re.sub(r"https?:.*", "", string.strip())
    string = string.replace('\n', '').replace('\r', '')
    # Strip again so whitespace that preceded a removed link does not end up
    # as a trailing space in the name.
    return fileNameFilter(string).strip()
# Translation table deleting the characters Windows forbids in file names.
_WINDOWS_FORBIDDEN_CHARS = str.maketrans('', '', '/*?<>\\|:"')


def fileNameFilter(string):
    """Return *string* without the characters Windows forbids in file names.

    The removed characters are: / * ? < > \\ | : "
    """
    # One translate pass replaces the former chain of nine replace() calls.
    return string.translate(_WINDOWS_FORBIDDEN_CHARS)
def pullImage(img_unit_list, direct):
    """Download every image in *img_unit_list* into a directory.

    Args:
        img_unit_list: Iterable of objects exposing ``img_src`` (the image
            address found on the page) and ``img_name`` (the description
            used to build the local file name).
        direct: Raw directory name; it is cleaned with filterForFileName
            before the directory is created.

    Prints the number of images downloaded when finished.
    """
    direct = filterForFileName(direct)
    # exist_ok lets the script be re-run against an album that was already
    # (partly) downloaded instead of failing on the existing directory.
    os.makedirs(direct, exist_ok=True)

    # Large images live under .../photo/large/... and regular ones under
    # .../photo/photo/...; some albums have no large version, so the
    # regular size is used.
    img_1_str = "http://img3.douban.com/view/photo/photo/public/"
    img_3_str = ".jpg"

    x = 0  # number of images downloaded so far
    for img_unit in img_unit_list:
        # Keep only the bare file name (no directory part, no extension)
        # of the address found on the page.
        img_name = img_unit.img_src.rsplit('/', 1)[-1].rsplit('.', 1)[0]
        img_url = img_1_str + img_name + img_3_str
        img_file_name = filterForFileName(img_unit.img_name)
        # Fetch img_url and store it under "<index>-<description>.jpg".
        target = os.path.join(direct, '%s-' % x + img_file_name + '.jpg')
        urllib.request.urlretrieve(img_url, target)
        x += 1
    print("总共%s张图片" % x)
class imgunit(object):
    """Pairs an image's source address with its description.

    Attributes:
        img_src: Value of the ``src`` entry of the image element.
        img_name: Value of the ``title`` entry of the link element, or an
            empty string when the link has no title.
    """

    # Class-level defaults, kept so the attributes also exist on the class.
    img_src = ''
    img_name = ''

    def __init__(self, imgObj, aObj):
        """Read ``src`` from *imgObj* and ``title`` from *aObj*.

        Args:
            imgObj: Mapping-like image element; must provide ``'src'``.
            aObj: Mapping-like link element; ``'title'`` is optional.

        Raises:
            KeyError: If *imgObj* has no ``'src'`` entry.
        """
        self.img_src = imgObj['src']
        try:
            self.img_name = aObj['title']
        except KeyError:
            # A link without a title should not abort the whole download;
            # the file is then named by its index alone.
            self.img_name = ''
# Album to download: replace this link with the address of another album,
# e.g. "http://www.douban.com/photos/album/85320662/?start=0".
first_page_url = 'http://www.douban.com/photos/album/41149992/?start=0'
album_html = getHtml(first_page_url)
album_soup = getSoup(album_html)

# Addresses of every page of the album, starting with the first one.
album_pages_url = [first_page_url]

# The page title doubles as the name of the download directory.
directory = album_soup.find('title').get_text()

# Collect the href of every link inside the "paginator" div, when the album
# has more than one page.
paginator = album_soup.find('div', 'paginator')
if paginator is not None:
    albums_a = paginator.find_all("a")
    for page_a in albums_a:
        album_pages_url.append(page_a['href'])
    # The links come from the page-turning bar, whose final "next page" link
    # adds one surplus entry; remove it. Only pop when links were actually
    # added, so the first page is never dropped by mistake.
    if albums_a:
        album_pages_url.pop()

# Image address/description pairs gathered from all pages.
img_units = []
for page_url in album_pages_url:
    print(page_url)
    page_html = getHtml(page_url)
    page_soup = getSoup(page_html)
    for img_wrap_div in page_soup.find_all('div', 'photo_wrap'):
        img_tag = img_wrap_div.find('img')
        link_tag = img_wrap_div.find('a')
        # Skip wrappers that lack the image or its link rather than crash.
        if img_tag is None or link_tag is None:
            continue
        img_units.append(imgunit(img_tag, link_tag))
print(len(img_units))
pullImage(img_units, directory)
@honwhy
Copy link

honwhy commented Aug 13, 2014

赞赞赞。

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment