@magicdawn
Created September 11, 2014 14:30
Tieba image downloader, Python 3 version
# Beautiful Soup docs: http://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html

DEBUG = False  # debug mode?
EXAMPLE_URL = 'http://tieba.baidu.com/p/3287469841'  # example URL
PAGE_ENCODING = "gbk"  # encoding of the response; the default is fine
IMAGE_DIR = 'image'  # directory the images are stored in


def getTitle(soup):  # get the thread title, used as the folder name
    title = (soup.select("h1.core_title_txt"))[0]['title']
    return title.strip()
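
# Note (an addition, not in the original gist): thread titles can contain
# characters that are illegal in file names (e.g. '/'), which would break the
# os.mkdir calls below; a cautious version might sanitize the title first:
#
#     import re
#     title = re.sub(r'[\\/:*?"<>|]', '_', title)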

def getImgSrcs(soup):  # how to collect the img srcs on this page
    srcs = [img['src'] for img in soup.select("img.BDE_Image")]
    return srcs


def has_next_page(soup):  # is there a next page? (stub: single page only)
    return False


def get_next_page(soup):  # get the URL of the next page (stub)
    return ''
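
# A possible implementation of the two stubs above (an assumption, not part of
# the original gist): tieba renders its pager as a list of <a> links, and the
# next-page link's text is "下一页" ("next page"). Assuming the pager matches
# the selector 'li.l_pager a', something like this could work:
#
#     def has_next_page(soup):
#         return any(a.get_text() == '下一页' for a in soup.select('li.l_pager a'))
#
#     def get_next_page(soup):
#         for a in soup.select('li.l_pager a'):
#             if a.get_text() == '下一页':
#                 return a['href']
#         return ''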

import urllib.request
import os
import sys
from bs4 import BeautifulSoup
from urllib.parse import urljoin
# import pyquery as pq


# fetch and decode the page html
def request(url):
    global PAGE_ENCODING
    res = urllib.request.urlopen(url)
    html = res.read()
    if PAGE_ENCODING is None:
        PAGE_ENCODING = "utf8"
    return html.decode(PAGE_ENCODING)
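
# Sketch (not in the original gist): if PAGE_ENCODING were set to None,
# request() could read the charset from the HTTP response headers instead of
# assuming utf8, e.g.
#
#     charset = res.headers.get_content_charset() or 'utf8'
#     return html.decode(charset)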

# write one line (helper; not used below)
def writeLine(f, content):
    '''
    f : a file opened in text mode, e.g. open(path, 'w', encoding='utf8')
    content : str, written followed by a newline
    '''
    f.write(content + '\n')

if DEBUG:  # debugging: use the example URL
    url = EXAMPLE_URL
elif len(sys.argv) == 1:  # normal run without arguments: show help
    print("Please specify the page containing the images (e.g. {0})".format(EXAMPLE_URL))
    exit()
else:  # normal run: download the given page
    url = sys.argv[1]  # python down.py http://xxxx

soup = BeautifulSoup(request(url), "html.parser")  # build the soup
title = getTitle(soup)  # title is taken from the current page only
print("Image series : {0}".format(title))

# the image directory
if IMAGE_DIR is None or IMAGE_DIR == '':
    # download into a folder under the current directory
    base_path = title + "/"
    if not os.path.exists(title):
        os.mkdir(title)
else:
    if not os.path.exists(IMAGE_DIR):
        os.mkdir(IMAGE_DIR)
    base_path = IMAGE_DIR + "/" + title + "/"
    if os.path.exists(IMAGE_DIR + "/" + title):
        answer = input("Already downloaded! Download again (y/n)? ")
        if answer != 'y' and answer != 'yes':
            exit()
    else:
        os.mkdir(IMAGE_DIR + "/" + title + "/")

cur_url = url
srcs = [urljoin(url, src) for src in getImgSrcs(soup)]

# collect the srcs from the remaining pages
while has_next_page(soup):
    next_url = get_next_page(soup)  # URL of the next page
    next_url = urljoin(cur_url, next_url)
    cur_url = next_url
    soup = BeautifulSoup(request(next_url), "html.parser")  # soup for the next page
    srcs.extend([urljoin(cur_url, src) for src in getImgSrcs(soup)])

index = 1
for src in srcs:
    # image/xxx-title/01.jpg
    dot = src.rindex('.')
    ext = src[dot:]  # e.g. .jpg
    path = base_path + "{0:02}".format(index) + ext
    print("Downloading image {0:02} : {1}".format(index, src))
    urllib.request.urlretrieve(src, path)
    index += 1
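
# Example run (down.py is the script name used in the comment above):
#
#     python3 down.py http://tieba.baidu.com/p/3287469841
#
# Images are saved as image/<thread title>/01.jpg, 02.jpg, ...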