tieba image downloader python3 version
# Beautiful Soup docs: http://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html
DEBUG = False          # debug mode
EXAMPLE_URL = 'http://tieba.baidu.com/p/3287469841'  # example URL
PAGE_ENCODING = "gbk"  # response encoding; the default is fine
IMAGE_DIR = 'image'    # output directory
def getTitle(soup):  # thread title, used as the folder name
    title = soup.select("h1.core_title_txt")[0]['title']
    return title.strip()
def getImgSrcs(soup):  # collect the image srcs on the current page
    srcs = [img['src'] for img in soup.select("img.BDE_Image")]
    return srcs
def has_next_page(soup):  # is there a next page? (stub: single-page only)
    return False

def get_next_page(soup):  # URL of the next page (stub)
    return ''
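# The two stubs above limit the script to a single page. A hedged sketch of
# real pagination, left commented out because the 'li.l_pager' selector and
# the '下一页' (next page) link text are assumptions about Tieba's markup,
# not verified against the live site:
#
# def has_next_page(soup):
#     return any(a.get_text() == '下一页' for a in soup.select('li.l_pager a'))
#
# def get_next_page(soup):
#     for a in soup.select('li.l_pager a'):
#         if a.get_text() == '下一页':
#             return a['href']
#     return ''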
import os
import sys
import urllib.request
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# fetch the page HTML
def request(url):
    global PAGE_ENCODING
    res = urllib.request.urlopen(url)
    html = res.read()
    if PAGE_ENCODING is None:
        PAGE_ENCODING = "utf8"
    return html.decode(PAGE_ENCODING)
# write one line
def writeLine(f, content):
    '''
    f: a file opened in binary mode
    content: str, encoded as utf8
    '''
    f.write(content.encode("utf8") + b'\n')
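# Usage sketch for writeLine (it is unused below; 'notes.txt' is illustrative,
# and the handle must be opened in binary mode since writeLine writes bytes):
#
#   with open('notes.txt', 'wb') as f:
#       writeLine(f, '标题')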
if DEBUG:  # in debug mode, use the example URL
    url = EXAMPLE_URL
elif len(sys.argv) == 1:  # normal run without args: print usage
    print("Please specify the page that holds the images (e.g. {0})".format(EXAMPLE_URL))
    exit()
else:  # normal run: download the given page
    url = sys.argv[1]  # python down.py http://xxxx
soup = BeautifulSoup(request(url), "html.parser")  # build the soup
title = getTitle(soup)  # title of the current page only
print("Image series: {0}".format(title))
# image folder
if IMAGE_DIR is None or IMAGE_DIR == '':
    # current directory
    base_path = title + "/"
else:
    if not os.path.exists(IMAGE_DIR):
        os.mkdir(IMAGE_DIR)
    base_path = IMAGE_DIR + "/" + title + "/"

if os.path.exists(base_path):
    answer = input("Already downloaded! Download again (y/n)? ")
    if answer != 'y' and answer != 'yes':
        exit()
else:
    os.mkdir(base_path)
cur_url = url
srcs = [urljoin(url, src) for src in getImgSrcs(soup)]

# walk the remaining pages, collecting srcs
while has_next_page(soup):
    next_url = get_next_page(soup)  # URL of the next page
    next_url = urljoin(cur_url, next_url)
    cur_url = next_url
    soup = BeautifulSoup(request(next_url), "html.parser")  # soup for the next page
    srcs += [urljoin(cur_url, src) for src in getImgSrcs(soup)]
index = 1
for src in srcs:
    # image/<title>/01.jpg
    dot = src.rindex('.')
    ext = src[dot:]  # e.g. .jpg
    path = base_path + "{0:02}".format(index) + ext
    print("Downloading image {0:02}: {1}".format(index, src))
    urllib.request.urlretrieve(src, path)
    index += 1
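# Usage (assuming the script is saved as down.py, as the comment above hints):
#   python3 down.py http://tieba.baidu.com/p/3287469841
# Images are saved as image/<thread title>/01.jpg, 02.jpg, ...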