Fetch ebooks from the 61eda website. Requires requests, BeautifulSoup4, and wget.
# -*- coding: utf-8 -*-
import sys
import requests
import urlparse
import os
import time
from bs4 import BeautifulSoup

agent_string = '''"Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; \
Acoo Browser 1.98.744; .NET CLR 3.5.30729)"'''
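# Note: the embedded double quotes around the User-Agent appear deliberate; the
# string is interpolated into a wget shell command below, so it has to arrive
# at the shell pre-quoted.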
url_base = "http://www.61eda.com/"

# Download a file with wget.
# Fix: skip files that were already downloaded (unless the server has a newer copy).
def downloadfile(url):
    os.system("wget --tries=15 "
              "--retry-connrefused "
              "--waitretry=8 "
              "--connect-timeout=5 "
              "--wait=3 "
              "--timeout=120 "
              "-N "
              "--user-agent=%s \"%s\"" % (agent_string, url.encode('gbk')))
    return
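
# A pure-Python alternative sketch (not part of the original script): the same
# download done with requests instead of shelling out to wget. The function name
# and retry parameters are illustrative; unlike wget's -N, this does not compare
# server timestamps, it only skips files that already exist locally.
def downloadfile_requests(url, retries=15):
    filename = url.split('/')[-1] or 'index.html'
    if os.path.exists(filename):
        print("%s exists, skip..." % filename)
        return
    for _ in range(retries):
        try:
            r = requests.get(url, headers={'User-Agent': agent_string.strip('"')},
                             stream=True, timeout=120)
            r.raise_for_status()
            # Stream the body to disk in chunks to avoid loading it all into memory
            with open(filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
            return
        except Exception as e:
            print(e)
            time.sleep(3)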

def main():
    # Parse the command-line arguments
    if len(sys.argv) != 2:
        print("fetch ebook from 61eda.com")
        print("usage: fetch61eda.py {page url}")
        print("for example: http://www.61eda.com/Soft/book/cnbook/")
        sys.exit()
    url = sys.argv[1]
    if not url.startswith("http://"):
        url = "http://" + url

    while True:
        # Download the listing page
        try:
            print("fetch from: %s" % url)
            r = requests.get(url)
            # The page is encoded as gb2312; BeautifulSoup only handles Unicode,
            # so the content must be decoded first.
            # Fix: ignore bytes that cannot be decoded
            soup = BeautifulSoup(r.content.decode('gbk', 'ignore'), 'html.parser')
            pg_num = soup.find("font", color="FF0000").text
            print("title: %s,\npage: %s" % (soup.title.text, pg_num))
        except Exception as e:
            print(e)
            print(" error, retry after 1 sec...")
            time.sleep(1)
            continue

        # Skip pages whose archive folder already exists
        if not os.path.exists(os.path.join(os.getcwd(), pg_num)):
            # Find the links to the ebook download pages
            # TODO: this matching is rather crude; improve it later
            for page in soup.find_all(attrs={"align": "left", "width": "284"}):
                url = urlparse.urljoin(url_base, page.a['href'])
                print("download : %s" % url)
                parsepage(url)
            # Archive the current page: move the downloaded archives into a folder
            os.system("mkdir \"%s\"" % pg_num)
            os.system("move *.rar \"%s\"" % pg_num)
        else:
            print("existing folder %s, skip..." % pg_num)

        # Find the "下一页" ("next page") link; stop when there is none
        next_page = soup.find("a", text=u"下一页")
        if next_page is None:
            print("Done!")
            break
        url = urlparse.urljoin(url_base, next_page['href'])

# Parse an ebook's download page and fetch the file
def parsepage(url):
    r = requests.get(url)
    # Fix: ignore bytes that cannot be decoded
    s = BeautifulSoup(r.content.decode('gbk', 'ignore'), 'html.parser')
    print("download book: %s" % s.title.text)
    # "下载地址1" is the "download link 1" anchor text
    link = s.find("a", text=u"下载地址1")
    pdf = urlparse.urljoin(url_base, link['href'])
    print("url: %s" % pdf)
    downloadfile(pdf)

if __name__ == '__main__':
    main()
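
A typical invocation, matching the example URL from the script's own help text:

python fetch61eda.py http://www.61eda.com/Soft/book/cnbook/

The script walks the listing pages one by one, fetches each ebook through wget, moves the downloaded .rar files into a per-page folder, and follows the "下一页" (next page) link until none remains. The mkdir/move shell commands imply it was written for a Windows environment.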