Created
November 24, 2017 09:06
-
-
Save luanvuhlu/a34ca411b3e738c36e2274f20522093d to your computer and use it in GitHub Desktop.
Download books
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import urllib2 | |
from bs4 import BeautifulSoup | |
BASE_URL = 'http://isach.info/mobile/story.php?story=tru_tien_2__tieu_dinh&chapter=' | |
CHAPTER_START = 2 | |
def main():
    """Download every chapter and save the assembled book as test.html."""
    write_file("test", get_book_content(get_chapters_arr()))
def write_file(name, content):
    """Write *content* to "<name>.html", UTF-8 encoded.

    Opens the file in binary mode so the explicitly encoded bytes are
    written unchanged, and uses a context manager so the handle is
    closed even if the write raises (the original leaked it on error).
    """
    with open("%s.html" % name, "wb") as text_file:
        text_file.write(content.encode('utf8'))
def get_book_content(chapters):
    """Wrap the chapter HTML fragments in a minimal HTML document.

    chapters: iterable of HTML strings, one per chapter.
    Returns the whole document as a single string.
    """
    book_content_arr = ["<html>", "<body>"]
    for chapter in chapters:
        book_content_arr.append(chapter)
    # Close tags in reverse nesting order: </body> before </html>.
    # (The original emitted them swapped, producing mis-nested HTML.)
    book_content_arr.extend(["</body>", "</html>"])
    return ''.join(book_content_arr)
def get_chapters_arr(start=2, end=133):
    """Download chapters start .. end-1 and return them as a list.

    The defaults reproduce the original hard-coded range(2, 133), so
    existing zero-argument callers behave exactly as before.
    """
    return [get_chapter(chapter_index) for chapter_index in range(start, end)]
def get_chapter(chapter_index):
    """Fetch one chapter page and return its title + body as pretty HTML.

    Removed the leftover debug statement `print type(chapter_name)`
    that the original left behind with a TODO marker.
    """
    url = get_url(chapter_index)
    page = urllib2.urlopen(url).read()
    soup = BeautifulSoup(page, "html.parser")
    # The title lives in a .ms_chapter node; the body text is split
    # across several .ms_text nodes that are concatenated in page order.
    chapter_name = soup.find(class_='ms_chapter')
    chapter_contents = ''.join(
        content.prettify() for content in soup.find_all(class_='ms_text'))
    # NOTE(review): chapter_name is None if the page layout changes;
    # .prettify() would then raise AttributeError — confirm acceptable.
    return chapter_name.prettify() + chapter_contents
def get_url(chapter_index):
    """Build a chapter URL; the index is zero-padded to four digits."""
    return '{0}{1:04d}'.format(BASE_URL, chapter_index)
if __name__ == '__main__':
    # Run the scraper only when executed as a script, not on import.
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pyquery import PyQuery as pq | |
from lxml import etree | |
import urllib2 | |
def next_chapter_list_page(chapters_list_page_pq):
    """Return the href of the pagination entry after the active page."""
    active = chapters_list_page_pq('#list-chapter ul.pagination li.active')
    return active.next().find('a').attr.href
def parse_list_chapters(chapters_list_page_pq):
    """Yield the chapter URLs listed on one chapter-index page."""
    anchors = chapters_list_page_pq('#list-chapter ul.list-chapter li a')
    for anchor in anchors:
        yield anchor.get('href')
def parse_chapter_content(chapter_pq):
    """Extract a chapter's title and body HTML from its page.

    Returns {'name': <heading markup>, 'content': <body html>}.
    """
    title = chapter_pq('.container.chapter a.chapter-title').text()
    heading = "<p class='chapter-name'><h2>%s</h2></p>" % title
    body = chapter_pq('.container.chapter .chapter-c').html()
    return {'name': heading, 'content': body}
def has_next_chapters_list_page(chapters_list_page_pq):
    # Truthy (non-empty PyQuery selection) when a "next page" arrow icon
    # exists in the pagination bar; parse() uses the result as a boolean.
    return chapters_list_page_pq('#list-chapter ul.pagination li a span.glyphicon-menu-right')
def write_file(name, content):
    """Write *content* to the file *name*, UTF-8 encoded.

    Opens the file in binary mode so the explicitly encoded bytes are
    written unchanged, and uses a context manager so the handle is
    closed even if the write raises (the original leaked it on error).
    """
    with open(name, "wb") as text_file:
        text_file.write(content.encode('utf8'))
def get_book_content(chapters):
    """Assemble chapter dicts into one HTML document string.

    chapters: iterable of {'name': ..., 'content': ...} HTML fragments
    as produced by parse_chapter_content().
    """
    parts = ["<html>", "<body>"]
    for chapter in chapters:
        parts.append(chapter['name'] + chapter['content'])
    # Close tags in reverse nesting order: </body> before </html>.
    # (The original emitted them swapped, producing mis-nested HTML.)
    parts.extend(["</body>", "</html>"])
    return ''.join(parts)
def get_source(url):
    """Fetch *url* and return the raw response body.

    Best-effort: on any error the exception is printed and the function
    falls through to an implicit None return.
    """
    try:
        print(url)  # progress trace, one line per request
        response = urllib2.urlopen(url)
        return response.read()
    except Exception as inst:
        print(inst)
def parse(url):
    """Walk the paginated chapter index starting at *url*.

    Downloads every chapter linked from each index page, then follows
    the "next page" link until none remains. Returns the list of
    chapter dicts produced by parse_chapter_content().
    """
    chapters = []
    # `while url` (the original's `while True and url` — the True was
    # redundant) also stops if next_chapter_list_page yields no href.
    while url:
        source_pq = pq(get_source(url))
        for chapter_url in parse_list_chapters(source_pq):
            chapter_pq = pq(get_source(chapter_url))
            chapters.append(parse_chapter_content(chapter_pq))
        if has_next_chapters_list_page(source_pq):
            url = next_chapter_list_page(source_pq)
        else:
            break
    return chapters
def main(): | |
url = 'http://truyenfull.vn/luc-tien/' | |
chapters = parse(url) | |
book_content = get_book_content(chapters) | |
write_file("luc tien.html", book_content) | |
print "DONE" | |
if __name__ == '__main__':
    # Run the crawler only when executed as a script, not on import.
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment