Created
November 24, 2017 09:06
-
-
Save luanvuhlu/a34ca411b3e738c36e2274f20522093d to your computer and use it in GitHub Desktop.
Download books
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import urllib2 | |
from bs4 import BeautifulSoup | |
BASE_URL = 'http://isach.info/mobile/story.php?story=tru_tien_2__tieu_dinh&chapter=' | |
CHAPTER_START = 2 | |
def main():
    """Download every chapter and save the assembled book as test.html."""
    write_file("test", get_book_content(get_chapters_arr()))
def write_file(name, content):
    """Write *content* to "<name>.html", UTF-8 encoded.

    Opens the file in binary mode so the explicitly encoded bytes are
    written unchanged, and uses a context manager so the handle is
    closed even if the write raises (the original leaked it on error).
    """
    with open("%s.html" % name, "wb") as text_file:
        text_file.write(content.encode('utf8'))
def get_book_content(chapters):
    """Wrap the chapter HTML fragments in a minimal HTML document.

    chapters: iterable of HTML strings, one per chapter.
    Returns the whole document as a single string.
    """
    book_content_arr = ["<html>", "<body>"]
    for chapter in chapters:
        book_content_arr.append(chapter)
    # Close tags in reverse nesting order: </body> before </html>.
    # (The original emitted them swapped, producing mis-nested HTML.)
    book_content_arr.extend(["</body>", "</html>"])
    return ''.join(book_content_arr)
def get_chapters_arr(start=2, end=133):
    """Download chapters start .. end-1 and return them as a list.

    The defaults reproduce the original hard-coded range(2, 133), so
    existing zero-argument callers behave exactly as before.
    """
    return [get_chapter(chapter_index) for chapter_index in range(start, end)]
def get_chapter(chapter_index):
    """Fetch one chapter page and return its title + body as pretty HTML.

    Removed the leftover debug statement `print type(chapter_name)`
    that the original left behind with a TODO marker.
    """
    url = get_url(chapter_index)
    page = urllib2.urlopen(url).read()
    soup = BeautifulSoup(page, "html.parser")
    # The title lives in a .ms_chapter node; the body text is split
    # across several .ms_text nodes that are concatenated in page order.
    chapter_name = soup.find(class_='ms_chapter')
    chapter_contents = ''.join(
        content.prettify() for content in soup.find_all(class_='ms_text'))
    # NOTE(review): chapter_name is None if the page layout changes;
    # .prettify() would then raise AttributeError — confirm acceptable.
    return chapter_name.prettify() + chapter_contents
def get_url(chapter_index):
    """Build a chapter URL; the index is zero-padded to four digits."""
    return '{0}{1:04d}'.format(BASE_URL, chapter_index)
if __name__ == '__main__':
    # Run the scraper only when executed as a script, not on import.
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pyquery import PyQuery as pq | |
from lxml import etree | |
import urllib2 | |
def next_chapter_list_page(chapters_list_page_pq):
    """Return the href of the pagination entry after the active page."""
    active = chapters_list_page_pq('#list-chapter ul.pagination li.active')
    return active.next().find('a').attr.href
def parse_list_chapters(chapters_list_page_pq):
    """Yield the chapter URLs listed on one chapter-index page."""
    anchors = chapters_list_page_pq('#list-chapter ul.list-chapter li a')
    for anchor in anchors:
        yield anchor.get('href')
def parse_chapter_content(chapter_pq):
    """Extract a chapter's title and body HTML from its page.

    Returns {'name': <heading markup>, 'content': <body html>}.
    """
    title = chapter_pq('.container.chapter a.chapter-title').text()
    heading = "<p class='chapter-name'><h2>%s</h2></p>" % title
    body = chapter_pq('.container.chapter .chapter-c').html()
    return {'name': heading, 'content': body}
def has_next_chapters_list_page(chapters_list_page_pq):
    # Truthy (non-empty PyQuery selection) when a "next page" arrow icon
    # exists in the pagination bar; parse() uses the result as a boolean.
    return chapters_list_page_pq('#list-chapter ul.pagination li a span.glyphicon-menu-right')
def write_file(name, content):
    """Write *content* to the file *name*, UTF-8 encoded.

    Opens the file in binary mode so the explicitly encoded bytes are
    written unchanged, and uses a context manager so the handle is
    closed even if the write raises (the original leaked it on error).
    """
    with open(name, "wb") as text_file:
        text_file.write(content.encode('utf8'))
def get_book_content(chapters):
    """Assemble chapter dicts into one HTML document string.

    chapters: iterable of {'name': ..., 'content': ...} HTML fragments
    as produced by parse_chapter_content().
    """
    parts = ["<html>", "<body>"]
    for chapter in chapters:
        parts.append(chapter['name'] + chapter['content'])
    # Close tags in reverse nesting order: </body> before </html>.
    # (The original emitted them swapped, producing mis-nested HTML.)
    parts.extend(["</body>", "</html>"])
    return ''.join(parts)
def get_source(url):
    """Fetch *url* and return the raw response body.

    Best-effort: on any error the exception is printed and the function
    falls through to an implicit None return.
    """
    try:
        print(url)  # progress trace, one line per request
        response = urllib2.urlopen(url)
        return response.read()
    except Exception as inst:
        print(inst)
def parse(url):
    """Walk the paginated chapter index starting at *url*.

    Downloads every chapter linked from each index page, then follows
    the "next page" link until none remains. Returns the list of
    chapter dicts produced by parse_chapter_content().
    """
    chapters = []
    # `while url` (the original's `while True and url` — the True was
    # redundant) also stops if next_chapter_list_page yields no href.
    while url:
        source_pq = pq(get_source(url))
        for chapter_url in parse_list_chapters(source_pq):
            chapter_pq = pq(get_source(chapter_url))
            chapters.append(parse_chapter_content(chapter_pq))
        if has_next_chapters_list_page(source_pq):
            url = next_chapter_list_page(source_pq)
        else:
            break
    return chapters
def main(): | |
url = 'http://truyenfull.vn/luc-tien/' | |
chapters = parse(url) | |
book_content = get_book_content(chapters) | |
write_file("luc tien.html", book_content) | |
print "DONE" | |
if __name__ == '__main__':
    # Run the crawler only when executed as a script, not on import.
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment