@mutaku
Created October 7, 2012 22:25
Build a monolithic book file from multiple HTML pages.
##########################
# Import multi-page HTML
# books and write to file
##########################
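# Runs on Python 2: it uses BeautifulSoup 3, urllib2 and the legacy
# email module paths; third-party packages are BeautifulSoup (v3) and bleach.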
from BeautifulSoup import BeautifulSoup
import urllib2
import re
import itertools
import bleach
from urlparse import urlparse
import sys
from email.MIMEMultipart import MIMEMultipart
from email.MIMEBase import MIMEBase
from email import Encoders
import smtplib
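
# Per-site parsing rules: the page encoding plus the tag/attribute
# pairs that wrap the book text on each supported domain.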
PARSER = {
    '19lou': {
        'encoding': 'gbk',
        'elements': [
            {'element': 'div',
             'attr': 'class',
             'attr_value': 'article-bd'},
            {'element': 'p',
             'attr': 'class',
             'attr_value': 'f14'}]},
    'mama': {
        'encoding': 'gbk',
        'elements': [
            {'element': 'div',
             'attr': 'class',
             'attr_value': 'f14'}]}}

def email_book(obj,
               to_addr=None,
               from_addr=None,
               server=None,
               user=None,
               password=None):
    '''
    Emails book file to kindle
    '''
    msg = MIMEMultipart()
    msg['Subject'] = obj.book_name
    msg['From'] = from_addr
    msg['To'] = to_addr
    part = MIMEBase('application', "octet-stream")
    with open(obj.book_name, "rb") as book_file:
        part.set_payload(book_file.read())
    Encoders.encode_base64(part)
    part.add_header('Content-Disposition',
                    'attachment; filename="%s"' % obj.book_name)
    msg.attach(part)
    smtp = smtplib.SMTP(server)
    smtp.ehlo()
    smtp.login(user, password)
    smtp.sendmail(from_addr,
                  to_addr,
                  msg.as_string())
    smtp.close()


def printer(data):
    """
    Prints data to stdout on one line dynamically
    """
    sys.stdout.write("\r\x1b[K" + str(data))
    sys.stdout.flush()


class Book(object):
    '''
    Combines multiple HTML pages into one book
    text file
    '''
    def __init__(self, url, encoding="gbk", cleanup=True):
        '''
        Sets up the initial data
        '''
        self.url = self.clean_url(url)
        self.cleanup = cleanup
        self.domain = self.get_domain()
        self.encoding = PARSER[self.domain]['encoding']
        self.book_name = self.get_book_name()
        self.max_page = self.get_max_page()
        self.elements = PARSER[self.domain]['elements']

    def get_domain(self):
        '''
        Determines source domain to set parsing methods
        '''
        parsed_url = urlparse(self.url)
        if "19lou.com" in parsed_url.netloc:
            return "19lou"
        elif "mama.cn" in parsed_url.netloc:
            return "mama"
        return None

    def clean_url(self, url):
        '''
        Cleans up a url
        '''
        if not url.startswith("http://"):
            url = "http://" + url
        # Strip any existing page parameter so pagination can be rebuilt cleanly
        return re.sub(r'&page=\d*', '', url)

    def gimme_text(self):
        '''
        Uses BeautifulSoup to get the actual book content
        '''
        data = self.parse_page()
        content_list = list()
        for element in self.elements:
            content = data.findAll(element['element'],
                                   {element['attr']: element['attr_value']})
            if self.cleanup:
                for val in content:
                    text = val.renderContents()
                    text = text.replace("<br />", "\n")
                    text = text.replace("<br>", "\n")
                    content_list.append(
                        bleach.clean(
                            text,
                            strip=True))
            else:
                content_list.append(content)
        return content_list

    def gimme_data(self):
        '''
        Fetches the raw HTML for the current page URL
        '''
        return urllib2.urlopen(self.curr_url).read()

    def parse_page(self):
        '''
        Sets up BeautifulSoup parse
        '''
        return BeautifulSoup(self.gimme_data(),
                             fromEncoding=self.encoding)

    def get_max_page(self):
        '''
        Finds the max page from paginator links as number
        '''
        self.curr_url = self.url
        page_string = self.parse_page().find("option").contents[0]
        return int(page_string.split("/")[1])

    def enumerate_pages(self):
        '''
        Enumerates book pages and gets content
        '''
        text_list = list()
        for page in range(1, self.max_page + 1):
            self.curr_url = self.url + "&page=%d" % page
            printer("Getting %s" % self.curr_url)
            text_list.append(self.gimme_text())
        return list(itertools.chain(*text_list))

    def get_book_name(self):
        '''
        Grabs book name from TID part of query string
        '''
        result = re.search(r'&tid=(?P<book>\d+)', self.url)
        return result.group('book') + '.txt'

    def write_file(self):
        '''
        Iterates over text data and writes to file
        '''
        with open(self.book_name, 'w') as f:
            for element in self.enumerate_pages():
                f.write(unicode(element).encode('utf8', "replace"))
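

# Example usage (a minimal sketch): the URL below is a placeholder that only
# illustrates the required "&tid=" parameter, and the addresses and SMTP
# settings are stand-ins for your own Kindle, sender and mail server details.
if __name__ == '__main__':
    book = Book("http://www.19lou.com/some-thread?foo=1&tid=12345678")
    book.write_file()
    email_book(book,
               to_addr="your_name@kindle.com",
               from_addr="you@example.com",
               server="smtp.example.com",
               user="you@example.com",
               password="your-password")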