import grabber

b = grabber.Book(link)
b.write_file()
grabber.email_book(b,
                   "[email protected]",
                   "[email protected]",
                   "smtp.mymailserver.com",
                   "[email protected]",
                   "mypassword")
grabber.py: build a monolithic book file from multiple HTML pages.
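The script walks every page of a thread on 19lou.com or mama.cn, extracts the post text with BeautifulSoup, optionally strips leftover markup with bleach, writes everything to one UTF-8 text file named after the thread's tid, and can then email that file to a Kindle. It targets Python 2 and needs BeautifulSoup 3 and bleach installed.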
##########################
# Import multi-page HTML
# books and write to file
##########################
from BeautifulSoup import BeautifulSoup
import urllib2
import re
import itertools
import bleach
from urlparse import urlparse
import sys
from email.MIMEMultipart import MIMEMultipart
from email.MIMEBase import MIMEBase
from email import Encoders
import smtplib

# Per-site parsing rules: the page encoding plus the tag/attribute
# pairs that wrap the book text on each supported forum.
PARSER = {
    '19lou': {
        'encoding': 'gbk',
        'elements': [
            {'element': 'div',
             'attr': 'class',
             'attr_value': 'article-bd'},
            {'element': 'p',
             'attr': 'class',
             'attr_value': 'f14'}]},
    'mama': {
        'encoding': 'gbk',
        'elements': [
            {'element': 'div',
             'attr': 'class',
             'attr_value': 'f14'}]}}

def email_book(obj,
               to_addr=None,
               from_addr=None,
               server=None,
               user=None,
               password=None):
    '''
    Emails book file to kindle as an attachment
    '''
    msg = MIMEMultipart()
    msg['Subject'] = obj.book_name
    msg['From'] = from_addr
    msg['To'] = to_addr
    part = MIMEBase('application', "octet-stream")
    # Read the finished book file and close the handle promptly
    with open(obj.book_name, "rb") as book_file:
        part.set_payload(book_file.read())
    Encoders.encode_base64(part)
    part.add_header('Content-Disposition',
                    'attachment; filename="%s"' % obj.book_name)
    msg.attach(part)
    smtp = smtplib.SMTP(server)
    smtp.ehlo()
    smtp.login(user, password)
    smtp.sendmail(from_addr,
                  to_addr,
                  msg.as_string())
    smtp.quit()

def printer(data):
    """
    Prints data to stdout on one line dynamically
    """
    # \r returns to column 0; \x1b[K erases to the end of the line
    sys.stdout.write("\r\x1b[K" + str(data))
    sys.stdout.flush()

class Book(object):
    '''
    Combines multiple HTML pages into one book
    text file
    '''
    def __init__(self, url, encoding="gbk", cleanup=True):
        '''
        Sets up the initial data
        '''
        self.url = self.clean_url(url)
        self.cleanup = cleanup
        self.domain = self.get_domain()
        if self.domain is None:
            raise ValueError("no parser configured for %s" % self.url)
        # Fall back to the encoding argument if a site entry omits one
        self.encoding = PARSER[self.domain].get('encoding', encoding)
        self.book_name = self.get_book_name()
        self.max_page = self.get_max_page()
        self.elements = PARSER[self.domain]['elements']

    def get_domain(self):
        '''
        Determines source domain to set parsing methods
        '''
        parsed_url = urlparse(self.url)
        if "19lou.com" in parsed_url.netloc:
            return "19lou"
        elif "mama.cn" in parsed_url.netloc:
            return "mama"
        return None

    def clean_url(self, url):
        '''
        Cleans up a url
        '''
        if not url.startswith("http://"):
            url = "http://" + url
        # Drop any existing page number; pages are re-appended later
        return re.sub(r'&page=\d*', '', url)

    def gimme_text(self):
        '''
        Uses BeautifulSoup to get the actual book content
        '''
        data = self.parse_page()
        content_list = list()
        for element in self.elements:
            content = data.findAll(element['element'],
                                   {element['attr']: element['attr_value']})
            if self.cleanup:
                for val in content:
                    text = val.renderContents()
                    # Convert explicit line breaks before bleach strips all tags
                    text = text.replace("<br />", "\n")
                    text = text.replace("<br>", "\n")
                    content_list.append(
                        bleach.clean(
                            text,
                            strip=True))
            else:
                # Keep the raw tags, one list entry per matched element
                content_list.extend(content)
        return content_list

    def gimme_data(self):
        '''
        Fetches the raw HTML for the current page URL
        '''
        return urllib2.urlopen(self.curr_url).read()

    def parse_page(self):
        '''
        Sets up BeautifulSoup parse
        '''
        return BeautifulSoup(self.gimme_data(),
                             fromEncoding=self.encoding)

    def get_max_page(self):
        '''
        Finds the max page from paginator links as number
        '''
        self.curr_url = self.url
        # The first <option> in the paginator reads like "1/12"
        page_string = self.parse_page().find("option").contents[0]
        return int(page_string.split("/")[1])

    def enumerate_pages(self):
        '''
        Enumerates book pages and gets content
        '''
        text_list = list()
        for page in range(1, self.max_page + 1):
            self.curr_url = self.url + "&page=%d" % page
            printer("Getting %s" % self.curr_url)
            text_list.append(self.gimme_text())
        # Flatten the per-page lists into one list of passages
        return list(itertools.chain(*text_list))

    def get_book_name(self):
        '''
        Grabs book name from the tid part of the query string
        '''
        result = re.search(r'&tid=(?P<book>\d+)', self.url)
        return result.group('book') + '.txt'

    def write_file(self):
        '''
        Iterates over text data and writes to file
        '''
        with open(self.book_name, 'w') as f:
            for element in self.enumerate_pages():
                f.write(unicode(element).encode('utf8', "replace"))
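
Supporting another forum means adding a PARSER entry plus a matching branch in get_domain. A minimal sketch, assuming a hypothetical site example.com whose post bodies sit in <div class="post-text"> and which uses the same &page=/&tid= query parameters:

# Hypothetical entry: 'example', the encoding, and the selector are
# illustrative assumptions, not a site the script actually supports.
PARSER['example'] = {
    'encoding': 'utf-8',
    'elements': [
        {'element': 'div',
         'attr': 'class',
         'attr_value': 'post-text'}]}

# get_domain would then need one more branch:
#     elif "example.com" in parsed_url.netloc:
#         return "example"

Note that get_max_page additionally assumes the site's paginator exposes an <option> whose text looks like "1/12", so a new site would have to match that convention as well.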