Solidot Daily
#!/usr/bin/env bash
# Build the daily Solidot issue with calibre and mail it to each recipient.

# Recipients of the daily issue (bash array, one address per element).
deliver_target=([email protected] [email protected])
smtp_account="[email protected]"
smtp_password="******"
issue_archive_dir="/srv/ebook-deliver/archives"
issue_name="solidot-$(date +%Y%m%d)"
current_dir=$(dirname "$0")

# Build today's issue from the recipe; log the converter output.
ebook-convert "$current_dir/solidot.recipe" \
    "$issue_archive_dir/$issue_name.mobi" \
    --authors="Solidot" >> "$issue_archive_dir/$issue_name.log" 2>&1

# Mail the generated .mobi to every recipient via Gmail's SMTP relay.
for target_email in "${deliver_target[@]}"
do
    calibre-smtp --attachment "$issue_archive_dir/$issue_name.mobi" \
        --relay smtp.gmail.com --port 587 \
        --username "$smtp_account" \
        --password "$smtp_password" \
        --subject "$issue_name for $target_email" \
        --encryption-method TLS "$smtp_account" "$target_email" \
        "$issue_name"
done
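The script produces one issue per day, so it is presumably meant to be run from cron. A minimal sketch, assuming the script above is saved as /srv/ebook-deliver/deliver.sh (both the path and the filename are assumptions, not taken from the gist):

# hypothetical crontab entry: build and mail the issue every morning at 07:00
0 7 * * * /srv/ebook-deliver/deliver.sh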
solidot.recipe
# -*- coding: utf-8 -*-
import re
import datetime
import urlparse

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString
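
# Module-level helpers: find_strings keeps only the plain-text children of a
# tag's contents, strip_tags flattens a soup fragment down to its text, and
# yesterday() gives the issue date that fetch_index limits itself to.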
find_strings = lambda contents: [content for content in contents
                                 if isinstance(content, NavigableString)]
strip_tags = lambda soup: "".join([e for e in soup.recursiveChildGenerator()
                                   if isinstance(e, unicode)])
yesterday = lambda: datetime.date.today() - datetime.timedelta(days=1)

class SolidotRecipe(BasicNewsRecipe):

    title = u"Solidot"
    description = u"奇客的资讯,重要的东西"
    language = "zh"
    publication_type = "newspaper"
    no_stylesheets = True
    remove_javascript = True

    def parse_index(self):
        return [(cat_name, list(self.fetch_index(cat_url)))
                for cat_name, cat_url in self.fetch_categories()]

    def preprocess_html(self, soup):
        return self.parse_article(soup)
    # ---------------------
    # custom implementation
    # ---------------------

    base_url = "http://www.solidot.org/"

    def fetch_categories(self):
        soup = self.index_to_soup(self.base_url)
        items = soup.find("ul", {"class": "left_ul"}).findAll("li")
        for li in items[1:]:
            anchor = li.find("a")
            yield anchor.string, anchor["href"]
    def fetch_index(self, cat_url):
        # current category is home or not
        is_home = cat_url.strip("/") == self.base_url.strip("/")

        # do not fetch one article two times
        if not hasattr(self, "solidot_fetching_histories"):
            self.solidot_fetching_histories = set()
        histories = self.solidot_fetching_histories

        # limit by issue date (yesterday)
        cat_url = "%s?issue=%s" % (cat_url, yesterday().strftime("%Y%m%d"))

        soup = self.index_to_soup(cat_url)
        center_block = soup.find("div", {"id": "center"})
        index_blocks = center_block.findAll("div", {"class": "block_m"})

        for index_block in index_blocks:
            headline = index_block.find("h2").find("a", recursive=False)
            talktime = index_block.find("div", {"class": "talk_time"})
            pcontent = index_block.find("div", {"class": "p_content"})

            time_string = [s for s in find_strings(talktime.contents)
                           if s.strip()][0]
            time = self.parse_time(time_string)
            url = urlparse.urljoin(self.base_url, headline["href"])
            description = strip_tags(pcontent).strip()

            # the home should include non-tagged articles only
            if is_home and index_block.find("h2").find("a") is not headline:
                continue

            item = {"title": headline.string,
                    "url": url,
                    "description": description,
                    "date": unicode(time)}
            if item["url"] not in histories:
                histories.add(item["url"])
                yield item
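
    # Assumption: Solidot's talk_time strings look something like
    # u"2013年07月10日 08时30分"; the \w after each numeric group is meant to
    # swallow the CJK unit character (年/月/日/时/分) that follows it.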
    #: the regex pattern to parse time
    re_parse_time = re.compile(r"(?P<year>\d{4})\w"
                               r"(?P<month>[01][0-9])\w"
                               r"(?P<day>[0-3][0-9])\w"
                               r"\s*"
                               r"(?P<hour>[0-2]\d)\w"
                               r"(?P<minute>[0-6]\d)\w",
                               flags=re.UNICODE)

    def parse_time(self, time_string):
        match = self.re_parse_time.search(time_string.strip())
        if not match:
            raise ValueError(time_string)
        time_fields = {name: int(match.group(name).encode("ascii"))
                       for name in ("year", "month", "day", "hour", "minute")}
        return datetime.datetime(**time_fields)
    def parse_article(self, old_soup):
        new_soup = BeautifulSoup()

        # merge old content
        new_soup.insert(0, old_soup.find("div", {"class": "block_m"}))

        # extract and parse time
        talktime = new_soup.find("div", {"class": "talk_time"})
        for child in talktime.findChildren():
            child.extract()
        time = self.parse_time("".join(unicode(c) for c in talktime.contents))

        # create cleaned time tag
        new_talktime = Tag(new_soup, "p", [
            ("class", "talk_time"),
            ("style", "line-height: 2em; color: #999")
        ])
        new_talktime.insert(0, NavigableString(u"发布于 %s" % time))
        talktime.replaceWith(new_talktime)

        # clean useless tag
        new_soup.find("div", {"class": "mid_bgtittle"}).extract()

        # clean anchors
        for old_anchor in new_soup.findAll("a"):
            new_anchor = Tag(new_soup, "a", [("href", old_anchor["href"])])
            new_anchor.insert(0, NavigableString(strip_tags(old_anchor)))
            old_anchor.replaceWith(new_anchor)

        return new_soup

# vim:set ft=python:
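To iterate on the recipe itself without sending any mail, it can be converted directly, just as the delivery script does; to the best of my knowledge calibre's --test flag limits a recipe download to a couple of articles per feed, which keeps the turnaround short (the output filename here is arbitrary):

ebook-convert solidot.recipe solidot-test.mobi --test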