Solidot Daily
#!/usr/bin/env bash
# Build the daily Solidot issue with calibre and mail it to each recipient.

# Recipients of the daily issue (bash array, one address per element).
deliver_target=([email protected] [email protected])
smtp_account="[email protected]"
smtp_password="******"
issue_archive_dir="/srv/ebook-deliver/archives"
issue_name="solidot-$(date +%Y%m%d)"
current_dir=$(dirname "$0")

# Build today's issue from the recipe; log the converter output.
ebook-convert "$current_dir/solidot.recipe" \
    "$issue_archive_dir/$issue_name.mobi" \
    --authors="Solidot" >> "$issue_archive_dir/$issue_name.log" 2>&1

# Mail the generated .mobi to every recipient via Gmail's SMTP relay.
for target_email in "${deliver_target[@]}"
do
    calibre-smtp --attachment "$issue_archive_dir/$issue_name.mobi" \
        --relay smtp.gmail.com --port 587 \
        --username "$smtp_account" \
        --password "$smtp_password" \
        --subject "$issue_name for $target_email" \
        --encryption-method TLS "$smtp_account" "$target_email" \
        "$issue_name"
done
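The script produces one issue per day, so it is presumably meant to be run from cron. A minimal sketch, assuming the script above is saved as /srv/ebook-deliver/deliver.sh (both the path and the filename are assumptions, not taken from the gist):

# hypothetical crontab entry: build and mail the issue every morning at 07:00
0 7 * * * /srv/ebook-deliver/deliver.sh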
solidot.recipe
# -*- coding: utf-8 -*-
import re
import datetime
import urlparse

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString
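
# Module-level helpers: find_strings keeps only the plain-text children of a
# tag's contents, strip_tags flattens a soup fragment down to its text, and
# yesterday() gives the issue date that fetch_index limits itself to.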
find_strings = lambda contents: [content for content in contents
                                 if isinstance(content, NavigableString)]
strip_tags = lambda soup: "".join([e for e in soup.recursiveChildGenerator()
                                   if isinstance(e, unicode)])
yesterday = lambda: datetime.date.today() - datetime.timedelta(days=1)

class SolidotRecipe(BasicNewsRecipe):

    title = u"Solidot"
    description = u"奇客的资讯,重要的东西"
    language = "zh"
    publication_type = "newspaper"
    no_stylesheets = True
    remove_javascript = True

    def parse_index(self):
        return [(cat_name, list(self.fetch_index(cat_url)))
                for cat_name, cat_url in self.fetch_categories()]

    def preprocess_html(self, soup):
        return self.parse_article(soup)
    # ---------------------
    # custom implementation
    # ---------------------

    base_url = "http://www.solidot.org/"

    def fetch_categories(self):
        soup = self.index_to_soup(self.base_url)
        items = soup.find("ul", {"class": "left_ul"}).findAll("li")
        for li in items[1:]:
            anchor = li.find("a")
            yield anchor.string, anchor["href"]
    def fetch_index(self, cat_url):
        # current category is home or not
        is_home = cat_url.strip("/") == self.base_url.strip("/")

        # do not fetch one article two times
        if not hasattr(self, "solidot_fetching_histories"):
            self.solidot_fetching_histories = set()
        histories = self.solidot_fetching_histories

        # limit by issue date (yesterday)
        cat_url = "%s?issue=%s" % (cat_url, yesterday().strftime("%Y%m%d"))

        soup = self.index_to_soup(cat_url)
        center_block = soup.find("div", {"id": "center"})
        index_blocks = center_block.findAll("div", {"class": "block_m"})

        for index_block in index_blocks:
            headline = index_block.find("h2").find("a", recursive=False)
            talktime = index_block.find("div", {"class": "talk_time"})
            pcontent = index_block.find("div", {"class": "p_content"})

            time_string = [s for s in find_strings(talktime.contents)
                           if s.strip()][0]
            time = self.parse_time(time_string)
            url = urlparse.urljoin(self.base_url, headline["href"])
            description = strip_tags(pcontent).strip()

            # the home should include non-tagged articles only
            if is_home and index_block.find("h2").find("a") is not headline:
                continue

            item = {"title": headline.string,
                    "url": url,
                    "description": description,
                    "date": unicode(time)}
            if item["url"] not in histories:
                histories.add(item["url"])
                yield item
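
    # Assumption: Solidot's talk_time strings look something like
    # u"2013年07月10日 08时30分"; the \w after each numeric group is meant to
    # swallow the CJK unit character (年/月/日/时/分) that follows it.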
    #: the regex pattern to parse time
    re_parse_time = re.compile(r"(?P<year>\d{4})\w"
                               r"(?P<month>[01][0-9])\w"
                               r"(?P<day>[0-3][0-9])\w"
                               r"\s*"
                               r"(?P<hour>[0-2]\d)\w"
                               r"(?P<minute>[0-6]\d)\w",
                               flags=re.UNICODE)

    def parse_time(self, time_string):
        match = self.re_parse_time.search(time_string.strip())
        if not match:
            raise ValueError(time_string)
        time_fields = {name: int(match.group(name).encode("ascii"))
                       for name in ("year", "month", "day", "hour", "minute")}
        return datetime.datetime(**time_fields)
    def parse_article(self, old_soup):
        new_soup = BeautifulSoup()

        # merge old content
        new_soup.insert(0, old_soup.find("div", {"class": "block_m"}))

        # extract and parse time
        talktime = new_soup.find("div", {"class": "talk_time"})
        for child in talktime.findChildren():
            child.extract()
        time = self.parse_time("".join(unicode(c) for c in talktime.contents))

        # create cleaned time tag
        new_talktime = Tag(new_soup, "p", [
            ("class", "talk_time"),
            ("style", "line-height: 2em; color: #999")
        ])
        new_talktime.insert(0, NavigableString(u"发布于 %s" % time))
        talktime.replaceWith(new_talktime)

        # clean useless tag
        new_soup.find("div", {"class": "mid_bgtittle"}).extract()

        # clean anchors
        for old_anchor in new_soup.findAll("a"):
            new_anchor = Tag(new_soup, "a", [("href", old_anchor["href"])])
            new_anchor.insert(0, NavigableString(strip_tags(old_anchor)))
            old_anchor.replaceWith(new_anchor)

        return new_soup

# vim:set ft=python:
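To iterate on the recipe itself without sending any mail, it can be converted directly, just as the delivery script does; to the best of my knowledge calibre's --test flag limits a recipe download to a couple of articles per feed, which keeps the turnaround short (the output filename here is arbitrary):

ebook-convert solidot.recipe solidot-test.mobi --test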