Solidot Daily
#!/usr/bin/env bash
# Build the daily Solidot issue with calibre and mail it to each recipient.

deliver_target=([email protected] [email protected])
smtp_account="[email protected]"
smtp_password="******"
issue_archive_dir="/srv/ebook-deliver/archives"
issue_name="solidot-$(date +%Y%m%d)"
current_dir=$(dirname "$0")

# Convert the recipe into a .mobi issue; append both stdout and stderr to the log.
ebook-convert "$current_dir/solidot.recipe" \
    "$issue_archive_dir/$issue_name.mobi" \
    --authors="Solidot" >> "$issue_archive_dir/$issue_name.log" 2>&1

# Send the issue to every recipient through Gmail's SMTP relay.
# Note: "${deliver_target[@]}" expands to all array elements; a bare
# $deliver_target would expand to the first element only.
for target_email in "${deliver_target[@]}"
do
    calibre-smtp --attachment "$issue_archive_dir/$issue_name.mobi" \
        --relay smtp.gmail.com --port 587 \
        --username "$smtp_account" \
        --password "$smtp_password" \
        --subject "$issue_name for $target_email" \
        --encryption-method TLS "$smtp_account" "$target_email" \
        "$issue_name"
done
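
The script is meant to run unattended once a day. A crontab entry along these lines would schedule it; the 06:00 time and the script path are assumptions, not part of this gist:

# hypothetical schedule: build and deliver the issue at 06:00 every day
0 6 * * * /srv/ebook-deliver/deliver-solidot.sh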
# -*- coding: utf-8 -*-
import re
import datetime
import urlparse

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString

# pick the plain-text nodes out of a tag's direct children
find_strings = lambda contents: [content for content in contents
                                 if isinstance(content, NavigableString)]
# flatten a soup subtree into its concatenated text
strip_tags = lambda soup: "".join([e for e in soup.recursiveChildGenerator()
                                   if isinstance(e, unicode)])
yesterday = lambda: datetime.date.today() - datetime.timedelta(days=1)
class SolidotRecipe(BasicNewsRecipe):
    title = u"Solidot"
    description = u"奇客的资讯,重要的东西"  # "News for geeks, stuff that matters"
    language = "zh"
    publication_type = "newspaper"
    no_stylesheets = True
    remove_javascript = True

    def parse_index(self):
        return [(cat_name, list(self.fetch_index(cat_url)))
                for cat_name, cat_url in self.fetch_categories()]

    def preprocess_html(self, soup):
        return self.parse_article(soup)

    # ---------------------
    # custom implementation
    # ---------------------

    base_url = "http://www.solidot.org/"
    def fetch_categories(self):
        soup = self.index_to_soup(self.base_url)
        items = soup.find("ul", {"class": "left_ul"}).findAll("li")
        for li in items[1:]:  # skip the first list item
            anchor = li.find("a")
            yield anchor.string, anchor["href"]
    def fetch_index(self, cat_url):
        # is the current category the home page?
        is_home = cat_url.strip("/") == self.base_url.strip("/")
        # do not fetch the same article twice
        if not hasattr(self, "solidot_fetching_histories"):
            self.solidot_fetching_histories = set()
        histories = self.solidot_fetching_histories
        # limit to yesterday's issue
        cat_url = "%s?issue=%s" % (cat_url, yesterday().strftime("%Y%m%d"))
        soup = self.index_to_soup(cat_url)
        center_block = soup.find("div", {"id": "center"})
        index_blocks = center_block.findAll("div", {"class": "block_m"})
        for index_block in index_blocks:
            headline = index_block.find("h2").find("a", recursive=False)
            talktime = index_block.find("div", {"class": "talk_time"})
            pcontent = index_block.find("div", {"class": "p_content"})
            time_string = [s for s in find_strings(talktime.contents)
                           if s.strip()][0]
            time = self.parse_time(time_string)
            url = urlparse.urljoin(self.base_url, headline["href"])
            description = strip_tags(pcontent).strip()
            # the home page should include non-tagged articles only
            if is_home and index_block.find("h2").find("a") is not headline:
                continue
            item = {"title": headline.string,
                    "url": url,
                    "description": description,
                    "date": unicode(time)}
            if item["url"] not in histories:
                histories.add(item["url"])
                yield item
    #: the regex pattern to parse time; with re.UNICODE the \w between
    #: groups also matches the CJK separator characters in the timestamp
    re_parse_time = re.compile(r"(?P<year>\d{4})\w"
                               r"(?P<month>[01][0-9])\w"
                               r"(?P<day>[0-3][0-9])\w"
                               r"\s*"
                               r"(?P<hour>[0-2]\d)\w"
                               r"(?P<minute>[0-6]\d)\w",
                               flags=re.UNICODE)

    def parse_time(self, time_string):
        match = self.re_parse_time.search(time_string.strip())
        if not match:
            raise ValueError(time_string)
        time_fields = {name: int(match.group(name).encode("ascii"))
                       for name in ("year", "month", "day", "hour", "minute")}
        return datetime.datetime(**time_fields)
    def parse_article(self, old_soup):
        new_soup = BeautifulSoup()
        # merge old content
        new_soup.insert(0, old_soup.find("div", {"class": "block_m"}))
        # extract and parse time
        talktime = new_soup.find("div", {"class": "talk_time"})
        for child in talktime.findChildren():
            child.extract()
        time = self.parse_time("".join(unicode(c) for c in talktime.contents))
        # create a cleaned time tag
        new_talktime = Tag(new_soup, "p", [
            ("class", "talk_time"),
            ("style", "line-height: 2em; color: #999")
        ])
        new_talktime.insert(0, NavigableString(u"发布于 %s" % time))  # "Published at ..."
        talktime.replaceWith(new_talktime)
        # clean useless tag
        new_soup.find("div", {"class": "mid_bgtittle"}).extract()
        # rebuild anchors with their text only; tolerate anchors without href
        for old_anchor in new_soup.findAll("a"):
            new_anchor = Tag(new_soup, "a", [("href", old_anchor.get("href", ""))])
            new_anchor.insert(0, NavigableString(strip_tags(old_anchor)))
            old_anchor.replaceWith(new_anchor)
        return new_soup
# vim:set ft=python:
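
To try the recipe locally without mailing anything, ebook-convert can run it directly; its --test flag limits the download to a couple of articles per feed. A minimal check, assuming the recipe above is saved as solidot.recipe:

ebook-convert solidot.recipe solidot-test.mobi --test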