Created
December 9, 2013 11:33
-
-
Save yszou/7870934 to your computer and use it in GitHub Desktop.
获取内容的文件
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
'获取邮件' | |
import pickle | |
from time import time | |
from uuid import uuid4 | |
import os | |
import sys | |
import re | |
import datetime | |
from os.path import join | |
from lxml import etree | |
from functools import partial | |
import logging | |
from email.mime.text import MIMEText | |
from email.header import make_header | |
import email.utils | |
import tornado.ioloop | |
import tornado.gen | |
import tornado.httpclient | |
import tornado.escape | |
from service.pop_client import POP | |
from lib.eml_parse import parse | |
from timer.log import LocalLog | |
from config import ACCOUNT, RECORD, MAILBOX, LOCAL_LOG_DIR, DEBUG, MAX_ONCE | |
# Event loop instance used by every fetcher below for scheduling timers.
IL = tornado.ioloop.IOLoop.instance()

# Dedicated logger for this script. Propagation is switched off, so records
# are only emitted by the handlers attached here.
logger = logging.getLogger('AceFetch')
logger.propagate = False
logger.setLevel(logging.INFO)
fmt = logging.Formatter('--%(name)s|%(asctime)s|%(filename)s|%(lineno)s|%(levelname)s|%(message)s', '%m-%d %H:%M:%S')
stream_hd = logging.StreamHandler()
stream_hd.setFormatter(fmt)

#if sys.platform.startswith('linux'):
# syslog_hd = SysLogHandler(address='/dev/log', facility=SysLogHandler.LOG_LOCAL6)
# syslog_hd.setFormatter(fmt)
# syslog_hd.setLevel(logging.INFO)
# logger.addHandler(syslog_hd)

# logging.StreamHandler's keyword argument was called `strm` before
# Python 2.7 and `stream` from 2.7 onwards. Compare version_info instead of
# the prefix of the version string so that every interpreter newer than 2.7
# also takes the `stream` branch.
if sys.version_info >= (2, 7):
    local_hd = logging.StreamHandler(stream=LocalLog)
else:
    local_hd = logging.StreamHandler(strm=LocalLog)
local_hd.setFormatter(fmt)
local_hd.setLevel(logging.INFO)
logger.addHandler(local_hd)

# The plain stream handler is only attached when DEBUG is set.
if DEBUG:
    stream_hd.setLevel(logging.INFO)
    logger.addHandler(stream_hd)
def notify(count):
    """Pop up a desktop notification reporting `count` new mails.

    Runs the ``notify-send`` command through the shell; its exit status
    is not inspected.
    """
    template = 'notify-send -i /usr/share/libindicator/icons/hicolor/32x32/categories/applications-email-panel.png -t 1000 "有 %s 封新邮件"'
    os.system(template % count)
def get_box(msg, self):
    """Choose the mailbox folder name for a parsed message.

    The first matching rule wins, checked in this order: first ``to``
    address, first ``list`` entry, first ``from`` address, and finally the
    ``(host, user)`` pair of the account the mail came from.

    :param msg: mapping with ``'to'``, ``'list'`` and ``'from'`` entries;
        each is a sequence whose items carry the address at index 1.
    :param self: account object exposing ``host`` and ``user`` attributes.
    :returns: folder name, ``'0-inbox'`` when no rule matches.
    """
    list_map = {
        'python-cn.googlegroups.com': '3-CPyUG',
        'python-tornado.googlegroups.com': '4-Tornado',
        'angular.googlegroups.com': '5-AngularJS',
    }
    # NOTE(review): the three keys below are identical in this copy of the
    # file (the addresses look redacted), so only the last entry is
    # effective -- confirm against the real addresses.
    to_map = {
        '[email protected]': '3-CPyUG',
        '[email protected]': '4-Tornado',
        '[email protected]': '5-AngularJS',
    }
    from_map = {
        '[email protected]': '6-JIRA',
    }
    host_user_map = {
        ('mail.xxx.com', 'xxx'): '2-SOHU-INC',
    }

    # Header-based rules, in priority order.
    header_rules = (
        ('to', to_map),
        ('list', list_map),
        ('from', from_map),
    )

    box = None
    for field, table in header_rules:
        if box is not None:
            break
        entries = msg[field]
        if entries:
            # Only the first entry of each header is considered.
            box = table.get(entries[0][1], None)

    if box is None:
        box = host_user_map.get((self.host, self.user), None)
    return box or '0-inbox'
class DiscuzFetch(object):
    """Poll one Discuz forum board and store new posts as mail files.

    Subclasses configure a board through the class attributes below.
    ``LAST`` is a ``(forum id, thread id, post id)`` triple: the forum id is
    put into the board URL and the post id is the newest post that has
    already been delivered.
    """

    # File the progress triple is written to after each completed scan.
    LAST_FILE = ''
    # Progress triple (forum id, thread id, post id).
    LAST = (0, 0, 0)
    # URL of the forum's forum.php entry point.
    HOST = 'http://example.com/forum.php'
    # Mail folder; generated messages are written to its ``new`` subfolder.
    FOLDER = '/home/zys/Mail/0-Example/'
    # Short label used as subject prefix and inside Message-IDs.
    ID = 'Example'
    # Mailbox placed in the From header of generated messages.
    FROM = '[email protected]'
    # Seconds to wait before retrying a failed thread/page request.
    RETRY_DELAY = 5
    # Seconds between two scans of the board.
    POLL_INTERVAL = 30

    def __init__(self):
        """Copy the class-level configuration onto the instance."""
        self.last = self.__class__.LAST
        self.last_file = self.__class__.LAST_FILE
        self.host = self.__class__.HOST
        self.folder = self.__class__.FOLDER
        self.id = self.__class__.ID
        self.from_mbox = self.__class__.FROM
        # Highest progress triple seen during the current scan.
        self.max = self.last[:]
        self.client = tornado.httpclient.AsyncHTTPClient()

    def to_send(self, to_do_obj):
        """Write one HTML mail file per collected post.

        :param to_do_obj: list of post dicts as built by ``check_thread``
            (keys ``id``, ``thread``, ``user``, ``create``, ``html``,
            ``title``, ``page``, ``is_p``). The ``title`` and ``user``
            entries are re-encoded in place.
        """
        for obj in to_do_obj:
            obj['title'] = ('[%s] ' % self.id) + obj['title']
            obj['title'] = obj['title'].encode('utf8')
            obj['user'] = obj['user'].encode('utf8')

            msg = MIMEText(obj['html'], _subtype='html', _charset='utf-8')
            msg['Subject'] = make_header([(obj['title'], 'utf-8')])
            msg['From'] = make_header([(obj['user'], 'utf-8'), ('<%s>' % self.from_mbox, None)])
            msg['To'] = make_header([('邹业盛', 'utf-8'), ('<[email protected]>', None)])

            # NOTE: str.encode('hex') only exists on Python 2.
            if not obj['is_p']:
                # Posts without the p_btn marker are threaded under the
                # "-0-" Message-ID that the marked post of the thread uses.
                msg['Message-ID'] = '<Discuz-%s-%s-%s-%s-%s@BJ5544>' % (self.last[0], obj['thread'], obj['id'], self.id, self.host.encode('hex'))
                msg['In-Reply-To'] = '<Discuz-%s-%s-0-%s-%s@BJ5544>' % (self.last[0], obj['thread'], self.id, self.host.encode('hex'))
                msg['References'] = msg['In-Reply-To']
            else:
                msg['Message-ID'] = '<Discuz-%s-%s-0-%s-%s@BJ5544>' % (self.last[0], obj['thread'], self.id, self.host.encode('hex'))

            msg['X-URL'] = self.host + '?' + ('mod=viewthread&tid=%s&page=%s#pid%s' % (obj['thread'], obj['page'], obj['id']))
            now = datetime.datetime.strptime(obj['create'], '%Y-%m-%d %H:%M:%S')
            msg['Date'] = email.utils.formatdate(int(now.strftime('%s')), localtime=True)

            # The file is named after the part of the Message-ID before '@'.
            filename = join(self.folder, 'new', msg['Message-ID'].split('@', 1)[0])
            with open(filename, 'w') as f:
                f.write(msg.as_string())
            logger.info('[Discuz]%s|%s|%s' % (obj['title'], obj['user'], obj['id']))

    @tornado.gen.engine
    def check_thread(self, thread, callback):
        """Collect the posts of `thread` newer than ``self.last[2]``.

        Pages are walked from the last one backwards until an already
        delivered post is met; the collected posts are then handed to
        ``to_send`` in chronological order.

        :param thread: numeric thread id.
        :param callback: called with one boolean. It is ``False`` when the
            newest post of the thread was already delivered, which ``run``
            treats as the signal to end the current scan.
        """
        to_do_html = []
        url = self.host + '?' + ('mod=redirect&tid=%s&goto=lastpost' % thread)
        res = yield tornado.gen.Task(self.client.fetch, url)
        if res.error:
            # Retry the whole thread later, keeping the same callback.
            IL.add_timeout(int(time()) + self.RETRY_DELAY, partial(self.check_thread, thread, callback))
            return

        root = etree.HTML(res.body)
        title = root.xpath('.//div[@id="pt"]/div/a')[-1].text
        # The redirect ends on the last page; its number (if any) is in the URL.
        sum_page = re.findall(r'page=(\d+)', res.effective_url)
        sum_page = int(sum_page[0]) if sum_page else 1
        page = sum_page
        is_continue = True

        while 1:
            if page == 0:
                break
            url = self.host + '?' + ('mod=viewthread&tid=%s&page=%s' % (thread, page))
            while 1:
                res = yield tornado.gen.Task(self.client.fetch, url)
                if not res.error:
                    break
                # Wait before retrying instead of re-requesting the page in
                # a tight loop while the server is failing.
                yield tornado.gen.Task(IL.add_timeout, int(time()) + self.RETRY_DELAY)
            root = etree.HTML(res.body)

            post_list = root.xpath('.//div[@id="postlist"]/div')
            post_list.reverse()
            for i, p in enumerate(post_list):
                post_id = p.get('id', '')
                if not post_id.startswith('post_'):
                    continue
                post_id = int(post_id.split('_', 1)[1])
                if post_id <= self.last[2]:
                    # Last page, last post (post_list holds one element
                    # that is not a post, hence index 1): nothing new here.
                    if page == sum_page and i == 1:
                        is_continue = False
                    # Force the page loop to end after this page.
                    page = 1
                    break

                user = p.xpath('.//a[@class="xw1"]')[0].text
                create = p.xpath('.//em[@id="authorposton%s"]/span' % post_id)
                if create:
                    create = create[0].get('title')
                else:
                    create = p.xpath('.//em[@id="authorposton%s"]' % post_id)[0].text.split(' ', 1)[1]

                # Turn the message cell into a plain <div> before serializing.
                html = p.xpath('.//td[@id="postmessage_%s"]' % post_id)[0]
                html.tag = 'div'
                del html.attrib['id']
                del html.attrib['class']
                html = etree.tostring(html, method='html', encoding='utf8')

                is_p = True if p.xpath('.//div[@id="p_btn"]') else False
                to_do_html.append({
                    'id': post_id,
                    'thread': thread,
                    'user': user,
                    'create': create,
                    'html': html,
                    'title': title,
                    'page': page,
                    'is_p': is_p,
                })
                if post_id > self.max[-1]:
                    self.max = (self.last[0], thread, post_id)
            page -= 1

        to_do_html.reverse()
        self.to_send(to_do_html)
        callback(is_continue)

    @tornado.gen.engine
    def run(self):
        """Scan the board's thread list and reschedule itself.

        Threads are requested ordered by last post. Each ``normalthread``
        entry is passed to ``check_thread``; once that reports ``False``
        the progress triple is saved to ``self.last_file`` and the next
        scan is scheduled. A failed request or a page without thread
        tables also just schedules the next scan.
        """
        page = 0
        while 1:
            page += 1
            url = self.host + '?' + ('mod=forumdisplay&filter=lastpost&orderby=lastpost&fid=%s&page=%s' % (self.last[0], page))
            res = yield tornado.gen.Task(self.client.fetch, url)
            if res.error:
                IL.add_timeout(int(time()) + self.POLL_INTERVAL, self.run)
                return

            root = etree.HTML(res.body)
            tb_list = root.xpath('.//form[@id="moderate"]/table/tbody')
            if not tb_list:
                IL.add_timeout(int(time()) + self.POLL_INTERVAL, self.run)
                return

            for tb in tb_list:
                tbody_id = tb.get('id', '')
                if tbody_id.startswith('normalthread'):
                    thread = int(tbody_id.split('_', 1)[1])
                    is_continue = yield tornado.gen.Task(self.check_thread, thread)
                    if not is_continue:
                        # Everything newer has been delivered: persist the
                        # progress and schedule the next scan.
                        self.last = self.max[:]
                        with open(self.last_file, 'w') as f:
                            f.write(tornado.escape.json_encode(self.last).encode('utf8'))
                        IL.add_timeout(int(time()) + self.POLL_INTERVAL, self.run)
                        return
class CloudBBS(DiscuzFetch):
    """Fetcher for cloudbbs.org whose progress triple lives in LAST_FILE.

    ``LAST`` is read from ``LAST_FILE`` (JSON) while the class body is
    executed, so defining this class fails if the file cannot be read.
    """

    ID = 'CloudBBS'
    HOST = 'http://cloudbbs.org/forum.php'
    FOLDER = '/home/zys/Mail/8-Discuz/'
    FROM = '[email protected]'

    LAST_FILE = '/home/zys/Dropbox/last/cloudbbs.org'
    with open(LAST_FILE, 'r') as f:
        data = tornado.escape.json_decode(f.read().strip())
    LAST = tuple(data)
class CloudBBS_41(DiscuzFetch):
    """Second cloudbbs.org fetcher with its own progress file.

    ``LAST`` is read from ``LAST_FILE`` (JSON) while the class body is
    executed, so defining this class fails if the file cannot be read.
    """

    ID = 'CloudBBS'
    HOST = 'http://cloudbbs.org/forum.php'
    FOLDER = '/home/zys/Mail/8-Discuz/'
    FROM = '[email protected]'

    LAST_FILE = '/home/zys/Dropbox/last/cloudbbs-41.org'
    with open(LAST_FILE, 'r') as f:
        data = tornado.escape.json_decode(f.read().strip())
    LAST = tuple(data)
class Anshida(DiscuzFetch):
    """Fetcher configuration for the anshida.net board.

    The starting progress triple is hard-coded rather than read from a file.
    """

    ID = 'Anshida'
    HOST = 'http://www.anshida.net/BBS/forum.php'
    FOLDER = '/home/zys/Mail/8-Discuz/'
    FROM = '[email protected]'
    # NOTE(review): LAST_FILE is not set here, so the inherited value is used.
    LAST = (74, 495699, 1714985 - 1)
class Fetch(object):
    """Periodically pull unseen mail from one POP account into MAILBOX."""

    def __init__(self, host, port, user, password, ssl=False, name=None, *args, **kargs):
        """Remember the connection settings of one account.

        The polling interval is read from ``kargs['all']['interval']``, so
        the ``all`` keyword argument is required.
        """
        self.host, self.port = host, port
        self.user, self.password = user, password
        self.ssl = ssl
        self.name = name
        self.interval = kargs['all']['interval']
        self.timer = None

    def run(self):
        """Log the check and open a POP session, passing it ``self.fetch``."""
        logger.info('%s|CHECK' % self.name)
        # presumably POP invokes self.fetch once the session is ready --
        # confirm against service.pop_client
        self.pop = POP(self.host, self.port, self.user, self.password, self.fetch, self.ssl)

    def parse_body(self, body):
        """Parse a retrieved message and store it in its mailbox folder.

        Everything up to the first newline of `body` is discarded before
        parsing. Messages whose subject contains ``corp_bizweb`` are parsed
        but not written to disk.

        :returns: ``(msg, box)`` -- the parsed message and the folder name.
        """
        raw = body.split('\n', 1)[1].lstrip()
        msg = parse(raw)
        box = get_box(msg, self)
        target = join(MAILBOX, box, 'new', '%s.%s' % (int(time()), uuid4().hex))
        if 'corp_bizweb' in msg['subject']:
            return msg, box
        with open(target, 'wb') as f:
            f.write(msg['source'].read())
        return msg, box

    @tornado.gen.engine
    def fetch(self, pop):
        """Retrieve messages whose UIDL is not yet recorded, then reschedule.

        The set of seen UIDLs is kept per account name in the pickle file
        RECORD. When MAX_ONCE is positive, only the first MAX_ONCE entries
        of the reversed listing are considered.
        """
        listing = yield tornado.gen.Task(pop.uidl, '')
        # The first and last elements of the split response are dropped.
        mail_list = [row.split(' ', 1) for row in listing.split('\r\n')][1:-1]
        mail_list.reverse()
        total = len(mail_list)

        # UIDLs already stored for this account.
        in_db = set([])
        if os.access(RECORD, os.F_OK):
            with open(RECORD, 'rb') as f:
                in_db = pickle.load(f).get(self.name, set([]))

        new = 0
        for count, (msg_no, uidl) in enumerate(mail_list, 1):
            if uidl in in_db:
                continue
            if MAX_ONCE > 0 and count > MAX_ONCE:
                break
            body = yield tornado.gen.Task(pop.retr, msg_no)
            msg, box = self.parse_body(body)
            new += 1
            in_db.add(uidl)
            logger.info('%s|%s -> %s, %s/%s' % (self.name, msg['subject'].encode('utf8'), box, count, total))

        if new:
            notify(new)

        # Re-read the record file before saving and replace only this
        # account's entry in it.
        if os.access(RECORD, os.F_OK):
            with open(RECORD, 'rb') as f:
                records = pickle.load(f)
            records[self.name] = in_db
        else:
            records = {self.name: in_db}
        with open(RECORD, 'wb') as f:
            pickle.dump(records, f)

        reply = yield tornado.gen.Task(pop.quit)
        logger.info('%s|%s|COMPLETE' % (self.name, reply))
        self.timer = IL.add_timeout(int(time()) + self.interval, self.run)
if __name__ == '__main__':
    # Start one POP fetcher per configured account.
    for name, conf in ACCOUNT.items():
        fetcher = Fetch(conf['host'], conf['port'], conf['user'], conf['pass'], conf['ssl'], name, all=conf)
        fetcher.run()

    # Start the forum fetchers.
    for forum_cls in (CloudBBS, CloudBBS_41):
        forum_cls().run()
    #Anshida().run()

    LocalLog(1000 * 10, LOCAL_LOG_DIR).start()
    IL.start()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment