Last active
March 28, 2020 06:56
-
-
Save c4pt0r/1a12e8357e430a3f9cc29a16a186de36 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
import os
import sqlite3
import time
from contextlib import closing

import slack
from requests_html import HTMLSession
# Slack Web API client; the bot token is read from the SLACK_BOT_TOKEN
# environment variable (None is passed through when it is unset).
client = slack.WebClient(os.getenv('SLACK_BOT_TOKEN'))
# Shared HTTP session used by get_feeds() for every search request.
session = HTMLSession()
# Connection to the SQLite file 'item.db', which stores already-seen items.
db_conn = sqlite3.connect('item.db')
def init_db():
    """Create the ``item`` table and its unique URL index when missing.

    Runs one SQL script on the module-level ``db_conn`` connection. Both
    statements use ``if not exists``, so repeated calls are harmless.
    """
    schema_script = """
create table if not exists item( url text not null, content text not null,
create_at text not null
);
create unique index if not exists idx_item_url
on item(url);
"""
    # Connection.executescript() creates a cursor and runs the script on it.
    db_conn.executescript(schema_script)
def append_item_to_db(item):
    """Insert one item into the ``item`` table and commit.

    ``item`` is a mapping of column name to value. Its keys are spliced into
    the statement as column names; its values are bound as ``?`` parameters.
    """
    column_list = ','.join(item)
    placeholders = ','.join('?' for _ in item)
    statement = 'insert into item ({}) values ({})'.format(
        column_list, placeholders)
    db_conn.execute(statement, tuple(item.values()))
    db_conn.commit()
def is_seen_url(url):
    """Return True if a row with this ``url`` already exists in ``item``.

    Args:
        url: URL string to look up; bound as a query parameter.

    Returns:
        bool: True when at least one matching row is stored.
    """
    # sqlite3 cursors are not context managers; closing() makes sure the
    # cursor is released even when the query raises.
    with closing(db_conn.cursor()) as cur:
        # Only existence matters, so fetch a constant instead of the full row.
        cur.execute('select 1 from item where url=? limit 1', (url,))
        return cur.fetchone() is not None
def is_seen_item(item):
    """Return whether ``item`` (a mapping with a ``'url'`` key) is stored."""
    item_url = item['url']
    return is_seen_url(item_url)
def has_keyword(content, keyword):
    """Report whether ``keyword`` occurs in ``content``, ignoring case."""
    haystack = content.lower()
    needle = keyword.lower()
    return needle in haystack
def get_feeds(keyword, duration='1d'):
    """Search Zhihu for ``keyword`` and return the matching result items.

    Args:
        keyword: Search term. Also used to filter the results: only entries
            whose text contains it (case-insensitive, via ``has_keyword``)
            are kept.
        duration: Value sent as the ``range`` query parameter.

    Returns:
        list[dict]: One dict per kept result with keys ``content`` (the
        entry's text), ``url`` (link taken from the entry's title) and
        ``create_at`` (``time.ctime()`` at scrape time). Empty when the page
        contains no ``div.List`` element.
    """
    # Let the session build the query string so keywords containing spaces,
    # '&', '#', '%' or non-ASCII characters are encoded correctly.
    response = session.get(
        'https://www.zhihu.com/search',
        params={'q': keyword, 'type': 'content', 'range': duration},
    )
    result_list = response.html.find('div.List', first=True)
    if result_list is None:
        # find(first=True) yields None when nothing matches.
        return []

    result = []
    for entry in result_list.find('div.List-item'):
        text = entry.text
        if not has_keyword(text, keyword):
            continue
        title = entry.find('h2.ContentItem-title', first=True)
        if title is None:
            continue
        link = title.find('a', first=True)
        if link is None:
            continue
        url = link.attrs.get('href')
        if not url:
            continue
        # Turn protocol-relative and site-relative links into absolute URLs.
        if url.startswith('//'):
            url = 'https:' + url
        elif url.startswith('/'):
            url = 'https://zhihu.com' + url
        result.append({'content': text, 'url': url, 'create_at': time.ctime()})
    return result
def loop(keywords=None):
    """Poll Zhihu forever and post previously unseen matches to Slack.

    Args:
        keywords: Iterable of search terms. When omitted or ``None``, the
            built-in keyword list below is used.

    This function never returns; it runs until the process is stopped or an
    exception propagates out of it.
    """
    if keywords is None:
        # Built per call instead of as a default argument, so the default is
        # never a list object shared between calls.
        keywords = ['pingcap', 'tikv', 'tidb', 'chaos-mesh', 'chaos mesh']
    while True:
        for keyword in keywords:
            for item in get_feeds(keyword, '3m'):
                if is_seen_item(item):
                    continue
                # The item is recorded before it is posted: if the Slack call
                # raises, the item is already marked as seen.
                append_item_to_db(item)
                client.chat_postMessage(
                    channel='#bot-playground',
                    text=item['content'] + ' via ' + item['url'])
                # NOTE(review): indentation was lost in the source; printing
                # only newly posted items is assumed -- confirm.
                print(item)
            # NOTE(review): indentation was lost in the source; the pause is
            # assumed to follow each keyword's search -- confirm.
            time.sleep(3)
# Script entry point: make sure the schema exists, then start polling.
if __name__ == "__main__":
    init_db()
    loop()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment