Created
March 24, 2012 11:44
-
-
Save lisongx/2181456 to your computer and use it in GitHub Desktop.
获得 see.xidian.edu.cn 最近几日的新闻
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import requests | |
from lxml import html | |
from datetime import datetime, timedelta | |
# Only news items dated within the last DAY days are reported.
DAY = 3

# URL of the news page; hrefs found on the page are combined with it.
BASE_URL = "http://see.xidian.edu.cn/"

# Fetch the page content.  requests applies no timeout by default, so an
# explicit one keeps the script from hanging forever on a silent server.
r = requests.get(BASE_URL, timeout=30)

# Override the encoding requests detected: decode the body as GBK.
r.encoding = 'GBK'

# Parsed HTML tree that get_today_news() queries.
doc = html.document_fromstring(r.text)

# Reference moment used to compute the age of each news item.
today = datetime.today()
def get_today_news():
    """Print each recent news item found in ``doc`` together with its URL.

    Scans the rows matched by ``table[width="98%"] tr`` in the module-level
    ``doc``.  A row is treated as a news item when it contains both an
    ``<img>`` and a ``<font>`` element; the text of the first ``<font>`` is
    parsed as a ``YYYY-MM-DD`` date after surrounding whitespace and square
    brackets are removed.  Items whose age, measured from the module-level
    ``today``, is less than ``DAY`` days are printed as
    ``<row text> <URL>``, where the URL is the row's first link resolved
    against ``BASE_URL``.

    Rows whose date text cannot be parsed, or whose first link is missing
    or has no ``href``, are skipped instead of aborting the whole listing.

    Returns:
        None.  The result is written to standard output.
    """
    # Local import so the block runs on both Python 2 and Python 3.
    try:
        from urllib.parse import urljoin
    except ImportError:
        from urlparse import urljoin

    max_age = timedelta(DAY)

    for tr in doc.cssselect('table[width="98%"] tr'):
        img_tags = tr.cssselect('img')
        date_tags = tr.cssselect('font')
        if not (img_tags and date_tags):
            continue

        date_text = date_tags[0].text_content().strip().strip('[]')
        try:
            published = datetime.strptime(date_text, "%Y-%m-%d")
        except ValueError:
            # The <font> text is not a date, so this is not a dated news row.
            continue

        if today - published >= max_age:
            continue

        anchors = tr.cssselect('a')
        href = anchors[0].get('href') if anchors else None
        if not href:
            # Nothing to link to; skip rather than raise on the lookup.
            continue

        # urljoin also copes with absolute and root-relative hrefs, which
        # plain string concatenation would turn into a broken URL.
        # A single parenthesised argument prints the same on Python 2 and 3.
        print("%s %s" % (tr.text_content(), urljoin(BASE_URL, href)))
# Print the news listing only when this file is executed as a script.
if __name__ == "__main__":
    get_today_news()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment