Created
February 25, 2013 14:05
-
-
Save Wizmann/5029979 to your computer and use it in GitHub Desktop.
一个很挫很挫的抓取BeijingAir的程序
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#coding=utf-8 | |
import urllib | |
import urllib2 | |
import cookielib | |
import httplib | |
import json | |
import re | |
import time | |
import datetime | |
import os | |
import sys | |
import socket | |
import sqlite3 | |
import logging | |
import renren | |
import HTMLParser | |
import MySQLdb | |
# Python 2-only workaround: reloading sys makes setdefaultencoding reachable
# again (the interpreter's site initialisation removes it at startup), and the
# call then switches the implicit str<->unicode conversion codec to UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')
class AirParser(HTMLParser.HTMLParser):
    """Pull tweet texts and timestamps out of a timeline HTML page.

    Every value found is reported through ``callback(value, kind)``:

    * ``kind == 'time'``    -- the raw ``data-time`` attribute value of a
      ``<span>`` start tag (passed on as a string, not converted).
    * ``kind == 'context'`` -- the first non-blank, stripped text chunk seen
      after a ``<p>`` start tag whose only attribute is
      ``class="js-tweet-text"``.
    """

    def __init__(self, callback):
        """Store ``callback`` and start outside of any tweet paragraph."""
        # Explicit base-class call: the Python 2 HTMLParser is an old-style
        # class, so super() is not usable here.
        HTMLParser.HTMLParser.__init__(self)
        self.ready = False
        self.callback = callback

    def handle_starttag(self, tag, attrs):
        """Arm text capture on a tweet paragraph; report span timestamps."""
        if tag == 'p' and attrs == [('class', 'js-tweet-text')]:
            self.ready = True
        if tag == 'span':
            # Walk the attribute pairs directly.  This copes with tags that
            # have no attributes at all, so no blanket ``except`` is needed
            # and errors raised by the callback are no longer hidden.
            for key, value in attrs:
                if key == 'data-time':
                    self.callback(value, 'time')

    def handle_data(self, data):
        """Report the first non-blank text chunk after an armed paragraph."""
        text = data.strip()
        if self.ready and text:
            self.callback(text, 'context')
            # Only the first chunk is reported; later chunks of the same
            # paragraph (e.g. after nested tags) are ignored.
            self.ready = False
def get_log():
    """Return the shared ``'weibo'`` logger that appends to beijing_air.log.

    The file handler is attached only once per log file.  Calling this
    function repeatedly therefore returns the same logger without stacking
    extra handlers, which would duplicate every log line and keep additional
    file handles open.

    Returns:
        logging.Logger: the ``'weibo'`` logger with its level set to DEBUG.
    """
    log = logging.getLogger('weibo')
    # FileHandler stores the absolute path of its target in ``baseFilename``.
    log_path = os.path.abspath('beijing_air.log')
    attached = any(
        isinstance(existing, logging.FileHandler)
        and existing.baseFilename == log_path
        for existing in log.handlers
    )
    if not attached:
        handler = logging.FileHandler('beijing_air.log', 'a')
        fmt = logging.Formatter("%(levelname)-8s %(asctime)-15s [%(filename)s,%(lineno)d] %(message)s")
        handler.setFormatter(fmt)
        log.addHandler(handler)
    log.setLevel(logging.DEBUG)
    return log
def check_updated(air):
    """Store readings not yet in the SQLite table and return the newest ones.

    Args:
        air: iterable of ``(context, addtime)`` pairs, where ``context`` is
            the tweet text and ``addtime`` an integer timestamp.

    Returns:
        list: the ``context`` values of the pairs whose ``addtime`` was not
        yet present in table ``air`` of ``beijing_air.sqlite``, in input
        order, limited to the last three.

    The table ``air`` (columns ``id``, ``addtime``, ``context``) must already
    exist; it is not created here.
    """
    # Bound parameters instead of string formatting: the text comes from a
    # web page, and MySQL-style escaping is not valid quoting for SQLite.
    sql_query = 'SELECT `id` FROM `air` WHERE `addtime`=?;'
    sql_insert = 'INSERT INTO `air` (`addtime`,`context`) VALUES (?,?);'
    fresh = []
    conn = sqlite3.connect('beijing_air.sqlite')
    try:
        cursor = conn.cursor()
        for item in air:
            context, addtime = item[0], int(item[1])
            cursor.execute(sql_query, (addtime,))
            if cursor.fetchone() is not None:
                continue  # this timestamp is already recorded
            fresh.append(context)
            stored = context
            if isinstance(stored, bytes):
                # Python 2 sqlite3 refuses non-ASCII byte strings as
                # parameters, so bind the text as unicode instead.
                stored = stored.decode('utf-8', 'replace')
            cursor.execute(sql_insert, (addtime, stored))
        cursor.close()
        conn.commit()
    finally:
        # Release the database file even when a statement fails.
        conn.close()
    return fresh[-3:]
def show(addr):
    """Fetch the timeline page at ``addr`` and repost readings not seen yet.

    Steps:
      1. Download ``addr`` with a browser-like User-Agent (60 s timeout).
      2. Save the raw page under ``/tmp`` in a file named after the current
         minute.
      3. Parse tweet texts and timestamps with ``AirParser``.
      4. Hand the pairs to ``check_updated`` and, for every text it returns,
         print it, log it and post it through ``renren.show``, pausing 30
         seconds after each post.

    Args:
        addr: URL of the timeline page to fetch.

    Errors from the download, the snapshot file, the database or the posting
    call are not handled here and propagate to the caller.
    """
    logger = get_log()
    user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11'
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookielib.CookieJar()))
    urllib2.install_opener(opener)
    req = urllib2.Request(addr)
    req.add_header('User-Agent', user_agent)
    logger.info("try to fetch page...")
    response = urllib2.urlopen(req, timeout=60)
    try:
        content = response.read()
    finally:
        # Close the connection explicitly instead of waiting for garbage
        # collection.
        response.close()

    # Keep a copy of the raw page for later inspection.
    snapshot = os.path.join('/tmp', datetime.datetime.now().strftime("%Y-%m-%d_%H:%M") + '.txt')
    with open(snapshot, 'w') as pagefile:
        pagefile.write(content)

    texts = []
    stamps = []

    def air_callback(item, ttype):
        # AirParser reports tweet text as 'context'; anything else is a
        # timestamp string.
        if ttype == 'context':
            texts.append(item)
        else:
            stamps.append(int(item))

    AirParser(air_callback).feed(content)

    if len(texts) != len(stamps):
        # zip() below truncates to the shorter list; make that visible.
        logger.warning("parsed %d texts but %d timestamps; extra entries are dropped",
                       len(texts), len(stamps))
    pairs = list(zip(texts, stamps))

    # The parsed pairs are handed over in reverse page order.
    for item in check_updated(pairs[::-1]):
        print(item)
        logger.info(item)
        renren.show(item + '(@BeijingAir)')
        # Pause between posts.
        time.sleep(30)
if __name__ == '__main__':
    # Make up to 30 attempts.  One attempt fetches the https page, waits 30
    # seconds and fetches the http page; the first attempt that completes
    # without an exception ends the loop.  A failed attempt prints the error
    # and the next attempt starts immediately.
    for attempt in range(1, 31):
        print('attempted %d...' % attempt)
        try:
            show('https://twitter.com/beijingair')
            time.sleep(30)
            show('http://twitter.com/beijingair')
        except Exception as e:
            print(e)
        else:
            break
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment