#!/usr/bin/env python
# -*- coding:utf-8 -*-

# Monkey-patching is left disabled in the original gist; note that without it
# the requests calls below block the gevent greenlets.
# import gevent.monkey
# gevent.monkey.patch_all()

import gevent
import MySQLdb
import os
import re
import requests
import shutil
import time

import qiniu.io  # assumption: legacy qiniu v6 SDK, which provides qiniu.io.put() used by SoureHandler

from BeautifulSoup import BeautifulSoup
from xml.dom.minidom import parseString
from xml.parsers.expat import ExpatError
from gevent.queue import Empty, Queue
from urlparse import urlparse, parse_qs
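# Note (assumption, not part of the original gist): SoureHandler below also
# expects a SQLAlchemy session factory and the Category/Source models to be
# importable from a local module that is not included here, e.g. something like
#   from models import DB_Session, Category, Source
# where Category and Source map to the `category` and `source` tables.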
class C(object):
    def __init__(self, url):
        self.url = url
        self.insert()

    def spider(self):
        # scrape "<a ...>title</a>" feed links from self.url into rss.txt
        r = requests.get(self.url)
        text = r.text.encode('utf-8', 'ignore')
        p = re.compile(r'<a style="text-decoration: none; color: rgb\(0, 0, 0\); " rel="nofollow" href="(.*?)" >(.*?)</a>')
        f = open("rss.txt", 'a')
        for m in p.finditer(text):
            print m.group(2) + m.group(1)
            f.write(m.group(2) + " " + m.group(1) + "\n")
        f.close()

    def insert(self):
        # read "title url" lines (and bare category-id lines) from rss.txt and
        # insert each reachable feed into the `source` table
        f = open("rss.txt", 'r')
        conn = MySQLdb.connect(host='localhost', user='root', passwd='', db='r', port=3306, charset="utf8")
        cur = conn.cursor()
        oid = 0
        cid = 0  # default category id in case no category line precedes the feeds
        for l in f.readlines():
            if l.strip().isdigit():
                cid = l.strip()
            else:
                n, u = l.split(" ", 1)
                try:
                    r = requests.get(u.strip(), timeout=20)
                except Exception:
                    continue
                if r.status_code != 200:
                    continue
                oid = oid + 1
                # use the feed's <description> (or <title> as a fallback) as the description
                try:
                    text = r.text.encode('utf-8', 'ignore')
                    c = re.compile(r'<description>([\s\S]*?)</description>')
                    d = c.search(text).group(1)
                    if d.strip() == "":
                        c = re.compile(r'<title>([\s\S]*?)</title>')
                        d = c.search(text).group(1)
                except Exception:
                    d = ""
                c = re.compile(r'(http://.*?\.(com|cn|info|me|org|net|uk)/)')
                domain = c.search(u).group(1)
                # save the site's favicon locally under icon/<orderid>.ico
                try:
                    response = requests.get(domain + "favicon.ico", stream=True)
                    with open(os.path.join(os.getcwd(), "icon", str(oid) + ".ico"), 'wb') as out_file:
                        shutil.copyfileobj(response.raw, out_file)
                    del response
                    icon = "1@icon/" + str(oid) + ".ico"
                except Exception:
                    icon = ""
                try:
                    cur.execute(
                        "insert into source (title,description,icon,url,cid,orderid,islock) "
                        "values (%s,%s,%s,%s,%s,%s,%s)",
                        (n.strip(), d.strip(), icon, u.strip(), cid, oid, 0))
                except Exception as e:
                    print e
        conn.commit()  # MySQLdb does not autocommit, so commit the inserted rows
        cur.close()
        conn.close()
        f.close()
class R(object):
    def __init__(self, url):
        self.url = url

    def category(self):
        # dump guokr.com's category tree into categories.txt:
        # "P:" marks a parent block, "S:<title>" a sub-category, then one "title url" line per site
        r = requests.get("http://gate.guokr.com/")
        soup = BeautifulSoup(r.content)
        categorys_hd = soup.findAll("div", {"class": "categorys-hd fix"})
        f = open("categories.txt", 'w')
        for item in categorys_hd:
            category_soup = BeautifulSoup(item.prettify())
            category = category_soup.findAll("div", {"class": "category"})
            f.write("P:\n")
            for c in category:
                s = BeautifulSoup(c.prettify())
                titles = s.findAll("h3")
                for title in titles:
                    f.write("S:" + title.text.encode('utf-8', 'ignore') + "\n")
                sites = s.findAll("li")
                for site in sites:
                    f.write(site.text.encode('utf-8', 'ignore') + " " + site.a['href'].encode('utf-8') + "\n")
        f.close()

    def findRss(self):
        # standalone helper: look for an rss/atom/feed link on a page
        # (http://www.domain.com is a placeholder URL)
        url = "http://www.domain.com"
        r = requests.get(url)
        soup = BeautifulSoup(r.content)
        a = soup.find("a", href=re.compile("(rss|atom|feed|xml)", re.I))
        if a is not None:
            if a['href'].startswith('/'):
                if url.endswith('/'):
                    url = url[0:-1]
                href = url + a['href']
            else:
                href = a['href']
            print href

    def xml(self):
        # standalone helper: check whether a URL returns well-formed XML
        url = 'http://www.domain.com/index.xml'
        r = requests.get(url)
        try:
            parseString(r.content)
            return True
        except ExpatError:
            return False
class F(object):
    # crawl findex.cn category pages with one gevent scheduler greenlet and a
    # small pool of workers, writing "category###title###feed-url###site" lines to filePath
    def __init__(self, url, filePath, thread_num=5, timeout=5):
        self.url = url
        self.queue = Queue()
        self.timeout = timeout
        self.file = open(filePath, 'w')
        self.jobs = [gevent.spawn(self.doScheduler)]
        self.jobs.extend([gevent.spawn(self.doWorker) for i in xrange(thread_num)])
        self.start()

    def start(self):
        gevent.joinall(self.jobs)

    def doScheduler(self):
        # walk every category's "more" listing and queue one job per result page
        r = requests.get(self.url, timeout=10)
        soup = BeautifulSoup(r.content)
        sidebar = soup.find('div', {'id': 'sidebar'})
        sidebar = BeautifulSoup(sidebar.prettify())
        more = sidebar.findAll('a', {'href': re.compile(r'more\.php\?id=\d*')})
        categoryid = 1
        for m in more:
            href = self.url + str(m['href'])
            r = requests.get(href, timeout=10)
            soup = BeautifulSoup(r.content)
            navi = soup.find('div', {'id': 'navi'})
            navi = BeautifulSoup(navi.prettify())
            try:
                pagesize = navi('a')[-2]
            except IndexError:
                pagesize = navi('a')[-1]
            except Exception as e:
                print e
                continue
            for i in xrange(int(pagesize.text)):
                item = (categoryid,
                        m.text.replace('&amp;', '&'),
                        re.sub(r'page=\d*', 'page=' + str(i + 1), self.url + pagesize['href']))
                self.queue.put(item)
            categoryid = categoryid + 1

    def doWorker(self):
        while True:
            try:
                categoryid, category, url = self.queue.get(timeout=self.timeout)
                r = requests.get(url, timeout=15)
                soup = BeautifulSoup(r.content)
                posts = soup.findAll('div', {'class': 'post'})
                for post in posts:
                    soup = BeautifulSoup(post.prettify())
                    title = soup.find('h2').text
                    site = soup('a')[-1]['href']
                    source = soup.find('a', href=re.compile('^http://findex.cn/subscripe.php'))
                    # the real feed URL is carried in the subscribe link's ?url= parameter
                    o = urlparse(source['href'])
                    params = parse_qs(o.query)
                    url = params['url'][0]
                    self.file.write(category.encode('utf-8').strip() + '###' + title.encode('utf-8').strip() + '###' + url.encode('utf-8').strip() + '###' + site.encode('utf-8').strip() + "\n")
                    print url
            except Empty:
                # queue drained: the scheduler is done, so shut this worker down
                self.file.close()
                return
            except Exception as e:
                print e
class SoureHandler(object):
    def __init__(self, file_path, bucket_name, uptoken):
        # bucket_name and uptoken were referenced as undefined bare names in the
        # original gist; they are taken as constructor arguments here (qiniu
        # bucket name and upload token).
        self.file = file_path
        self.DB_Session = DB_Session()
        self.pid = 0
        self.categoryid = 0
        self.bucket_name = bucket_name
        self.uptoken = uptoken
        self.worker()

    def worker(self):
        f = open(self.file, 'r')
        for line in f.readlines():
            if line.startswith('C:'):
                # top-level category line: "C:<title>"
                _, category_title = line.split(':', 1)
                category = Category(pid=0, title=category_title.strip(), description='', photo='', orderid=0, islock=0)
                self.DB_Session.add(category)
                self.DB_Session.commit()
                self.pid = category.id
            elif line.startswith('S:'):
                # sub-category line: "S:<title>"
                _, category_title = line.split('S:', 1)
                category = Category(pid=self.pid, title=category_title.strip(), description='', photo='', orderid=0, islock=0)
                self.DB_Session.add(category)
                self.DB_Session.commit()
                self.categoryid = category.id
            else:
                # feed line: "<title> <url>"
                soure_title, url = line.split(' ', 1)
                soure_title = soure_title.strip()
                url = url.strip()
                link = self.findRss(url)
                if link is not None and self.verifyRss(link):
                    try:
                        r = requests.get(link, timeout=15)
                        po = re.compile(r'<description>([\s\S]*?)</description>')
                        pt = re.compile(r'<title>([\s\S]*?)</title>')
                        description = po.search(r.content).group(1) or pt.search(r.content).group(1)
                    except Exception:
                        description = ""
                    if not url.endswith('/'):
                        url = str(url) + '/'
                    try:
                        # fetch the site's favicon and upload it to qiniu storage
                        response = requests.get(url + "favicon.ico", stream=True)
                        key = "categoryid" + str(self.categoryid) + '_' + str(int(time.time() * 100000)) + '.ico'
                        ret, err = qiniu.io.put(self.uptoken, key, data=response.raw)
                        if err is not None:
                            print err
                            return
                        icon = "http://%s.u.qiniudn.com/%s" % (self.bucket_name, key)
                    except Exception:
                        icon = ""
                    source = Source(title=soure_title, description=description.strip(), icon=icon, url=link, cid=self.categoryid, orderid=0, islock=0)
                    self.DB_Session.add(source)
                    self.DB_Session.commit()
        f.close()

    def findRss(self, url):
        # look for an rss/atom/feed link on the page, resolving root-relative hrefs
        link = None
        try:
            r = requests.get(url, timeout=15)
        except Exception:
            return None
        soup = BeautifulSoup(r.content)
        a = soup.find("a", href=re.compile("(rss|atom|feed|xml)"))
        if a is not None:
            if a['href'].startswith('/'):
                if url.endswith('/'):
                    url = url[0:-1]
                link = str(url) + a['href']
            else:
                link = a['href']
        return link

    def verifyRss(self, url):
        # a feed is accepted only if its content parses as XML
        try:
            r = requests.get(url, timeout=15)
            parseString(r.content)
            return True
        except Exception:
            return False
if __name__ == "__main__":
    f = F('http://findex.cn/', 'rss.txt')
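    # Usage sketch (assumption, not part of the original gist): the other
    # classes look like they were driven ad hoc, roughly along these lines:
    #   c = C('http://example.com/feed-list')               # __init__ runs insert(); example.com is a placeholder
    #   c.spider()                                          # scrape more feed links into rss.txt
    #   SoureHandler('rss.txt', 'my-bucket', '<uptoken>')   # hypothetical bucket name / upload token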