Last active
June 6, 2016 16:24
-
-
Save iamued/8429889 to your computer and use it in GitHub Desktop.
Cnzz api with python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
__author__ = 'richie' | |
# -*- coding: utf-8 -*- | |
import urllib, urllib2, cookielib, re, os, codecs, time, subprocess,sys,json | |
#import simplejson as json | |
import dateutil | |
class cnzz(object):
    """Scraping client for the legacy ``new.cnzz.com`` statistics pages.

    This is Python 2 code: it relies on ``urllib2``/``cookielib`` and works
    on byte strings.  Pages are fetched through a cookie-aware opener that
    ``login()`` creates, decoded from GBK and re-encoded as UTF-8 before
    being parsed with regular expressions.
    """

    # Pager text of the form "page 1 of N"; the named group captures N.
    _PAGE_COUNT_RE = re.compile(r'第1/(?P<pagecount>\d+?)页 ')
    # Site id embedded in an inline <script>site_data('...');</script> call.
    _SITE_ID_RE = re.compile(r"<script>site_data\('(.+)'\);</script>")
    # Remainder of the line that follows the site-name cell's opening tag.
    _SITE_NAME_RE = re.compile(r'<div class="col-1">(?P<sitename>.+)')
    # One keyword table row: title, visible text and the first two counters.
    _KEY_ROW_RE = re.compile(
        r"<td title='(?P<key>.+) '>(?P<key1>.+)</td>\s+"
        r"<td class='all_right'>(?P<snum>.+)</td>\s+"
        r"<td class='all_right'>(?P<uv>.+)</td>")
    # Upper bound on keyword-list pages fetched by getHistoryKeyByDate().
    _MAX_KEY_PAGES = 3
    # Number of attempts made by yesterdayinfo() before giving up.
    _MAX_FETCH_ATTEMPTS = 5

    def __init__(self, username='', password='', othername=''):
        """Store the credentials; no network access happens here.

        ``othername`` is stored but not read by any method of this class.
        """
        self.__username = username
        self.__othername = othername
        self.__password = password
        # Replaced by a urllib2 opener in login(); methods that fetch pages
        # must not be called before login().
        self.__opener = ''
        self.__sitelist = []

    @staticmethod
    def _parse_page_count(html):
        """Return the total page count from the pager text, or None."""
        found = cnzz._PAGE_COUNT_RE.search(html)
        if found:
            return int(found.group('pagecount'))
        return None

    def _fetch_page(self, url, errors='strict'):
        """Fetch *url* with the session opener and return UTF-8 bytes.

        The response is decoded as GBK using the given ``errors`` policy.
        """
        raw = self.__opener.open(urllib2.Request(url)).read()
        return raw.decode('gbk', errors).encode('utf-8')

    def _fetch_with_retry(self, url):
        """Fetch *url*, retrying a bounded number of times on failure.

        The last failure is re-raised once the attempts are exhausted.
        """
        last_attempt = self._MAX_FETCH_ATTEMPTS - 1
        for attempt in range(self._MAX_FETCH_ATTEMPTS):
            try:
                return self._fetch_page(url)
            except Exception:
                # Best-effort retry, but never loop (or recurse) forever and
                # never swallow KeyboardInterrupt/SystemExit.
                if attempt == last_attempt:
                    raise

    def login(self):
        """POST the credentials and keep the resulting cookie session.

        Returns True when the response body contains ``_username``,
        otherwise False.
        """
        cookie_handler = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
        self.__opener = urllib2.build_opener(cookie_handler)
        post_data = {
            'username': self.__username,
            'password': self.__password,
        }
        req = urllib2.Request('http://new.cnzz.com/user/login.php',
                              urllib.urlencode(post_data))
        loginhtml = self.__opener.open(req).read()
        # Membership test instead of ``find(...) > 1`` so a marker at the
        # very start of the body is not mistaken for a failed login.
        return '_username' in loginhtml

    def getSiteListPageCount(self):
        """Return the number of site-list pages, or None if not found."""
        url = "http://new.cnzz.com/v1/main.php?s=site_list&sort=0&everypage=30&setpage"
        return self._parse_page_count(self._fetch_page(url))

    def getSiteList(self):
        """Collect ``"<name>@@@<siteid>"`` entries from every list page.

        Prints progress, exits the process via ``sys.exit()`` when a page
        cannot be parsed, and returns the list of entries.
        """
        pagecount = self.getSiteListPageCount()
        print("count page :" + str(pagecount))
        if pagecount is None:
            # Without a page count there is nothing to iterate over; treat
            # it like any other unparsable page instead of a TypeError.
            print('getSiteList Error')
            sys.exit()
        # Start from a clean list so repeated calls do not duplicate entries.
        self.__sitelist = []
        for page in range(1, pagecount + 1):
            url = ("http://new.cnzz.com/v1/main.php?s=site_list&sort=0&page="
                   + str(page) + "&everypage=30")
            html = self._fetch_page(url)
            site_ids = self._SITE_ID_RE.findall(html)
            site_names = self._SITE_NAME_RE.findall(html)
            if not (site_ids and site_names):
                print('getSiteList Error')
                sys.exit()
            # zip() pairs names with ids and stops at the shorter list, so a
            # mismatch between the two no longer raises IndexError.
            for name, site_id in zip(site_names, site_ids):
                # The last captured character of the name is dropped
                # (presumably a trailing '\r' -- TODO confirm).
                entry = name[0:-1] + "@@@" + site_id
                print(entry)
                self.__sitelist.append(entry)
        print('getSiteList OK')
        return self.__sitelist

    def yesterdayinfo(self, siteid=''):
        """Return the decoded JSON from ``site_list_data.php`` for *siteid*.

        Prints ``no siteid`` and returns None when *siteid* is empty.  The
        fetch is retried a bounded number of times; the last error is
        re-raised if every attempt fails.
        """
        if siteid == '':
            print('no siteid')
            return None
        url = "http://new.cnzz.com/v1/data/site_list_data.php?siteid=" + siteid
        return json.loads(self._fetch_with_retry(url))

    def getSiteInfoByDate(self, siteid='', startdate='', enddate=''):
        """Scrape per-day ``[pv, uv, ip]`` strings from the timeflux page.

        Returns a dict keyed by the day label with the weekend suffixes
        removed; days without a matching table row are omitted.
        """
        url = ("http://new.cnzz.com/v1/main.php?siteid=" + siteid
               + "&s=timeflux&st=" + startdate + "&et=" + enddate)
        html = self._fetch_page(url)
        siteinfo = {}
        for day in reversed(dateutil.getDays(startdate, enddate)):
            row_re = re.compile(
                r'<td>' + day + r'</td>\s+<td class="num1">(?P<pv>.+)</td>'
                r'\s+<td class="num1">(?P<uv>.+)</td>'
                r'\s+<td class="num1">(?P<ip>.+)</td>',
                re.I)
            row = row_re.search(html)
            if row:
                label = day.replace('星期六', '').replace('星期天', '')
                siteinfo[label] = [row.group('pv'), row.group('uv'),
                                   row.group('ip')]
        return siteinfo

    def getHistoryKeyByDate(self, siteid='', startdate='', enddate='', page=1):
        """Scrape the keyword list and return ``[key, snum, uv]`` rows.

        At most ``_MAX_KEY_PAGES`` pages are fetched.  ``page`` is accepted
        for interface compatibility but is not used.
        """
        base_url = ("http://new.cnzz.com/v1/main.php?siteid=" + siteid
                    + "&s=key&st=" + startdate + "&et=" + enddate)
        print(base_url)
        reported = self._parse_page_count(self._fetch_page(base_url))
        if reported is None:
            # Pager not found: fall back to the fixed page limit.
            pagecount = self._MAX_KEY_PAGES
        else:
            # Never request more pages than the site says exist.
            pagecount = min(reported, self._MAX_KEY_PAGES)
        keyinfos = []
        for page_no in range(1, pagecount + 1):
            print('正在抓取关键词列表第' + str(page_no) + '页')
            # Undecodable bytes are dropped on these pages.
            html = self._fetch_page(base_url + "&page=" + str(page_no),
                                    'ignore')
            # Example of the rows being matched:
            #   <td title='www.jxeea.cn '>www.jxeea.cn</td>
            #   <td class='all_right'>38024</td>
            #   <td class='all_right'>29859</td>
            for row in self._KEY_ROW_RE.findall(html):
                keyinfos.append([row[0], row[2], row[3]])
        print('共抓取关键词' + str(len(keyinfos)) + '个')
        return keyinfos

    def getKeyHistory(self, siteid='', startdate='', enddate='', key=''):
        """Return the highest and lowest daily ``uv`` for a keyword.

        *key* is a UTF-8 byte string; it is re-encoded as GBK and URL-quoted
        for the query.  Returns ``[max_item, min_item]`` where each item is
        a ``(day, uv)`` tuple, or ``[None, None]`` when no row matched.
        """
        url = ("http://new.cnzz.com/v1/main.php?siteid=" + siteid
               + "&s=history_key&st=" + startdate + "&et=" + enddate
               + "&url=" + urllib2.quote(key.decode('utf-8').encode('gbk')))
        html = self._fetch_page(url)
        uvlist = {}
        for day in reversed(dateutil.getDays2(startdate, enddate)):
            row_re = re.compile(
                r'<td> ' + day + r' </td>\s+<td class="num1">(?P<snum>.+)</td>'
                r'\s+<td class="num1">(?P<uv>.+)</td>',
                re.I)
            row = row_re.search(html)
            if row:
                label = day.replace('星期六', '').replace('星期天', '')
                uvlist[label] = int(row.group('uv'))
        ranked = self.sort_by_value(uvlist)
        if not ranked:
            # Nothing matched: report "no data" instead of an IndexError.
            return [None, None]
        return [ranked[-1], ranked[0]]

    def sort_by_value(self, d):
        """Return the ``(key, value)`` pairs of *d* sorted by value."""
        return sorted(d.items(), key=lambda item: item[1])
''' | |
if __name__ == '__main__': | |
CnzzTool=cnzz('username','password','') | |
if(CnzzTool.login()): | |
print "LoginOk" | |
else: | |
print "LoginError" | |
''' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
__author__ = 'richie' | |
# -*- coding: utf-8 -*- | |
import urllib, urllib2, cookielib, re, os, codecs, time, subprocess,sys,json,math | |
#import simplejson as json | |
import dateutil | |
class cnzz(object):
    """Scraping client for CNZZ statistics (``tongji.cnzz.com`` revision).

    This is Python 2 code: it relies on ``urllib2``/``cookielib`` and works
    on byte strings.  Site lists and traffic figures come from the JSON
    endpoints on ``tongji.cnzz.com``; the keyword methods still scrape the
    HTML pages on ``new.cnzz.com``.  Responses are decoded from GBK and
    re-encoded as UTF-8 before being parsed.
    """

    # Pager text of the form "page 1 of N"; the named group captures N.
    _PAGE_COUNT_RE = re.compile(r'第1/(?P<pagecount>\d+?)页 ')
    # One keyword table row: title, visible text and the first two counters.
    _KEY_ROW_RE = re.compile(
        r"<td title='(?P<key>.+) '>(?P<key1>.+)</td>\s+"
        r"<td class='all_right'>(?P<snum>.+)</td>\s+"
        r"<td class='all_right'>(?P<uv>.+)</td>")
    # Page size requested from the site-list endpoint (``pageType``) and
    # used to turn the total site count into a page count.
    _SITES_PER_PAGE = 90
    # Upper bound on keyword-list pages fetched by getHistoryKeyByDate().
    _MAX_KEY_PAGES = 3
    # Number of attempts made by yesterdayinfo() before giving up.
    _MAX_FETCH_ATTEMPTS = 5
    # Common prefix of the site-list JSON endpoint, up to the search term.
    _SITE_LIST_URL = ("http://tongji.cnzz.com/main.php?c=site&a=show"
                      "&ajax=module=gettotallist|module=list"
                      "|module=isOpenTongji&sort=1&search=")

    def __init__(self, username='', password='', othername=''):
        """Store the credentials; no network access happens here.

        ``othername`` is stored but not read by any method of this class.
        """
        self.__username = username
        self.__othername = othername
        self.__password = password
        # Replaced by a urllib2 opener in login(); methods that fetch pages
        # must not be called before login().
        self.__opener = ''
        self.__sitelist = []

    @staticmethod
    def _parse_page_count(html):
        """Return the total page count from the pager text, or None."""
        found = cnzz._PAGE_COUNT_RE.search(html)
        if found:
            return int(found.group('pagecount'))
        return None

    @staticmethod
    def _pages_needed(totalsite):
        """Return how many ``_SITES_PER_PAGE``-sized pages hold *totalsite*."""
        size = cnzz._SITES_PER_PAGE
        # Integer ceiling division; avoids the float that math.ceil yields
        # on Python 2.
        return (totalsite + size - 1) // size

    def _fetch_page(self, url, errors='strict'):
        """Fetch *url* with the session opener and return UTF-8 bytes.

        The response is decoded as GBK using the given ``errors`` policy.
        """
        raw = self.__opener.open(urllib2.Request(url)).read()
        return raw.decode('gbk', errors).encode('utf-8')

    def _fetch_with_retry(self, url):
        """Fetch *url*, retrying a bounded number of times on failure.

        The last failure is re-raised once the attempts are exhausted.
        """
        last_attempt = self._MAX_FETCH_ATTEMPTS - 1
        for attempt in range(self._MAX_FETCH_ATTEMPTS):
            try:
                return self._fetch_page(url)
            except Exception:
                # Best-effort retry, but never loop (or recurse) forever and
                # never swallow KeyboardInterrupt/SystemExit.
                if attempt == last_attempt:
                    raise

    def login(self):
        """POST the credentials and keep the resulting cookie session.

        Returns True when the response body contains either success marker,
        otherwise False.
        """
        cookie_handler = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
        self.__opener = urllib2.build_opener(cookie_handler)
        post_data = {
            'username': self.__username,
            'password': self.__password,
        }
        req = urllib2.Request('http://new.cnzz.com/user/login.php',
                              urllib.urlencode(post_data))
        loginhtml = self.__opener.open(req).read()
        # Membership tests instead of ``find(...) > 1`` so a marker at the
        # very start of the body is not mistaken for a failed login.
        # NOTE(review): the body is compared undecoded, so the second marker
        # only matches if the server answers in this file's encoding --
        # confirm against a real response.
        return ('_username' in loginhtml
                or '登陆进入旧版站长' in loginhtml)

    def getSiteListPageCount(self):
        """Return the number of site-list pages as an int.

        The count is derived from the ``totalsite`` figure reported by the
        site-list endpoint and the ``_SITES_PER_PAGE`` page size.
        """
        url = (self._SITE_LIST_URL + "&currentPage=1&pageType=30&_="
               + str(int(time.time())))
        payload = json.loads(self._fetch_page(url))
        totalsite = int(payload['data']['gettotallist']['totalsite'])
        return self._pages_needed(totalsite)

    def getuserdetail(self, url):
        """Fetch *url*, decode it as JSON, print the raw body, return data."""
        html = self._fetch_page(url)
        sitelist = json.loads(html)
        print(html)
        return sitelist

    def getSiteList(self):
        """Collect ``"<name>@@@<siteid>"`` entries from every list page.

        Prints progress and returns the list of entries.
        """
        pagecount = int(self.getSiteListPageCount())
        print("count page :" + str(pagecount))
        # Start from a clean list so repeated calls do not duplicate entries.
        self.__sitelist = []
        for page in range(1, pagecount + 1):
            url = (self._SITE_LIST_URL + "&currentPage=" + str(page)
                   + "&pageType=" + str(self._SITES_PER_PAGE)
                   + "&_=1385011097947")
            items = json.loads(self._fetch_page(url))['data']['list']['items']
            for site in items:
                self.__sitelist.append(site['name'] + "@@@" + site['siteid'])
        print('getSiteList OK')
        return self.__sitelist

    def yesterdayinfo(self, siteid=''):
        """Return yesterday's figures for *siteid*.

        The result is a two-element list whose second element is
        ``[y_pv, y_uv, y_ip]`` when the search returns exactly one site,
        and ``[(), ()]`` otherwise.  Prints ``no siteid`` and returns None
        when *siteid* is empty.  The fetch is retried a bounded number of
        times; the last error is re-raised if every attempt fails.
        """
        if siteid == '':
            print('no siteid')
            return None
        url = (self._SITE_LIST_URL + siteid + "&currentPage=1&pageType="
               + str(self._SITES_PER_PAGE) + "&_=1385012521584")
        html = self._fetch_with_retry(url)
        matches = json.loads(html)['data']['list']['items']
        siteyinfo = [(), ()]
        if len(matches) == 1:
            site = matches[0]
            siteyinfo[1] = [site['y_pv'], site['y_uv'], site['y_ip']]
        return siteyinfo

    def getSiteInfoByDate(self, siteid='', startdate='', enddate=''):
        """Return ``{key: [pv, uv, ip]}`` from the flow-trend endpoint.

        Each item's ``key`` is printed as it is processed.
        """
        url = ("http://tongji.cnzz.com/main.php?c=flow&a=trend"
               "&ajax=module%3Dsummary%7Cmodule%3DfluxList_currentPage%3D1"
               "_pageType%3D90&siteid=" + siteid + "&st=" + startdate
               + "&et=" + enddate + "&_=1385013202955")
        items = json.loads(self._fetch_page(url))['data']['fluxList']['items']
        siteinfo = {}
        for item in items:
            print(item['key'])
            siteinfo[item['key']] = [item['pv'], item['uv'], item['ip']]
        return siteinfo

    def getHistoryKeyByDate(self, siteid='', startdate='', enddate='', page=1):
        """Scrape the keyword list and return ``[key, snum, uv]`` rows.

        At most ``_MAX_KEY_PAGES`` pages are fetched.  ``page`` is accepted
        for interface compatibility but is not used.
        """
        base_url = ("http://new.cnzz.com/v1/main.php?siteid=" + siteid
                    + "&s=key&st=" + startdate + "&et=" + enddate)
        print(base_url)
        reported = self._parse_page_count(self._fetch_page(base_url))
        if reported is None:
            # Pager not found: fall back to the fixed page limit.
            pagecount = self._MAX_KEY_PAGES
        else:
            # Never request more pages than the site says exist.
            pagecount = min(reported, self._MAX_KEY_PAGES)
        keyinfos = []
        for page_no in range(1, pagecount + 1):
            print('正在抓取关键词列表第' + str(page_no) + '页')
            # Undecodable bytes are dropped on these pages.
            html = self._fetch_page(base_url + "&page=" + str(page_no),
                                    'ignore')
            # Example of the rows being matched:
            #   <td title='www.jxeea.cn '>www.jxeea.cn</td>
            #   <td class='all_right'>38024</td>
            #   <td class='all_right'>29859</td>
            for row in self._KEY_ROW_RE.findall(html):
                keyinfos.append([row[0], row[2], row[3]])
        print('共抓取关键词' + str(len(keyinfos)) + '个')
        return keyinfos

    def getKeyHistory(self, siteid='', startdate='', enddate='', key=''):
        """Return the highest and lowest daily ``uv`` for a keyword.

        *key* is a UTF-8 byte string; it is re-encoded as GBK and URL-quoted
        for the query.  Returns ``[max_item, min_item]`` where each item is
        a ``(day, uv)`` tuple, or ``[None, None]`` when no row matched.
        """
        url = ("http://new.cnzz.com/v1/main.php?siteid=" + siteid
               + "&s=history_key&st=" + startdate + "&et=" + enddate
               + "&url=" + urllib2.quote(key.decode('utf-8').encode('gbk')))
        html = self._fetch_page(url)
        uvlist = {}
        for day in reversed(dateutil.getDays2(startdate, enddate)):
            row_re = re.compile(
                r'<td> ' + day + r' </td>\s+<td class="num1">(?P<snum>.+)</td>'
                r'\s+<td class="num1">(?P<uv>.+)</td>',
                re.I)
            row = row_re.search(html)
            if row:
                label = day.replace('星期六', '').replace('星期天', '')
                uvlist[label] = int(row.group('uv'))
        ranked = self.sort_by_value(uvlist)
        if not ranked:
            # Nothing matched: report "no data" instead of an IndexError.
            return [None, None]
        return [ranked[-1], ranked[0]]

    def sort_by_value(self, d):
        """Return the ``(key, value)`` pairs of *d* sorted by value."""
        return sorted(d.items(), key=lambda item: item[1])
''' | |
if __name__ == '__main__': | |
CnzzTool=cnzz('cnzzusername','password','mygod') | |
if(CnzzTool.login()): | |
print "LoginOk" | |
#CnzzTool.getSiteListPageCount() | |
#CnzzTool.getSiteList() | |
#print CnzzTool.yesterdayinfo('2918848') | |
#CnzzTool.getSiteInfoByDate('2918848','2012-05-20','2012-05-24') | |
else: | |
print "LoginError" | |
''' |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
老版本的cnzz api 已过期