Created
April 26, 2014 05:47
-
-
Save Harryyan/11312714 to your computer and use it in GitHub Desktop.
模拟chrome浏览器搜索关键字,返回第一页url
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
#coding:utf-8
import urllib,urllib2,cookielib,re,sys,os,time,random | |
# Cookie jar handed to every opener built by the fetch functions below.
cj = cookielib.CookieJar()

# Search phrases, one per keyword list.
str1 = 'Apple-Mon identifiant Apple'  # 0
str2 = 'Woolworths - Customer Satisfaction Survey'  # 0

# Keyword lists, each seeded with a single phrase.
vibramkey = [str1]
beatskey = [str2]

# Domains looked for in the search results, one list per keyword list.
vibramweb = ['mugentrading.net']
beatsweb = ['vafaelion.com']
# allweb=['vibramweb','beatsweb']
def searchkey(key, start, keydict, times):
    """Fetch one Google result page for *key* and log where tracked domains rank.

    The page is downloaded, saved to the file ``google``, and the text of
    every ``<cite>`` element is extracted with grep/sed (intermediate file
    ``tmp``).  For each domain in the module-level ``pinpai`` list, every
    cite line containing that domain appends one record to ``index3.html``.

    NOTE(review): ``pinpai`` is a global that this function only reads; in
    this file it is assigned solely by the commented-out search loop in the
    ``__main__`` section, so callers must define it before calling.

    Args:
        key: search phrase, inserted into the query URL as-is ("+" is shown
            as a space in the log record).
        start: result offset sent to Google; 0/10/20/30 are reported as
            pages 1-4, any other value as page 5.
        keydict: maps each key to a list of per-page result counts.
        times: number of leading entries of ``keydict[key]`` to add to the
            position on this page when computing the overall rank.

    Returns:
        None.  If anything in the download/extract step fails, the error is
        printed, ``changeip()`` is called and nothing is logged.
    """
    url = "http://www.google.com/search?hl=en&q=%s&revid=33815775&sa=X&ei=X6CbT4GrIoOeiQfth43GAw&ved=0CIgBENUCKAY&start=%s" % (key, start)
    try:
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        urllib2.install_opener(opener)
        response = urllib2.urlopen(urllib2.Request(url))
        try:
            content = response.read()
        finally:
            response.close()
        # Close (and thereby flush) the page file before grep reads it;
        # otherwise grep can see a partially written or empty file.
        with open('google', 'w') as f:
            f.write(content)
        os.system("grep -iop '<cite>.*</cite>' google | sed 's/^<cite>//'>tmp")
        pipe = os.popen("sed 's/<\\/cite>$//' tmp")
        try:
            tiqu = pipe.readlines()
        finally:
            pipe.close()
    except Exception as err:
        # Best effort: report the failure and switch routes, but let
        # KeyboardInterrupt / SystemExit propagate.
        print("search for %r failed: %s" % (key, err))
        changeip()
        return

    counts = keydict[key]
    total_size = sum(counts)
    print(counts)
    print("the size is:  %s" % total_size)

    # Values that do not depend on the individual result line.
    offset = sum(counts[0:times])
    page = {0: 1, 10: 2, 20: 3, 30: 4}.get(start, 5)
    lastkey = key.replace("+", " ")

    for domain in pinpai:
        print("domain is: %s" % domain)
        for position, topDomain in enumerate(tiqu, 1):
            # Substring test also matches a domain at the very start of the
            # cite text (index 0); an empty domain never matches.
            if not domain or domain not in topDomain:
                continue
            rank = position + offset
            # Guard against an empty/zero count list instead of dividing by 0.
            if total_size:
                ratio = float(total_size + 1 - rank) / total_size
            else:
                ratio = 0.0
            xinxi = "%s\t\t %s\t\t Page%s,rank%s,Rank Ratio:\t%f\n" % (topDomain, lastkey, page, rank, ratio)
            with open('index3.html', 'a') as xinxifile:
                xinxifile.write(xinxi)
def changeip():
    """Re-create the host route for google.com on a randomly chosen eth1 alias.

    Picks an integer 0-2, runs ``route delete`` followed by ``route add``
    for google.com with ``eth1:<n>`` through the shell, and prints the
    chosen number.  Presumably used to rotate the outgoing address after a
    failed request -- confirm against the deployment's network setup.
    """
    ip = random.randint(0, 2)
    commands = (
        "route delete -host google.com",
        "route add -host google.com eth1:%s" % ip,
    )
    for command in commands:
        os.system(command)
    print("changip to %s" % ip)
def readUrl(filename, webSites):
    """Append every line of *filename* to the list *webSites*.

    Newline characters are stripped from both ends of each line; blank
    lines are appended as empty strings.  The file is closed when reading
    finishes, even if an error occurs part-way through.

    Args:
        filename: path of the text file to read.
        webSites: list that is extended in place.

    Returns:
        None.
    """
    with open(filename) as myfile:
        for line in myfile:
            webSites.append(line.strip('\n'))
def getSize(key, start, totalCount):
    """Count the ``<cite>`` entries on one Google result page for *key*.

    The page is downloaded and saved to the file ``google``; grep/sed
    extract the cite texts into ``tmp2`` and the cleaned lines into
    ``tmp3``.  The number of lines in ``tmp3`` is appended to *totalCount*.

    Args:
        key: search phrase, inserted into the query URL as-is.
        start: result offset sent to Google.
        totalCount: list that receives the count for this page (mutated in
            place).

    Returns:
        None.  If anything in the download/extract step fails, the error is
        printed, ``changeip()`` is called and nothing is appended.
    """
    url = "http://www.google.com/search?hl=en&q=%s&revid=33815775&sa=X&ei=X6CbT4GrIoOeiQfth43GAw&ved=0CIgBENUCKAY&start=%s" % (key, start)
    try:
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        urllib2.install_opener(opener)
        response = urllib2.urlopen(urllib2.Request(url))
        try:
            content = response.read()
        finally:
            response.close()
        # Close (and thereby flush) the page file before grep reads it.
        with open('google', 'w') as f:
            f.write(content)
        os.system("grep -iop '<cite>.*</cite>' google | sed 's/^<cite>//'>tmp2")
        # The sed output goes to tmp3, so the pipe itself carries no lines;
        # read the result back from tmp3 to get the real entries.
        os.system("sed 's/<\\/cite>$//' tmp2 > tmp3")
        with open('tmp3') as cleaned:
            tiqu = cleaned.readlines()
        print("i am here")
    except Exception as err:
        # Best effort: report the failure and switch routes, but let
        # KeyboardInterrupt / SystemExit propagate.
        print("size lookup for %r failed: %s" % (key, err))
        changeip()
    else:
        totalCount.append(len(tiqu))
if __name__ == '__main__':
    # Script entry point: collect, for every keyword in vibramkey, the
    # per-page result counts reported by getSize().
    pinpaiid = 0          # only used by the disabled search phase below
    keydict = {}          # keyword -> list of per-page result counts
    pageSearch = [0]      # result offsets to request for each keyword

    for key in vibramkey:
        print(key)
        totalCount = []
        for start in pageSearch:
            getSize(key, start, totalCount)
        keydict[key] = totalCount

    # Disabled search phase, kept for reference.  It is the only place in
    # this file that assigns `pinpai`, which searchkey() reads as a global.
    # #search key
    # for x in vibramkey:
    #     if pinpaiid == 0:
    #         pinpai=vibramweb
    #     elif pinpaiid == 1:
    #         pinpai=beatsweb
    #     pinpaiid=pinpaiid+1
    #     for key in x:
    #         times = 0
    #         for start in pageSearch:
    #             searchkey(key,start,keydict,times)
    #             times += 1
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment