@kenee
Created March 29, 2016 13:06
Get a website's pages from the Google cache
#!/usr/bin/python
# -*- coding: utf-8 -*-
import urllib
import urllib2
import re
import socket
import os
import time
import random
import requests
from bs4 import BeautifulSoup
from urlparse import urlparse
import sys
reload(sys)
sys.setdefaultencoding('utf-8')  # allow writing non-ASCII pages with plain open()/write()
socket.setdefaulttimeout(30)
# adjust the target site here; search_term builds the "site:" query
search_site = "info.wdwd.com"
search_term = "site:" + search_site
import requests.packages.urllib3.util.ssl_
#print(requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS)
# allow every OpenSSL cipher suite instead of urllib3's restricted default list
requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS = 'ALL'
headers = {
"Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8 ",
"Accept-Encoding":"text/html",
"Accept-Language":"zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
"Content-Type":"application/x-www-form-urlencoded",
"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:45.0) Gecko/20100101 Firefox/45.0",
"Referer": "https://www.google.com.hk"
}
ss = requests.session()
ss.headers.update(headers)
def getSaveName(url):
    o = urlparse(url)
    if o.query:
        return o.query.replace('=', '_') + '.html'
    else:
        return 'index.html'
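# Example of the mapping above (illustrative URLs, not from the gist):
#   getSaveName("http://info.wdwd.com/post?id=42")  ->  "id_42.html"
#   getSaveName("http://info.wdwd.com/about")       ->  "index.html"
# Every query-less URL maps to "index.html", so later ones overwrite earlier ones.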
def main():
    # starting search-results URL; note the query hard-codes site:info.wdwd.com
    # rather than reusing search_term
    url = "https://www.google.com.hk/search?q=site:info.wdwd.com&safe=strict&biw=1278&bih=690&ei=-_T3VrbxNIej0QTa4ajwBw&start=110&sa=N"
    #url = "https://www.google.com.hk/search?q=site:info.wdwd.com&safe=strict&biw=1280&bih=648&ei=Wef3VoCnMeTsmAXN07ugCQ&start=200&sa=N"
    # this is the directory we will save files to
    try:
        os.mkdir(search_site)
    except OSError:
        pass
    counter = 0
    pagenum = 0
    more = True
    while more:
        pagenum += 1
        print "PAGE " + str(pagenum) + ": " + url
        r = ss.get(url)
        #r.encoding = 'utf-8'
        page = r.text
        # keep a copy of the raw results page for debugging
        f = open('ooo.html', 'w')
        f.write(page)
        f.close()
        #page = open('index.html').read()
        soup = BeautifulSoup(page, "lxml")
        #print soup
        items = {}
        # each search result sits in an element with class "rc"; the "Cached"
        # link carries class "fl" and the visible URL carries class "_Rm"
        for rc in soup.find_all(attrs={'class': 'rc'}):
            #print rc
            try:
                cache_href = rc.find(attrs={'class': 'fl'}).get('href')
            except AttributeError:
                # no cached copy for this result
                continue
            act_href = rc.find(attrs={'class': '_Rm'}).get_text().strip()
            print "Get %s" % act_href
            tmp_page = requests.get(cache_href).text
            savename = getSaveName(act_href)
            f = open(search_site + "/" + savename, 'w')
            f.write(tmp_page)
            f.close()
            # comment out the code below if you expect to crawl fewer than 50 pages
            random_interval = random.randrange(30, 40, 1)
            print "sleeping for: " + str(random_interval) + " seconds"
            time.sleep(random_interval)
        # now check whether there are more result pages
        pnnext = soup.find("a", {"id": "pnnext"})
        if pnnext is None:
            more = False
        else:
            url = "https://www.google.com.hk" + pnnext.get('href')

if __name__ == "__main__":
    main()
# vim: ai ts=4 sts=4 et sw=4
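Note: main() hard-codes the starting search URL, so search_term is defined but never used. A minimal sketch of building the first results page from it instead (an assumption, not part of the original gist; uses the already-imported urllib):

start_params = urllib.urlencode({'q': search_term, 'safe': 'strict', 'start': 0})
url = "https://www.google.com.hk/search?" + start_params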