@kenee
Created March 29, 2016 13:06
Get a website's pages from the Google cache
#!/usr/bin/python
# -*- coding: utf-8 -*-
import urllib
import urllib2
import re
import socket
import os
import time
import random
import requests
from bs4 import BeautifulSoup
from urlparse import urlparse
import sys
reload(sys)
sys.setdefaultencoding('utf-8')  # allow writing non-ASCII pages with plain open()/write()
socket.setdefaulttimeout(30)
# adjust the target site here; search_term builds the "site:" query
search_site = "info.wdwd.com"
search_term = "site:" + search_site
import requests.packages.urllib3.util.ssl_
#print(requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS)
# allow every OpenSSL cipher suite instead of urllib3's restricted default list
requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS = 'ALL'
headers = {
"Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8 ",
"Accept-Encoding":"text/html",
"Accept-Language":"zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
"Content-Type":"application/x-www-form-urlencoded",
"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:45.0) Gecko/20100101 Firefox/45.0",
"Referer": "https://www.google.com.hk"
}
ss = requests.session()
ss.headers.update(headers)
def getSaveName(url):
    o = urlparse(url)
    if o.query:
        return o.query.replace('=', '_') + '.html'
    else:
        return 'index.html'
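# Example of the mapping above (illustrative URLs, not from the gist):
#   getSaveName("http://info.wdwd.com/post?id=42")  ->  "id_42.html"
#   getSaveName("http://info.wdwd.com/about")       ->  "index.html"
# Every query-less URL maps to "index.html", so later ones overwrite earlier ones.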
def main():
    # starting search-results URL; note the query hard-codes site:info.wdwd.com
    # rather than reusing search_term
    url = "https://www.google.com.hk/search?q=site:info.wdwd.com&safe=strict&biw=1278&bih=690&ei=-_T3VrbxNIej0QTa4ajwBw&start=110&sa=N"
    #url = "https://www.google.com.hk/search?q=site:info.wdwd.com&safe=strict&biw=1280&bih=648&ei=Wef3VoCnMeTsmAXN07ugCQ&start=200&sa=N"
    # this is the directory we will save files to
    try:
        os.mkdir(search_site)
    except OSError:
        pass
    counter = 0
    pagenum = 0
    more = True
    while more:
        pagenum += 1
        print "PAGE " + str(pagenum) + ": " + url
        r = ss.get(url)
        #r.encoding = 'utf-8'
        page = r.text
        # keep a copy of the raw results page for debugging
        f = open('ooo.html', 'w')
        f.write(page)
        f.close()
        #page = open('index.html').read()
        soup = BeautifulSoup(page, "lxml")
        #print soup
        items = {}
        # each search result sits in an element with class "rc"; the "Cached"
        # link carries class "fl" and the visible URL carries class "_Rm"
        for rc in soup.find_all(attrs={'class': 'rc'}):
            #print rc
            try:
                cache_href = rc.find(attrs={'class': 'fl'}).get('href')
            except AttributeError:
                # no cached copy for this result
                continue
            act_href = rc.find(attrs={'class': '_Rm'}).get_text().strip()
            print "Get %s" % act_href
            tmp_page = requests.get(cache_href).text
            savename = getSaveName(act_href)
            f = open(search_site + "/" + savename, 'w')
            f.write(tmp_page)
            f.close()
            # comment out the code below if you expect to crawl fewer than 50 pages
            random_interval = random.randrange(30, 40, 1)
            print "sleeping for: " + str(random_interval) + " seconds"
            time.sleep(random_interval)
        # now check whether there are more result pages
        pnnext = soup.find("a", {"id": "pnnext"})
        if pnnext is None:
            more = False
        else:
            url = "https://www.google.com.hk" + pnnext.get('href')

if __name__ == "__main__":
    main()
# vim: ai ts=4 sts=4 et sw=4
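Note: main() hard-codes the starting search URL, so search_term is defined but never used. A minimal sketch of building the first results page from it instead (an assumption, not part of the original gist; uses the already-imported urllib):

start_params = urllib.urlencode({'q': search_term, 'safe': 'strict', 'start': 0})
url = "https://www.google.com.hk/search?" + start_params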