Created December 3, 2014 18:34
import mechanize
import cookielib
import urlparse
import re
import time
import random
import csv
import pandas as pd
import pickle
import datetime
import os
user_agents = ['Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
               'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36',
               'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36',
               'Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0',
               'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0',
               'Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36',
               'Mozilla/4.61 [ja] (X11; I; Linux 2.6.13-33cmc1 i686)',
               'Opera/9.63 (X11; Linux x86_64; U; ru) Presto/2.1.1',
               'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/534.55.3 (KHTML, like Gecko) Version/5.1.3 Safari/534.53.10',
               'Opera/9.25 (Windows NT 5.1; U; en)',
               'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
               'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
               'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.142 Safari/535.19',
               'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) Gecko/20100101 Firefox/11.0',
               'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:8.0.1) Gecko/20100101 Firefox/8.0.1',
               'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.151 Safari/535.19',
               'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20100121 Firefox/3.5.6 Wyzo/3.5.6.1'
               ]
#DOWNLOAD DIRECTORY
DIRECTORY = 'YOUR DIRECTORY HERE'

def log(msg):
    print("{} {}".format(str(datetime.datetime.now()), msg))
class MiniBatchIterator(object):
    ''' mini-batch iterator '''
    def __init__(self, x, batch_size=4):
        self.x = x
        self.batch_size = batch_size

    def __iter__(self):
        n_samples = len(self.x)
        #ceiling division so the final, smaller batch is still yielded
        for i in xrange((n_samples + self.batch_size - 1) / self.batch_size):
            yield self.x[i * self.batch_size:(i + 1) * self.batch_size]
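# A minimal usage sketch (not part of the scraper itself): with the default
# batch_size of 4, a 10-item list yields batches of 4, 4 and 2 items.
#
#     for batch in MiniBatchIterator(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']):
#         print(batch)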
class TrendBrowser(object):
    ''' browser must have a valid gmail account, a valid gmail password, a list of user agents,
    and a valid directory'''
    def __init__(self, gmail_account, gmail_password, user_agents=user_agents, directory=DIRECTORY,
                 listofstuff=None, dictionary=None, dataframe=None):
        #time.sleep(random.randint(0,15))
        #Create the basic browser object
        os.chdir(directory)
        self.directory = directory
        if dictionary is None:
            self.dic = {}
            self.max_value = None
        else:
            self.dic = dictionary
            self.max_value = max(self.dic, key=self.dic.get)
        #if you had to stop and have an existing dataframe,
        #this will help
        if isinstance(dataframe, pd.DataFrame):
            self.df = dataframe
        else:
            self.df = pd.DataFrame()  #blank dataframe
        self.LIST = listofstuff
        self.error_log = {}
        #adds user agents
        self.user_agents = user_agents
        self.browser = mechanize.Browser()
        #Create a handler for cookies; this class can load and save cookies
        cookies = cookielib.LWPCookieJar()
        #Add it to the browser
        self.browser.set_cookiejar(cookies)
        #Ignore robots.txt, so we don't miss anything while scraping
        self.browser.set_handle_robots(False)
        #Allow refresh redirections
        self.browser.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
        #Add a user agent header to our browser
        #if you want proxies that work, this may do the trick
        #browser.set_proxies( {'http': proxies[random.randrange(0, len(proxies) )]} )
        #browser.addheaders = [('User-agent', ('Mozilla/5.0 (compatible; MSIE 9.0;','Windows NT 6.1; Trident/5.0)'))]
        self.browser.addheaders = [('User-Agent', self.user_agents[random.randrange(0, len(self.user_agents))])]
        #log in to Google so Trends will allow the CSV exports
        response = self.browser.open('https://accounts.google.com/ServiceLogin?hl=en&continue=https://www.google.com/')
        forms = mechanize.ParseResponse(response)
        form = forms[0]
        form['Email'] = gmail_account
        form['Passwd'] = gmail_password
        response = self.browser.open(form.click())
    #the following functions are basic functions to simply query google
    #trends for trend information
    def get(self, website):
        ''' this will get the html of any website you wish to go to'''
        source = self.browser.open(website).read()
        return source

    def trends_query(self, terms=[]):  #hard limit of 5 search items
        '''this function is exclusive to google trends'''
        if len(terms) > 5:
            return 'can only search for 5 items at a time'
        else:
            strings = ",".join(terms)
            query1 = 'https://www.google.com/trends/trendsReport?hl=en-US&q=' + strings + '&content=1&export=1'
            #this downloads the CSV report (open the URL only once)
            return self.browser.open(query1).read()
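    # A minimal sketch of what trends_query builds (the terms here are made up):
    # for terms=['python', 'java'] the request URL becomes
    #   https://www.google.com/trends/trendsReport?hl=en-US&q=python,java&content=1&export=1
    # and the returned string is the raw CSV export of the Trends report.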
    def trend_to_pandas(self, filen):
        repo = filen.split('\n')
        if repo != []:
            repo = repo[4:575]
            #the report is CSV text, so split the header and each row on commas
            repo_columns = repo[0].split(',')
            rows = [row.split(',') for row in repo[1:]]
            self.df = pd.DataFrame(rows, columns=repo_columns)
            #convert columns to floats
            for column in self.df.columns[1:]:  #skip the week column
                self.df[column] = self.df[column].apply(lambda x: float(x))
            return self.df
        else:
            log('error in trend_to_pandas')
    def START(self, filen):
        repo = filen.split('\n')
        if repo != []:
            repo = repo[4:575]
            repo_columns = repo[0].split(',')
            rows = [row.split(',') for row in repo[1:]]
            self.df = pd.DataFrame(rows, columns=repo_columns)
            #convert columns to floats
            for column in self.df.columns[1:]:  #skip the week column
                self.df[column] = self.df[column].apply(lambda x: float(x))
            for i in self.df.columns[1:]:  #skip the week column
                self.dic[i] = self.df[i].max()
            self.max_value = max(self.dic, key=self.dic.get)
            #save the dictionary and dataframe after every iteration
            pickle.dump(self.dic, open('dic.p', 'wb'))
            pickle.dump(self.df, open('df.p', 'wb'))
        else:
            num = len(self.error_log)
            self.error_log[num] = filen  #keep the offending report for inspection
            log('error in START, look at the error log')
    def UPDATER(self, filen):
        repo = filen.split('\n')
        if repo != []:
            repo = repo[4:575]
            repo_columns = repo[0].split(',')
            rows = [row.split(',') for row in repo[1:]]
            updatedf = pd.DataFrame(rows, columns=repo_columns)
            updates = {}
            #convert columns to floats
            for column in updatedf.columns[1:]:
                updatedf[column] = updatedf[column].apply(lambda x: float(x))
            for i in updatedf.columns[1:]:  #skip the week column
                updates[i] = updatedf[i].max()
            minibatch_max_value = max(updates, key=updates.get)
            if minibatch_max_value != self.max_value:
                old_max = self.max_value  #for the updating after the for loop
                for k, v in self.dic.items():
                    if k == self.max_value:
                        #update dataframe
                        self.df[self.max_value] = updatedf[self.max_value]
                        #update max dictionary
                        self.dic[self.max_value] = updates[self.max_value]
                    else:
                        #rescale the previously stored series onto the new scale
                        factor = self.dic[k] / updates[self.max_value]
                        self.df[k] = self.df[k] * factor
                        #update max dictionary
                        new_value = factor * self.dic[k]
                        self.dic[k] = new_value
                #update the max-value
                self.max_value = minibatch_max_value
                for k, v in updates.items():
                    if k == old_max:  #already updated, don't need to do it again.
                        pass
                    else:
                        self.dic[k] = v
            else:
                #only need to update the dic items normally
                for k, v in updates.items():
                    #update max dictionary
                    self.dic[k] = v
            #add the new minibatch columns to the stored dataframe
            for x in updatedf.columns[1:]:
                self.df[x] = updatedf[x]
            #self.df.append(updates)
            #save pickle files
            pickle.dump(self.dic, open('dic.p', 'wb'))
            pickle.dump(self.df, open('df.p', 'wb'))
        else:
            num = len(self.error_log)
            self.error_log[num] = filen  #keep the offending report for inspection
            log('ERROR in UPDATER, check the error log')
    def start_process(self):
        '''only use this method if you're starting the process;
        otherwise DON'T use it, as it removes items from your search list.'''
        New = random.sample(self.LIST, 5)
        for item in New:
            self.LIST.remove(item)
        #process 2: create minibatches from the remaining terms
        self.minibatches = MiniBatchIterator(self.LIST)
        #wait 15-20 seconds for the item to download
        #before starting the next process.
        time.sleep(random.randint(15, 20))
        log('searching for ' + str(New))
        self.START(self.trends_query(terms=New))
    def create_minibatches(self):
        #only use this minibatch creator if you DID NOT use the start_process function
        self.minibatches = MiniBatchIterator(self.LIST)

    def updater_process(self):
        for minibatch in self.minibatches:
            #sleep 1.5 - 2 min per download so you don't hit their
            #search quotas
            time.sleep(random.randint(90, 120))
            #the sleep is important, we need to wait for the item to download
            time.sleep(random.randint(15, 20))
            log('searching for ' + str(self.max_value) + ',' + str(minibatch))
            self.UPDATER(self.trends_query(terms=minibatch + [self.max_value]))
            #occasionally wander off to Wikipedia to look less like a bot
            if abs(random.gauss(0, 1)) < 0.1:
                for n_time in range(5, 15):
                    self.random_wiki()
        return self.dic
    #goes to a random page of wikipedia, then waits a random time
    #to again fool das google
    def random_wiki(self):
        self.get('http://en.wikipedia.org/wiki/Main_page')
        self.get('http://en.wikipedia.org/wiki/Special:Random')  #mechanize has no .get(), so use our wrapper
        time.sleep(abs(random.gauss(10, 3)))

    def pd_sorter(self):
        #TODO
        #can only be used when the top item is found.
        return ''
if __name__ == "__main__":
    os.chdir(DIRECTORY)
    SEARCH_TERMS = pickle.load(open('search.p', 'rb'))
    random.shuffle(SEARCH_TERMS)
    browser = TrendBrowser(gmail_account='EMAIL', gmail_password='PASSWORD', user_agents=user_agents, directory=DIRECTORY,
                           listofstuff=SEARCH_TERMS, dictionary=None, dataframe=None)
    print 'loading the sorter'
    #sorter=Sorter(list_of_terms,browser)
    print 'starting the process'
    browser.start_process()
    time.sleep(5)
    print 'updating the searcher, this will take some time'
    browser.updater_process()
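# A minimal resume sketch (assumes a previous run already wrote dic.p and df.p
# to DIRECTORY): reload the pickles and pass them to TrendBrowser so it keeps
# its running max and dataframe, then continue with updater_process() only.
#
#     old_dic = pickle.load(open('dic.p', 'rb'))
#     old_df = pickle.load(open('df.p', 'rb'))
#     browser = TrendBrowser(gmail_account='EMAIL', gmail_password='PASSWORD',
#                            listofstuff=SEARCH_TERMS, dictionary=old_dic, dataframe=old_df)
#     browser.create_minibatches()
#     browser.updater_process()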