Created December 3, 2014 18:34
import mechanize
import cookielib
import urlparse
import re
import time
import random
import csv
import pandas as pd
import pickle
import datetime
import os
user_agents = ['Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
               'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36',
               'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36',
               'Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0',
               'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0',
               'Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36',
               'Mozilla/4.61 [ja] (X11; I; Linux 2.6.13-33cmc1 i686)',
               'Opera/9.63 (X11; Linux x86_64; U; ru) Presto/2.1.1',
               'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/534.55.3 (KHTML, like Gecko) Version/5.1.3 Safari/534.53.10',
               'Opera/9.25 (Windows NT 5.1; U; en)',
               'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
               'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
               'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.142 Safari/535.19',
               'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) Gecko/20100101 Firefox/11.0',
               'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:8.0.1) Gecko/20100101 Firefox/8.0.1',
               'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.151 Safari/535.19',
               'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20100121 Firefox/3.5.6 Wyzo/3.5.6.1'
               ]
#DOWNLOAD DIRECTORY
DIRECTORY = 'YOUR DIRECTORY HERE'

def log(msg):
    print("{} {}".format(str(datetime.datetime.now()), msg))
class MiniBatchIterator(object):
    ''' mini-batch iterator '''
    def __init__(self, x, batch_size=4):
        self.x = x
        self.batch_size = batch_size

    def __iter__(self):
        n_samples = len(self.x)
        #ceiling division so the final, smaller batch is still yielded
        for i in xrange((n_samples + self.batch_size - 1) / self.batch_size):
            yield self.x[i * self.batch_size:(i + 1) * self.batch_size]
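# A minimal usage sketch (not part of the scraper itself): with the default
# batch_size of 4, a 10-item list yields batches of 4, 4 and 2 items.
#
#     for batch in MiniBatchIterator(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']):
#         print(batch)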
class TrendBrowser(object):
    ''' browser must have a valid gmail account, a valid gmail password, a list of user agents,
    and a valid directory'''
    def __init__(self, gmail_account, gmail_password, user_agents=user_agents, directory=DIRECTORY,
                 listofstuff=None, dictionary=None, dataframe=None):
        #time.sleep(random.randint(0,15))
        #Create the basic browser object
        os.chdir(directory)
        self.directory = directory
        if dictionary is None:
            self.dic = {}
            self.max_value = None
        else:
            self.dic = dictionary
            self.max_value = max(self.dic, key=self.dic.get)
        #if you had to stop and have an existing dataframe,
        #this will help
        if isinstance(dataframe, pd.DataFrame):
            self.df = dataframe
        else:
            self.df = pd.DataFrame()  #blank dataframe
        self.LIST = listofstuff
        self.error_log = {}
        #adds user agents
        self.user_agents = user_agents
        self.browser = mechanize.Browser()
        #Create a handler for cookies; this class can load and save cookies
        cookies = cookielib.LWPCookieJar()
        #Add it to the browser
        self.browser.set_cookiejar(cookies)
        #Ignore robots.txt, so we don't miss anything while scraping
        self.browser.set_handle_robots(False)
        #Allow refresh redirections
        self.browser.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
        #Add a user agent header to our browser
        #if you want proxies that work, this may do the trick
        #browser.set_proxies( {'http': proxies[random.randrange(0, len(proxies) )]} )
        #browser.addheaders = [('User-agent', ('Mozilla/5.0 (compatible; MSIE 9.0;','Windows NT 6.1; Trident/5.0)'))]
        self.browser.addheaders = [('User-Agent', self.user_agents[random.randrange(0, len(self.user_agents))])]
        #log in to Google so Trends will allow the CSV exports
        response = self.browser.open('https://accounts.google.com/ServiceLogin?hl=en&continue=https://www.google.com/')
        forms = mechanize.ParseResponse(response)
        form = forms[0]
        form['Email'] = gmail_account
        form['Passwd'] = gmail_password
        response = self.browser.open(form.click())
    #the following functions are basic functions to simply query google
    #trends for trend information
    def get(self, website):
        ''' this will get the html of any website you wish to go to'''
        source = self.browser.open(website).read()
        return source

    def trends_query(self, terms=[]):  #hard limit of 5 search items
        '''this function is exclusive to google trends'''
        if len(terms) > 5:
            return 'can only search for 5 items at a time'
        else:
            strings = ",".join(terms)
            query1 = 'https://www.google.com/trends/trendsReport?hl=en-US&q=' + strings + '&content=1&export=1'
            #this downloads the CSV report (open the URL only once)
            return self.browser.open(query1).read()
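    # A minimal sketch of what trends_query builds (the terms here are made up):
    # for terms=['python', 'java'] the request URL becomes
    #   https://www.google.com/trends/trendsReport?hl=en-US&q=python,java&content=1&export=1
    # and the returned string is the raw CSV export of the Trends report.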
    def trend_to_pandas(self, filen):
        repo = filen.split('\n')
        if repo != []:
            repo = repo[4:575]
            #the report is CSV text, so split the header and each row on commas
            repo_columns = repo[0].split(',')
            rows = [row.split(',') for row in repo[1:]]
            self.df = pd.DataFrame(rows, columns=repo_columns)
            #convert columns to floats
            for column in self.df.columns[1:]:  #skip the week column
                self.df[column] = self.df[column].apply(lambda x: float(x))
            return self.df
        else:
            log('error in trend_to_pandas')
    def START(self, filen):
        repo = filen.split('\n')
        if repo != []:
            repo = repo[4:575]
            repo_columns = repo[0].split(',')
            rows = [row.split(',') for row in repo[1:]]
            self.df = pd.DataFrame(rows, columns=repo_columns)
            #convert columns to floats
            for column in self.df.columns[1:]:  #skip the week column
                self.df[column] = self.df[column].apply(lambda x: float(x))
            for i in self.df.columns[1:]:  #skip the week column
                self.dic[i] = self.df[i].max()
            self.max_value = max(self.dic, key=self.dic.get)
            #save the dictionary and dataframe after every iteration
            pickle.dump(self.dic, open('dic.p', 'wb'))
            pickle.dump(self.df, open('df.p', 'wb'))
        else:
            num = len(self.error_log)
            self.error_log[num] = filen  #keep the offending report for inspection
            log('error in START, look at the error log')
    def UPDATER(self, filen):
        repo = filen.split('\n')
        if repo != []:
            repo = repo[4:575]
            repo_columns = repo[0].split(',')
            rows = [row.split(',') for row in repo[1:]]
            updatedf = pd.DataFrame(rows, columns=repo_columns)
            updates = {}
            #convert columns to floats
            for column in updatedf.columns[1:]:
                updatedf[column] = updatedf[column].apply(lambda x: float(x))
            for i in updatedf.columns[1:]:  #skip the week column
                updates[i] = updatedf[i].max()
            minibatch_max_value = max(updates, key=updates.get)
            if minibatch_max_value != self.max_value:
                old_max = self.max_value  #for the updating after the for loop
                for k, v in self.dic.items():
                    if k == self.max_value:
                        #update dataframe
                        self.df[self.max_value] = updatedf[self.max_value]
                        #update max dictionary
                        self.dic[self.max_value] = updates[self.max_value]
                    else:
                        #rescale the previously stored series onto the new scale
                        factor = self.dic[k] / updates[self.max_value]
                        self.df[k] = self.df[k] * factor
                        #update max dictionary
                        new_value = factor * self.dic[k]
                        self.dic[k] = new_value
                #update the max-value
                self.max_value = minibatch_max_value
                for k, v in updates.items():
                    if k == old_max:  #already updated, don't need to do it again.
                        pass
                    else:
                        self.dic[k] = v
            else:
                #only need to update the dic items normally
                for k, v in updates.items():
                    #update max dictionary
                    self.dic[k] = v
            #add the new minibatch columns to the stored dataframe
            for x in updatedf.columns[1:]:
                self.df[x] = updatedf[x]
            #self.df.append(updates)
            #save pickle files
            pickle.dump(self.dic, open('dic.p', 'wb'))
            pickle.dump(self.df, open('df.p', 'wb'))
        else:
            num = len(self.error_log)
            self.error_log[num] = filen  #keep the offending report for inspection
            log('ERROR in UPDATER, check the error log')
    def start_process(self):
        '''only use this method if you're starting the process;
        otherwise DON'T use it, as it removes items from your search list.'''
        New = random.sample(self.LIST, 5)
        for item in New:
            self.LIST.remove(item)
        #process 2: create minibatches from the remaining terms
        self.minibatches = MiniBatchIterator(self.LIST)
        #wait 15-20 seconds for the item to download
        #before starting the next process.
        time.sleep(random.randint(15, 20))
        log('searching for ' + str(New))
        self.START(self.trends_query(terms=New))
    def create_minibatches(self):
        #only use this minibatch creator if you DID NOT use the start_process function
        self.minibatches = MiniBatchIterator(self.LIST)

    def updater_process(self):
        for minibatch in self.minibatches:
            #sleep 1.5 - 2 min per download so you don't hit their
            #search quotas
            time.sleep(random.randint(90, 120))
            #the sleep is important, we need to wait for the item to download
            time.sleep(random.randint(15, 20))
            log('searching for ' + str(self.max_value) + ',' + str(minibatch))
            self.UPDATER(self.trends_query(terms=minibatch + [self.max_value]))
            #occasionally wander off to Wikipedia to look less like a bot
            if abs(random.gauss(0, 1)) < 0.1:
                for n_time in range(5, 15):
                    self.random_wiki()
        return self.dic
    #goes to a random page of wikipedia, then waits a random time
    #to again fool das google
    def random_wiki(self):
        self.get('http://en.wikipedia.org/wiki/Main_page')
        self.get('http://en.wikipedia.org/wiki/Special:Random')  #mechanize has no .get(), so use our wrapper
        time.sleep(abs(random.gauss(10, 3)))

    def pd_sorter(self):
        #TODO
        #can only be used when the top item is found.
        return ''
if __name__ == "__main__":
    os.chdir(DIRECTORY)
    SEARCH_TERMS = pickle.load(open('search.p', 'rb'))
    random.shuffle(SEARCH_TERMS)
    browser = TrendBrowser(gmail_account='EMAIL', gmail_password='PASSWORD', user_agents=user_agents, directory=DIRECTORY,
                           listofstuff=SEARCH_TERMS, dictionary=None, dataframe=None)
    print 'loading the sorter'
    #sorter=Sorter(list_of_terms,browser)
    print 'starting the process'
    browser.start_process()
    time.sleep(5)
    print 'updating the searcher, this will take some time'
    browser.updater_process()
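# A minimal resume sketch (assumes a previous run already wrote dic.p and df.p
# to DIRECTORY): reload the pickles and pass them to TrendBrowser so it keeps
# its running max and dataframe, then continue with updater_process() only.
#
#     old_dic = pickle.load(open('dic.p', 'rb'))
#     old_df = pickle.load(open('df.p', 'rb'))
#     browser = TrendBrowser(gmail_account='EMAIL', gmail_password='PASSWORD',
#                            listofstuff=SEARCH_TERMS, dictionary=old_dic, dataframe=old_df)
#     browser.create_minibatches()
#     browser.updater_process()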