[BOT] Google search scraper (Chrome/headless)
#!/usr/bin/env python
# @filename: chrome_remote_dbg.py
# @author: NullDotDEV
# @description: Download and parse Google search results from the command line.
# @last-updated: Thu Feb 1 05:31:28 -02 2018
# ===============================================================================
# HowTo: Using this script.
# Using this script is simple, just type:
# Example: $ python chrome_remote_dbg.py > list_of_google-search-results-dataset.txt
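#
# Assumed requirements (not spelled out in the original gist): the selenium
# package and a chromedriver binary matching your Chromium build:
#   $ pip install selenium
# Adjust chromium_driver_path below to wherever chromedriver lives on your system.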
from selenium import webdriver

# Path to the chromedriver binary; adjust for your system.
chromium_driver_path = '/usr/lib/chromium/chromedriver'
class Goo:
    def __init__(self, host, port):
        self._host = host
        self._port = port
        print('An instance of the Goo class was created\n'
              'Details of this instance:\n'
              'host:%s\n'
              'port:%s' % (host, port))
        self.options = webdriver.ChromeOptions()
        self.options.add_argument('--headless')
        self.options.add_argument('--no-sandbox')
        self.options.add_argument('--disable-gpu')
        #self.options.add_argument('--remote-debugging-port=9222')
        self.options.add_argument('--window-size=1200,800')
        self.chrome_drv = webdriver.Chrome(executable_path=chromium_driver_path,
                                           chrome_options=self.options)
        self._page_source = None
    def add_option(self, option):
        # Note: options only take effect if added before the driver is created.
        print('Setting up chromium webdriver option: %s' % option)
        self.options.add_argument(option)
    def get_page(self, page):
        print('Starting...')
        # driver = webdriver.Remote(command_executor=base_url, desired_capabilities=options.to_capabilities())
        self.chrome_drv.get(page)
        self._page_source = self.chrome_drv.page_source
        #print('The source page length is: %s' % len(self._page_source))
        # Result titles, snippets, and display URLs. The 'st' and '_Rm' class
        # names match Google's result markup at the time of writing and are
        # likely to change.
        temp_data = self.chrome_drv.find_elements_by_tag_name('h3')
        span_tag = self.chrome_drv.find_elements_by_class_name('st')
        ahref_urls = self.chrome_drv.find_elements_by_class_name('_Rm')
        #for h3_title, span_st, href_url in zip(temp_data, span_tag, ahref_urls):
        #    print('TITLE:%s\nDESC:%s\nURL:%s\n\n' % (h3_title.text, span_st.text, href_url.text))
        return (temp_data, span_tag, ahref_urls)
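    # A hedged sketch, not part of the original script: the same fetch, but
    # waiting for result <h3> elements to appear before scraping, using
    # Selenium's WebDriverWait. The 10-second timeout is an illustrative choice.
    def get_page_waiting(self, page, timeout=10):
        from selenium.webdriver.common.by import By
        from selenium.webdriver.support.ui import WebDriverWait
        from selenium.webdriver.support import expected_conditions as EC
        self.chrome_drv.get(page)
        WebDriverWait(self.chrome_drv, timeout).until(
            EC.presence_of_all_elements_located((By.TAG_NAME, 'h3')))
        return (self.chrome_drv.find_elements_by_tag_name('h3'),
                self.chrome_drv.find_elements_by_class_name('st'),
                self.chrome_drv.find_elements_by_class_name('_Rm'))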
    def go(self, url):
        self.chrome_drv.get(url)
""" | |
>>> import os,sys,requests | |
>>> from bs4 import BeautifulSoup | |
>>> from selenium import webdriver | |
self.options = webdriver.ChromeOptions() | |
>>> chrome_drv = webdriver.Chrome(executable_path='chromedriver', chrome_options=options,\ | |
service_args=['--dump-dom', 'https://httpbin.org']) | |
>>> chrome_drv.current_url | |
>>> data = chrome_drv.page_source | |
>>> soup = BeautifulSoup(data, 'lxml') | |
>>> type(data) | |
>>> chrome_drv.get('https://cse.google.com/?q=remix&cx=<CX_TOKEN>:<CX_TOKEN>') | |
>>> chrome_drv.title | |
>>> data = chrome_drv.page_source | |
>>> soup = BeautifulSoup(data, 'lxml') | |
>>> for anchor in soup.find_all('a'): | |
print(anchor.text) | |
#s_anchors.append(anchor) | |
for anchor in soup.find_all('a'): | |
tags = ['http://cse', 'https://code'] | |
if anchor.has_attr('href'): | |
for tag in tags: | |
if not anchor['href'].startswith('http://cse.')\ | |
and not anchor['href'].startswith('http://code.'): | |
print(anchor['href']) | |
#s_anchors.append(anchor) | |
from selenium.webdriver.common.keys import Keys | |
# select the search box field on google search | |
search_field.clear() | |
search_field.send_keys('inurl:google') | |
search_field.send_keys(Keys.ENTER) | |
search_field = chrome_drv.find_element_by_css_selector('#lst-ib') | |
# print text output if any | |
next_page_field.text | |
# google next page (2 of ???) | |
next_page_field = chrome_drv.find_element_by_css_selector('#nav > tbody > tr > td:nth-child(2) > a') | |
# google search results titles | |
next_page_field = chrome_drv.find_element_by_css_selector('#rso > div:nth-child(1) > div > div:nth-child(1) > div > div > h3 > a') | |
for i in range(2, 99): | |
print(chrome_drv.find_element_by_css_selector('#rso > div:nth-child(2) > div > div:nth-child(' + str(i) + ') > div > div > h3 > a').text) | |
#filter/map | |
>>> list_of_urls=[] | |
>>> for i in range(1,100): | |
list_of_urls.append(chrome_drv.find_element_by_css_selector('#rso > div > div > div:nth-child(' + str(i) + ') > div > div > div > div > div > cite').text.split(sep='...')[0].split(sep=' ')[0]) | |
""" | |
def main():
    host = 'localhost'
    port = '9222'
    # num=100 asks Google for up to 100 results on a single page.
    base_url = 'https://google.com.br/search?q=Google+Search&num=100'
    gg = Goo(host, port)
    gg.go('https://google.com.br/')
    g_data = gg.get_page(base_url)
    for title, desc, link in zip(g_data[0], g_data[1], g_data[2]):
        print('%s\n%s\n%s\n\n' % (title.text, desc.text, link.text))

if __name__ == '__main__':
    main()