[BOT] Google search scraper (Chrome/headless)
#!/usr/bin/env python
# @filename: chrome_remote_dbg.py
# @author: NullDotDEV
# @description: Download and parse Google search results from the command line.
# @last-updated: Thu Feb 1 05:31:28 -02 2018
# ===============================================================================
# HowTo: Using this script.
# Using this script is simple, just type:
# Example: $ python chrome_remote_dbg.py > list_of_google-search-results-dataset.txt
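#
# Assumed requirements (not spelled out in the original gist): the selenium
# package and a chromedriver binary matching your Chromium build:
#   $ pip install selenium
# Adjust chromium_driver_path below to wherever chromedriver lives on your system.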
from selenium import webdriver

# Path to the chromedriver binary; adjust for your system.
chromium_driver_path = '/usr/lib/chromium/chromedriver'
class Goo:
    def __init__(self, host, port):
        self._host = host
        self._port = port
        print('An instance of the Goo class was created\n'
              'Details of this instance:\n'
              'host:%s\n'
              'port:%s' % (host, port))
        self.options = webdriver.ChromeOptions()
        self.options.add_argument('--headless')
        self.options.add_argument('--no-sandbox')
        self.options.add_argument('--disable-gpu')
        #self.options.add_argument('--remote-debugging-port=9222')
        self.options.add_argument('--window-size=1200,800')
        self.chrome_drv = webdriver.Chrome(executable_path=chromium_driver_path,
                                           chrome_options=self.options)
        self._page_source = None
    def add_option(self, option):
        # Note: options only take effect if added before the driver is created.
        print('Setting up chromium webdriver option: %s' % option)
        self.options.add_argument(option)
    def get_page(self, page):
        print('Starting...')
        # driver = webdriver.Remote(command_executor=base_url, desired_capabilities=options.to_capabilities())
        self.chrome_drv.get(page)
        self._page_source = self.chrome_drv.page_source
        #print('The source page length is: %s' % len(self._page_source))
        # Result titles, snippets, and display URLs. The 'st' and '_Rm' class
        # names match Google's result markup at the time of writing and are
        # likely to change.
        temp_data = self.chrome_drv.find_elements_by_tag_name('h3')
        span_tag = self.chrome_drv.find_elements_by_class_name('st')
        ahref_urls = self.chrome_drv.find_elements_by_class_name('_Rm')
        #for h3_title, span_st, href_url in zip(temp_data, span_tag, ahref_urls):
        #    print('TITLE:%s\nDESC:%s\nURL:%s\n\n' % (h3_title.text, span_st.text, href_url.text))
        return (temp_data, span_tag, ahref_urls)
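    # A hedged sketch, not part of the original script: the same fetch, but
    # waiting for result <h3> elements to appear before scraping, using
    # Selenium's WebDriverWait. The 10-second timeout is an illustrative choice.
    def get_page_waiting(self, page, timeout=10):
        from selenium.webdriver.common.by import By
        from selenium.webdriver.support.ui import WebDriverWait
        from selenium.webdriver.support import expected_conditions as EC
        self.chrome_drv.get(page)
        WebDriverWait(self.chrome_drv, timeout).until(
            EC.presence_of_all_elements_located((By.TAG_NAME, 'h3')))
        return (self.chrome_drv.find_elements_by_tag_name('h3'),
                self.chrome_drv.find_elements_by_class_name('st'),
                self.chrome_drv.find_elements_by_class_name('_Rm'))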
    def go(self, url):
        self.chrome_drv.get(url)
""" | |
>>> import os,sys,requests | |
>>> from bs4 import BeautifulSoup | |
>>> from selenium import webdriver | |
self.options = webdriver.ChromeOptions() | |
>>> chrome_drv = webdriver.Chrome(executable_path='chromedriver', chrome_options=options,\ | |
service_args=['--dump-dom', 'https://httpbin.org']) | |
>>> chrome_drv.current_url | |
>>> data = chrome_drv.page_source | |
>>> soup = BeautifulSoup(data, 'lxml') | |
>>> type(data) | |
>>> chrome_drv.get('https://cse.google.com/?q=remix&cx=<CX_TOKEN>:<CX_TOKEN>') | |
>>> chrome_drv.title | |
>>> data = chrome_drv.page_source | |
>>> soup = BeautifulSoup(data, 'lxml') | |
>>> for anchor in soup.find_all('a'): | |
print(anchor.text) | |
#s_anchors.append(anchor) | |
for anchor in soup.find_all('a'): | |
tags = ['http://cse', 'https://code'] | |
if anchor.has_attr('href'): | |
for tag in tags: | |
if not anchor['href'].startswith('http://cse.')\ | |
and not anchor['href'].startswith('http://code.'): | |
print(anchor['href']) | |
#s_anchors.append(anchor) | |
from selenium.webdriver.common.keys import Keys | |
# select the search box field on google search | |
search_field.clear() | |
search_field.send_keys('inurl:google') | |
search_field.send_keys(Keys.ENTER) | |
search_field = chrome_drv.find_element_by_css_selector('#lst-ib') | |
# print text output if any | |
next_page_field.text | |
# google next page (2 of ???) | |
next_page_field = chrome_drv.find_element_by_css_selector('#nav > tbody > tr > td:nth-child(2) > a') | |
# google search results titles | |
next_page_field = chrome_drv.find_element_by_css_selector('#rso > div:nth-child(1) > div > div:nth-child(1) > div > div > h3 > a') | |
for i in range(2, 99): | |
print(chrome_drv.find_element_by_css_selector('#rso > div:nth-child(2) > div > div:nth-child(' + str(i) + ') > div > div > h3 > a').text) | |
#filter/map | |
>>> list_of_urls=[] | |
>>> for i in range(1,100): | |
list_of_urls.append(chrome_drv.find_element_by_css_selector('#rso > div > div > div:nth-child(' + str(i) + ') > div > div > div > div > div > cite').text.split(sep='...')[0].split(sep=' ')[0]) | |
""" | |
def main():
    host = 'localhost'
    port = '9222'
    # num=100 asks Google for up to 100 results on a single page.
    base_url = 'https://google.com.br/search?q=Google+Search&num=100'
    gg = Goo(host, port)
    gg.go('https://google.com.br/')
    g_data = gg.get_page(base_url)
    for title, desc, link in zip(g_data[0], g_data[1], g_data[2]):
        print('%s\n%s\n%s\n\n' % (title.text, desc.text, link.text))

if __name__ == '__main__':
    main()