Last active
April 27, 2017 18:04
-
-
Save nenodias/9818e31d5542557e0f91392f50f708a4 to your computer and use it in GitHub Desktop.
Exemplo de crawler com Selenium e Python 3
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
"""Example Selenium crawler for Python 3.

Opens Firefox, searches YouTube for each term in ``lista``, walks up to
``MAX_PAGES`` result pages per term and prints the text and ``href`` of
every result link found on each page.
"""
import selenium
from pdb import set_trace
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup

# Number of result pages visited per search term.
MAX_PAGES = 5


def _wait_for_navigation(driver):
    """Wait for the element with id ``progress`` to appear and then go stale.

    A timeout is tolerated on purpose: the crawl is best-effort and simply
    continues with whatever page source is available at that point.
    """
    try:
        progress = driver.wait.until(
            EC.presence_of_element_located((By.ID, "progress"))
        )
        driver.wait.until(EC.staleness_of(progress))
    except TimeoutException:
        # Best-effort wait: fall through and parse the current page.
        pass


def _print_results(html):
    """Print text and ``href`` of each result link in *html*, then a rule."""
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and to avoid bs4's warning).
    soup = BeautifulSoup(html, "html.parser")
    results_card = soup.find('div', {"class": "yt-card"})
    # ``find`` returns None when the container is absent; skip the links
    # in that case instead of failing with a TypeError.
    if results_card is not None:
        for link in results_card('a', {'rel': 'spf-prefetch'}):
            print(link.text)
            print("\n")
            print(link['href'])
            print("\n")
    print('-' * 80)
    print("\n")


# Open the browser.
browser = webdriver.Firefox()
browser.wait = WebDriverWait(browser, 5)
lista = ['bauru', 'taquaitinga', 'matao']
try:
    for item in lista:
        browser.get("https://www.youtube.com")
        # NOTE(review): these element ids and the "next" label below are taken
        # as-is from the original script; confirm they still match the page.
        caixa_pesquisa = browser.find_element(By.ID, 'masthead-search-terms')
        botao_pesquisa = browser.find_element(By.ID, 'search-btn')
        # Empty the input before typing the search term.
        caixa_pesquisa.clear()
        caixa_pesquisa.send_keys(item)
        botao_pesquisa.click()
        for page in range(1, MAX_PAGES + 1):
            if page > 1:
                # Pages after the first are reached through the "next" button.
                botoes_next = browser.find_elements(
                    By.XPATH, '//span[text()="Próximo »"]')
                if not botoes_next:
                    # No further page: stop rather than re-print this one.
                    break
                botoes_next[0].click()
                _wait_for_navigation(browser)
            _print_results(browser.page_source)
finally:
    # Always end the WebDriver session, even when the crawl fails midway,
    # so that no browser/driver process is left running.
    browser.quit()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment