Last active
April 1, 2021 18:14
-
-
Save pablohfreitas/f445c7790e36542d6437d17cd6c264ed to your computer and use it in GitHub Desktop.
PLScrap_scraping_matches.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scrape every match page listed in `mlinks` and build two tables:
#   * df_matches - one row per (match, scout statistic) pair
#   * df_h2h     - one row per match with the head-to-head history
#
# Elements are located with the generic `find_element(strategy, value)` and
# `find_elements(strategy, value)` calls: the `find_element_by_*` shortcuts
# were removed in Selenium 4.3. The plain strings "xpath" and "tag name" are
# the values of `By.XPATH` and `By.TAG_NAME`, so no extra import is required.

# XPath fragments shared by the locators below.
_MATCH_ROOT = '//*[@id="mainContent"]/div/section[2]/div[2]'
_SCOREBOX = _MATCH_ROOT + '/section/div[3]/div/div'
_TABS_PANEL = _MATCH_ROOT + '/div[2]/div[2]/section[3]'
_SUBTABS = _TABS_PANEL + '/div[1]/ul/ul'
_H2H = _TABS_PANEL + '/div[2]/div[1]/div[1]/div/div[1]/section'


def _text_at(xpath):
    """Return the `.text` of the first element matching *xpath* on the current page."""
    return wd.find_element("xpath", xpath).text


def _click_at(xpath):
    """Click the first element matching *xpath* on the current page."""
    wd.find_element("xpath", xpath).click()


# Series with the link of every match to visit.
list_mlinks = mlinks['link_match']

# Accumulators for the collected rows (one dict per future DataFrame row).
data_matches = []
data_h2h = []

# Visit every match link.
for link in tqdm(list_mlinks):
    wd.get(link)   # open the match page
    time.sleep(1)  # give the page one second to load

    # General match information: team names, scores, season and date.
    home_team = _text_at(_SCOREBOX + '/div[1]/div[1]/a[2]/span[1]')
    away_team = _text_at(_SCOREBOX + '/div[1]/div[3]/a[2]/span[1]')
    result_full = _text_at(_SCOREBOX + '/div[1]/div[2]/div/div')
    result_ht = _text_at(_SCOREBOX + '/div[2]/div')
    # Season comes from the input table: first row whose link matches.
    season = mlinks.loc[mlinks['link_match'] == link, 'season'].iloc[0]
    date = _text_at(_MATCH_ROOT + '/section/div[1]/div/div[1]/div[1]')
    link_match = link

    # Open the match "Stats" tab.
    _click_at(_MATCH_ROOT + '/div[2]/div[1]/div/div/ul/li[3]')

    # Open the "Match Stats" sub-section and wait for it to render.
    _click_at(_SUBTABS + '/li[2]')
    time.sleep(1)

    # Table body holding every scout statistic of the match, one per <tr>.
    stats = wd.find_element("xpath", _TABS_PANEL + '/div[2]/div[2]/table/tbody')
    stats2 = stats.find_elements("tag name", 'tr')

    # Each row is read as: home value | scout name | away value.
    for s in stats2:
        # Fetch the cells once per row instead of once per column.
        cells = s.find_elements("tag name", 'td')
        scout_home = cells[0].text
        scout = cells[1].text
        scout_away = cells[2].text

        # One output row per scout, repeating the general match information.
        temp_data = {'season': season,
                     'date': date,
                     'home_team': home_team,
                     'away_team': away_team,
                     'result_full': result_full,
                     'result_ht': result_ht,
                     'scout': scout,
                     'scout_home': scout_home,
                     'scout_away': scout_away,
                     'link_match': link_match}
        data_matches.append(temp_data)

    # Open the "Head to Head" sub-section.
    _click_at(_SUBTABS + '/li[1]')

    # History of previous meetings between the two teams.
    h2h_played = _text_at(_H2H + '/div[3]/p[2]')
    h2h_draw = _text_at(_H2H + '/div[3]/p[3]/span')
    h2h_home_total_wins = _text_at(_H2H + '/div[1]/div[1]/div[1]/div[2]')
    h2h_away_total_wins = _text_at(_H2H + '/div[2]/div[1]/div[1]/div[2]')
    h2h_home_wins_home = _text_at(_H2H + '/div[1]/div[2]/div[1]/div[2]')
    h2h_away_wins_home = _text_at(_H2H + '/div[2]/div[2]/div[1]/div[2]')
    h2h_home_wins_away = _text_at(_H2H + '/div[1]/div[3]/div[1]/div[2]')
    h2h_away_wins_away = _text_at(_H2H + '/div[2]/div[3]/div[1]/div[2]')

    # One output row per match with the head-to-head figures.
    temp_data2 = {'link_match': link_match,
                  'h2h_played': h2h_played,
                  'h2h_draw': h2h_draw,
                  'h2h_home_total_wins': h2h_home_total_wins,
                  'h2h_away_total_wins': h2h_away_total_wins,
                  'h2h_home_wins_home': h2h_home_wins_home,
                  'h2h_away_wins_home': h2h_away_wins_home,
                  'h2h_home_wins_away': h2h_home_wins_away,
                  'h2h_away_wins_away': h2h_away_wins_away}
    data_h2h.append(temp_data2)

# Build one DataFrame for each of the two collected tables.
df_matches = pd.DataFrame(data_matches)
df_h2h = pd.DataFrame(data_h2h)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment