Skip to content

Instantly share code, notes, and snippets.

@pablohfreitas
Last active April 1, 2021 18:14
Show Gist options
  • Save pablohfreitas/f445c7790e36542d6437d17cd6c264ed to your computer and use it in GitHub Desktop.
Save pablohfreitas/f445c7790e36542d6437d17cd6c264ed to your computer and use it in GitHub Desktop.
PLScrap_scraping_matches.py
# Scrape every match page listed in `mlinks` and build two tables:
#   * df_matches - one row per match statistic ("scout") plus match metadata
#   * df_h2h     - one row per match with the head-to-head history
# Relies on names defined elsewhere: `mlinks` (a DataFrame with the columns
# 'link_match' and 'season'), `wd` (presumably a Selenium WebDriver - confirm),
# `tqdm`, `time` and `pd`.

# Build the list with the link of every match.
list_mlinks = mlinks['link_match']

# Map each link to its season once, instead of filtering the whole DataFrame
# on every iteration. `setdefault` keeps the first occurrence of a link, which
# is what the previous per-link `.iloc[0]` lookup returned.
season_by_link = {}
for _link, _season in zip(mlinks['link_match'], mlinks['season']):
    season_by_link.setdefault(_link, _season)

# Accumulators for the scraped rows.
data_matches = []
data_h2h = []

# Shared XPath prefixes; concatenated with the suffixes below they produce
# exactly the absolute XPaths used for each element.
_XP_MAIN = '//*[@id="mainContent"]/div/section[2]/div[2]'
_XP_SCOREBOX = _XP_MAIN + '/section/div[3]/div/div'
_XP_TAB_PANEL = _XP_MAIN + '/div[2]/div[2]/section[3]'
_XP_H2H = _XP_TAB_PANEL + '/div[2]/div[1]/div[1]/div/div[1]/section'


def _xpath_text(xpath):
    """Return the text of the first element matching *xpath* on the current page."""
    # The generic find_element(by, value) form is used because the
    # find_element_by_* helpers were removed in Selenium 4.3; the generic
    # form is also available in Selenium 3.
    return wd.find_element("xpath", xpath).text


# Visit every match link.
for link in tqdm(list_mlinks):
    wd.get(link)  # open the match page
    time.sleep(1)  # wait 1 second for the page to load

    # Match header information: team names, results, season and date.
    home_team = _xpath_text(_XP_SCOREBOX + '/div[1]/div[1]/a[2]/span[1]')
    away_team = _xpath_text(_XP_SCOREBOX + '/div[1]/div[3]/a[2]/span[1]')
    result_full = _xpath_text(_XP_SCOREBOX + '/div[1]/div[2]/div/div')
    result_ht = _xpath_text(_XP_SCOREBOX + '/div[2]/div')
    season = season_by_link[link]
    date = _xpath_text(_XP_MAIN + '/section/div[1]/div/div[1]/div[1]')
    link_match = link

    # Click the match "Stats" tab.
    wd.find_element("xpath", _XP_MAIN + '/div[2]/div[1]/div/div/ul/li[3]').click()

    # Click the "Match Stats" sub-section and give it a second to render.
    wd.find_element("xpath", _XP_TAB_PANEL + '/div[1]/ul/ul/li[2]').click()
    time.sleep(1)

    # Table body that holds every statistic row of the match.
    stats = wd.find_element("xpath", _XP_TAB_PANEL + '/div[2]/div[2]/table/tbody')
    stats2 = stats.find_elements("tag name", 'tr')

    # Each row is laid out as: home value | statistic name | away value.
    for s in stats2:
        # Fetch the cells once per row rather than once per column.
        cells = s.find_elements("tag name", 'td')
        scout = cells[1].text
        scout_home = cells[0].text
        scout_away = cells[2].text
        # One output row per statistic, repeated with the match metadata.
        temp_data = {
            'season': season,
            'date': date,
            'home_team': home_team,
            'away_team': away_team,
            'result_full': result_full,
            'result_ht': result_ht,
            'scout': scout,
            'scout_home': scout_home,
            'scout_away': scout_away,
            'link_match': link_match,
        }
        data_matches.append(temp_data)

    # Click the "Head to Head" sub-section.
    # NOTE(review): no wait follows this click, so the reads below depend on
    # the panel already being rendered - confirm this is reliable.
    wd.find_element("xpath", _XP_TAB_PANEL + '/div[1]/ul/ul/li[1]').click()

    # Head-to-head history between the two teams.
    h2h_played = _xpath_text(_XP_H2H + '/div[3]/p[2]')
    h2h_draw = _xpath_text(_XP_H2H + '/div[3]/p[3]/span')
    h2h_home_total_wins = _xpath_text(_XP_H2H + '/div[1]/div[1]/div[1]/div[2]')
    h2h_away_total_wins = _xpath_text(_XP_H2H + '/div[2]/div[1]/div[1]/div[2]')
    h2h_home_wins_home = _xpath_text(_XP_H2H + '/div[1]/div[2]/div[1]/div[2]')
    h2h_away_wins_home = _xpath_text(_XP_H2H + '/div[2]/div[2]/div[1]/div[2]')
    h2h_home_wins_away = _xpath_text(_XP_H2H + '/div[1]/div[3]/div[1]/div[2]')
    h2h_away_wins_away = _xpath_text(_XP_H2H + '/div[2]/div[3]/div[1]/div[2]')

    # One output row per match with the head-to-head figures.
    temp_data2 = {
        'link_match': link_match,
        'h2h_played': h2h_played,
        'h2h_draw': h2h_draw,
        'h2h_home_total_wins': h2h_home_total_wins,
        'h2h_away_total_wins': h2h_away_total_wins,
        'h2h_home_wins_home': h2h_home_wins_home,
        'h2h_away_wins_home': h2h_away_wins_home,
        'h2h_home_wins_away': h2h_home_wins_away,
        'h2h_away_wins_away': h2h_away_wins_away,
    }
    data_h2h.append(temp_data2)

# Build one DataFrame per table from the collected rows.
df_matches = pd.DataFrame(data_matches)
df_h2h = pd.DataFrame(data_h2h)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment