Created
August 24, 2018 21:21
-
-
Save schochastics/72f65e449138b685124611384ef6ec04 to your computer and use it in GitHub Desktop.
Scrape mean age and market values for European football leagues
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
library(rvest) | |
library(ggimage) | |
library(lubridate) | |
#get first 25 leagues in Europe ----
# Read the European competitions overview and collect the relative links
# of the league pages.
url <- "https://www.transfermarkt.de/wettbewerbe/europa"
doc <- read_html(url)
leagues <- doc %>% html_nodes(".hauptlink a") %>% html_attr("href")
# Keep every second href (positions 2, 4, ...). Presumably each league row
# yields two matching links -- confirm against the page. The modulo mask,
# unlike seq(2, length(leagues), 2), does not fail when fewer than two
# links were matched.
leagues <- leagues[seq_along(leagues) %% 2 == 0]
if (length(leagues) == 0) {
  stop("no league links found on ", url, call. = FALSE)
}
#function to scrape and plot ----
#' Scrape one league page and plot club age against market value
#'
#' @param team_url Relative path of a league page on transfermarkt.de (for
#'   example an `href` taken from the competitions overview). It is appended
#'   to the site's base URL.
#' @return A list with two elements: `data`, one row per club with the
#'   columns `name`, `age`, `mw` (market value in million Euro), `wappen`
#'   (crest image URL) and `league`; and `plot`, the ggplot object.
plot_age_mv <- function(team_url){
  base_url <- "https://www.transfermarkt.de"
  doc <- read_html(paste0(base_url, team_url))

  # First ".items" table on the page; its first row is dropped.
  teams <- doc %>%
    html_nodes(".items") %>%
    html_table(fill = TRUE) %>%
    .[[1]] %>%
    .[-1, ]

  # Crest image URLs, de-duplicated.
  wappen <- doc %>%
    html_nodes("td .tiny_wappen") %>%
    html_attr("src") %>%
    unique()

  teams <- janitor::clean_names(teams)

  # Unit suffix of the market value: "Mrd" (billions), "Mio" (millions) or
  # "Tsd" (thousands).
  teams$val <- teams$gesamtmarktwert %>% str_extract("[a-zA-Z]+")

  #league name
  league <- doc %>% html_nodes(".spielername-profil") %>% html_text()

  #league country
  country <- doc %>%
    html_table(fill = TRUE) %>%
    .[[1]] %>%
    .$X2 %>%
    .[1] %>%
    word(2, sep = "-") %>%
    str_trim()

  #get team name, mean age and mean value
  # NOTE(review): `kader` is parsed as the age and `gesamtmarktwert` as the
  # value shown as "average player value". Presumably the scraped header is
  # shifted relative to the data columns -- confirm against the live page.
  teams <- teams %>%
    select(name, kader, gesamtmarktwert, val) %>%
    mutate(age = as.numeric(str_replace(kader, ",", "."))) %>%
    mutate(mw = str_replace(gesamtmarktwert, "[a-zA-Z]+\\. €", "")) %>%
    mutate(mw = as.numeric(str_replace(mw, ",", "."))) %>%
    # Express every value in million Euro. Billions used to fall into the
    # "divide by 1000" branch, which shrank them by a factor of a million.
    mutate(mw = case_when(
      val == "Mrd" ~ mw * 1000,
      val == "Mio" ~ mw,
      TRUE ~ mw / 1000
    )) %>%
    select(name, age, mw)

  #get bigger crests
  teams$wappen <- str_replace(wappen, "/tiny/", "/head/")
  teams$league <- league

  p <- ggplot(teams, aes(x = age, y = mw)) +
    geom_image(aes(image = wappen)) +
    labs(x = "average player age", y = "average player value (million Euro)",
         caption = "data from transfermarkt.de",
         title = paste0(league, "(", country, ")")) +
    hrbrthemes::theme_ipsum_rc()

  list(data = teams, plot = p)
}
#loop over all leagues and save the plots
# Each league's table is stored in a pre-sized list and bound once at the
# end, instead of re-copying a growing data frame on every iteration.
league_data <- vector("list", length(leagues))
for(i in seq_along(leagues)){
  print(i)
  res <- plot_age_mv(leagues[i])
  # The plot title doubles as the file name of the saved PNG.
  out_name <- paste0(res$plot$labels$title, ".png")
  ggsave(out_name, res$plot, width = 5, height = 5)
  league_data[[i]] <- res$data
}
# The leading empty tibble keeps the result a tibble, as before.
df <- bind_rows(tibble(), league_data)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment