Last active
August 29, 2015 14:02
-
-
Save fformenti/7c14b9b6681e038db860 to your computer and use it in GitHub Desktop.
World Cup 2014 Blogpost
We can make this file beautiful and searchable if this error is corrected: No commas found in this CSV file in line 0.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Club;Ranking | |
Spain Real Madrid;1 | |
Germany Bayern Munich;2 | |
Italy Juventus;3 | |
England Manchester City;4 | |
Spain Barcelona;5 | |
Germany Borussia Dortmund;6 | |
France Paris Saint-Germain;7 | |
Spain Atl?tico Madrid;8 | |
England Liverpool;9 | |
England Chelsea;10 | |
Portugal Benfica;11 | |
England Arsenal;12 | |
Italy Napoli;13 | |
England Manchester United;14 | |
Italy Roma;15 | |
England Everton;16 | |
Ukraine Shakhtar Donetsk;17 | |
Germany Schalke 04;18 | |
Spain Athletic Bilbao;19 | |
Netherlands Ajax;20 | |
Portugal Porto;21 | |
Greece Olympiacos;22 | |
Germany VfL Wolfsburg;23 | |
Czech Sparta;24 | |
England Tottenham Hotspur;25 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
data_path_original <- '../../data/original/' | |
d<-readLines(paste0(data_path_original,'cup.txt')) | |
#d<-readLines("cup.txt") | |
(itab<-grep(pattern="DoB/Age",x=d)) | |
icoach<-coach<-grep(pattern="Coach",x=d) | |
ictry<-coach-1 | |
(endtabs<-c(ictry[-length(ictry)]+(diff(ictry))-1,length(d)-1)) | |
(begintabs<-itab+1) | |
tabs<-vector("list",length(icoach)) | |
hdr<-""; | |
for(i in 1:length(tabs)){ | |
(ti<-d[begintabs[i]:endtabs[i]]) | |
if(i==1){(hdr<-unlist(strsplit(d[begintabs[i]-1],"\t"))) } | |
(dat<-d[(itab[i]+1):(itab[i]+23)]) | |
(dat<-data.frame(do.call("rbind",strsplit(dat,"\t")))) | |
names(dat)<-hdr | |
#dat<-apply(dat,2,as.character) | |
(dat$country<-d[ictry[i]]) | |
(dat$group<-ceiling(i/4+0)) | |
(dat$age<-as.numeric(substr(do.call("rbind",strsplit(as.character(dat$"DoB/Age"),"aged"))[,2],1,3))) | |
#print(dat) | |
tabs[[i]]<-dat | |
#scan(n=1) | |
} | |
orderdf<-function(df2,ncol=1){ | |
if(!is.numeric(ncol)){ncol=which(names(df2)==ncol)} | |
df2<-df2[order(df2[,ncol],decreasing=T),] | |
Pos<-1:nrow(df2) | |
df2 <- within(df2,Pos <- factor(Pos,levels=names(sort(table(Pos),decreasing=TRUE)))) | |
return(df2) | |
} | |
library(fields) | |
Kol<-colorRampPalette(c("lightblue","orange"))(32) | |
tKol<-rep("black",32) | |
d<-do.call("rbind",tabs) | |
d$League<-unlist(lapply(strsplit(as.character(d$Club), " "),function(x) x[1])) | |
d$League[d$League=="South"]<-"South Korea" | |
d$League[d$League=="United"]<-"United States" | |
d$League[d$League=="Ivory"]<-"Ivory Coast" | |
d$League[d$League=="Bosnia"]<-"Bosnia and Herzegovina" | |
d$League[d$League=="New"]<-"New Zealand" | |
d$League[d$League=="Costa"]<-"Costa Rica" | |
d$League[d$League=="Saudi"]<-"Saudi Arabia" | |
d$isHome<-apply(d[,c("country","League")],1,function(x) (x[1]==x[2])) | |
d$Home<-d$Foreign<-0 | |
d$Home[which(d$isHome==T)]<-1 | |
d$Foreign[which(d$isHome==F)]<-1 | |
#naming the groups | |
A <- data.frame(seq(1, 8, by=1)) | |
B <- data.frame(c("Group A","Group B","Group C","Group D","Group E","Group F","Group G","Group H")) | |
groups <- cbind(A,B) | |
names(groups) <- c("A","B") | |
d$group <- groups$B[match(d$group,groups$A)] | |
write.csv(d, file = paste0(data_path_original, "Master_dataframe.csv")) | |
library(ggplot2) | |
maxage <- aggregate(age ~ country+group, data = d, FUN = max) | |
maxage<-orderdf(maxage,ncol="age") | |
#ggplot(maxage,aes(x=country,y=age,fill=age))+geom_bar() +theme(axis.text.x=element_text(angle=-90)) +labs(title="Age of oldest in each Worldcup team") +facet_wrap(~group,scales="free_x",nrow=2)+theme(legend.position="none")+scale_colour_gradientn(colours=Kol)+scale_fill_gradientn(colours=Kol) + geom_text(aes(country,age,label=age),size=5) | |
#Average age | |
Average <- aggregate(age ~ country+group, data = d, FUN = function(x) round(mean(x),0)) | |
Average<-orderdf(Average,ncol="age") | |
Average_pos <- aggregate(age ~ Pos., data = d, FUN = function(x) round(mean(x),0)) | |
Average_pos<-orderdf(Average_pos,ncol="age") | |
#ggplot(Average,aes(x=country,y=age,fill=age))+geom_bar() +theme(axis.text.x=element_text(angle=-90)) +labs(title="Average age of World cup Teams") +facet_wrap(~group,scales="free_x",nrow=2)+theme(legend.position="none")+scale_colour_gradientn(colours=Kol)+scale_fill_gradientn(colours=Kol) + geom_text(aes(country,age,label=age),size=5) | |
#Yooungest team is netherlands | |
#Box plot | |
#p1<-ggplot(d,aes(x=country,y=age,fill=country))+geom_boxplot() +theme(axis.text.x=element_text(angle=-90)) +labs(title="Boxplot of age of World cup Teams") +facet_wrap(~group,scales="free_x",nrow=2)+theme(legend.position="none")+scale_colour_gradientn(colours=Kol)+ylab("Age(Years)") | |
#Home/foreign based players | |
library(reshape2) | |
df2 <- aggregate(Home ~ country+group, data = d, FUN = sum) | |
df3 <- aggregate(Foreign ~ country+group, data = d, FUN = sum) | |
df2<-merge(df2,df3,by=c("country","group")) | |
df2<-melt(df2, id.vars=c("country","group")) | |
names(df2)[3:4]<-c("Based","Number") | |
df2<-orderdf(df2,ncol="Number") | |
cols<-c("blue","pink") | |
#p2<-ggplot(df2,aes(x=country,y=Number,fill=factor(Based)))+geom_bar(position="dodge") +theme(axis.text.x=element_text(angle=-90)) +labs(title="Home/foreign based players") +facet_wrap(~group,scales="free_x",nrow=2)+scale_fill_manual(values=cols) +ylab("Number of Players")+theme(legend.position="bottom") | |
#countries with more foreign based players | |
#Clubs with most players | |
library(plyr) | |
df_club <- count(d, c('Club')) | |
df_club<-df_club[order(df_club$freq,decreasing=T),] | |
df_club<-df_club[which(df_club$freq>=5),] | |
Pos<-1:nrow(df_club) | |
df_club <- within(df_club,Pos <- factor(Pos,levels=names(sort(table(Pos),decreasing=TRUE)))) | |
aKol<-colorRampPalette(c("red","blue"))(nrow(df_club)) | |
## plot | |
#p3<-ggplot(df_club,aes(x=Pos,y=freq,fill=freq))+geom_bar(stat="identity",binwidth=1) +theme(axis.text.x=element_text(angle=-90)) +labs(title="Clubs with most players in the world cup (>4)")+scale_x_discrete(breaks=df_club$Pos, labels=df_club$Club)+theme(legend.position="none")+scale_fill_gradientn(colours=aKol)+xlab("Club") +ylab("Number")#+geom_text(aes(Pos,df2$freq,label=freq),size=5) | |
#leagues with most Players | |
df_league <- count(d, c('League')) | |
df_league<-df_league[order(df_league$freq,decreasing=T),] | |
df_league<-df_league[which(df_league$freq>=5),] | |
Pos<-1:nrow(df_league) | |
df_league <- within(df_league,Pos <- factor(Pos,levels=names(sort(table(Pos),decreasing=TRUE)))) | |
aKol<-colorRampPalette(c("blue","cadetblue","lightblue","pink","red"))(nrow(df_league)) | |
## plot | |
#p4<-ggplot(df_league,aes(x=Pos,y=freq,fill=freq))+geom_bar(stat="identity",binwidth=1) +theme(axis.text.x=element_text(angle=-90)) +labs(title="Leagues with most players in the world cup (>10)")+scale_x_discrete(breaks=df_league$Pos, labels=df_league$League)+theme(legend.position="none")+scale_fill_gradientn(colours=aKol) +xlab("Country")+ylab("Number") |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(XML) | |
library(lubridate) | |
#data_path = "../../data/" | |
#dates | |
allDates <- c("2014-06-05","2014-05-08","2014-04-10","2014-03-13","2014-02-13","2014-01-16","2013-12-19","2013-11-28","2013-10-17","2013-09-12","2013-08-08","2013-07-04","2013-06-06","2013-05-09","2013-04-11","2013-03-14","2013-02-14","2013-01-17","2012-12-19","2012-11-07","2012-10-03","2012-09-05","2012-08-08","2012-07-04","2012-06-06") | |
# scrape and compose data.frame of FIFA rankings | |
allFIFARankings <- lapply(allDates, function(d){ | |
dd <- strftime(d, "%Y-%m-%d", "GMT") | |
#cat(dd, '\n') | |
u <- paste0("http://en.fifaranking.net/ranking/?d=", dd , "&rnkp=1") | |
tableNodes = getNodeSet(htmlParse(u), "//table") | |
df1 <- readHTMLTable(tableNodes[[1]], | |
colClasses = c("numeric","character","character","character"), | |
stringsAsFactors = FALSE) | |
u <- paste0("http://en.fifaranking.net/ranking/?d=", dd , "&rnkp=2") | |
tableNodes = getNodeSet(htmlParse(u), "//table") | |
df2 <- readHTMLTable(tableNodes[[1]], | |
colClasses = c("numeric","character","character","character"), | |
stringsAsFactors = FALSE) | |
df <- rbind(df1,df2) | |
df$Date <- d | |
df | |
}) | |
historicalFIFARankings <- do.call(rbind, allFIFARankings) | |
historicalFIFARankings <- historicalFIFARankings[,c(1,3,4,10)] | |
names(historicalFIFARankings) <- c("rank","points","country","date") | |
historicalFIFARankings$points <- gsub(pattern= "pts", replacement="", x=historicalFIFARankings$points) | |
historicalFIFARankings$points <- as.numeric(historicalFIFARankings$points) | |
write.csv(historicalFIFARankings, file = paste0(data_path, "FIFA_Rankings.csv")) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment