Skip to content

Instantly share code, notes, and snippets.

@fformenti
Last active August 29, 2015 14:02
Show Gist options
  • Save fformenti/7c14b9b6681e038db860 to your computer and use it in GitHub Desktop.
Save fformenti/7c14b9b6681e038db860 to your computer and use it in GitHub Desktop.
World Cup 2014 Blogpost
We can make this file beautiful and searchable if this error is corrected: No commas found in this CSV file in line 0.
Club;Ranking
Spain Real Madrid;1
Germany Bayern Munich;2
Italy Juventus;3
England Manchester City;4
Spain Barcelona;5
Germany Borussia Dortmund;6
France Paris Saint-Germain;7
Spain Atl?tico Madrid;8
England Liverpool;9
England Chelsea;10
Portugal Benfica;11
England Arsenal;12
Italy Napoli;13
England Manchester United;14
Italy Roma;15
England Everton;16
Ukraine Shakhtar Donetsk;17
Germany Schalke 04;18
Spain Athletic Bilbao;19
Netherlands Ajax;20
Portugal Porto;21
Greece Olympiacos;22
Germany VfL Wolfsburg;23
Czech Sparta;24
England Tottenham Hotspur;25
data_path_original <- '../../data/original/'
d<-readLines(paste0(data_path_original,'cup.txt'))
#d<-readLines("cup.txt")
(itab<-grep(pattern="DoB/Age",x=d))
icoach<-coach<-grep(pattern="Coach",x=d)
ictry<-coach-1
(endtabs<-c(ictry[-length(ictry)]+(diff(ictry))-1,length(d)-1))
(begintabs<-itab+1)
tabs<-vector("list",length(icoach))
hdr<-"";
for(i in 1:length(tabs)){
(ti<-d[begintabs[i]:endtabs[i]])
if(i==1){(hdr<-unlist(strsplit(d[begintabs[i]-1],"\t"))) }
(dat<-d[(itab[i]+1):(itab[i]+23)])
(dat<-data.frame(do.call("rbind",strsplit(dat,"\t"))))
names(dat)<-hdr
#dat<-apply(dat,2,as.character)
(dat$country<-d[ictry[i]])
(dat$group<-ceiling(i/4+0))
(dat$age<-as.numeric(substr(do.call("rbind",strsplit(as.character(dat$"DoB/Age"),"aged"))[,2],1,3)))
#print(dat)
tabs[[i]]<-dat
#scan(n=1)
}
orderdf<-function(df2,ncol=1){
if(!is.numeric(ncol)){ncol=which(names(df2)==ncol)}
df2<-df2[order(df2[,ncol],decreasing=T),]
Pos<-1:nrow(df2)
df2 <- within(df2,Pos <- factor(Pos,levels=names(sort(table(Pos),decreasing=TRUE))))
return(df2)
}
library(fields)
Kol<-colorRampPalette(c("lightblue","orange"))(32)
tKol<-rep("black",32)
d<-do.call("rbind",tabs)
d$League<-unlist(lapply(strsplit(as.character(d$Club), " "),function(x) x[1]))
d$League[d$League=="South"]<-"South Korea"
d$League[d$League=="United"]<-"United States"
d$League[d$League=="Ivory"]<-"Ivory Coast"
d$League[d$League=="Bosnia"]<-"Bosnia and Herzegovina"
d$League[d$League=="New"]<-"New Zealand"
d$League[d$League=="Costa"]<-"Costa Rica"
d$League[d$League=="Saudi"]<-"Saudi Arabia"
d$isHome<-apply(d[,c("country","League")],1,function(x) (x[1]==x[2]))
d$Home<-d$Foreign<-0
d$Home[which(d$isHome==T)]<-1
d$Foreign[which(d$isHome==F)]<-1
#naming the groups
A <- data.frame(seq(1, 8, by=1))
B <- data.frame(c("Group A","Group B","Group C","Group D","Group E","Group F","Group G","Group H"))
groups <- cbind(A,B)
names(groups) <- c("A","B")
d$group <- groups$B[match(d$group,groups$A)]
write.csv(d, file = paste0(data_path_original, "Master_dataframe.csv"))
library(ggplot2)
maxage <- aggregate(age ~ country+group, data = d, FUN = max)
maxage<-orderdf(maxage,ncol="age")
#ggplot(maxage,aes(x=country,y=age,fill=age))+geom_bar() +theme(axis.text.x=element_text(angle=-90)) +labs(title="Age of oldest in each Worldcup team") +facet_wrap(~group,scales="free_x",nrow=2)+theme(legend.position="none")+scale_colour_gradientn(colours=Kol)+scale_fill_gradientn(colours=Kol) + geom_text(aes(country,age,label=age),size=5)
#Average age
Average <- aggregate(age ~ country+group, data = d, FUN = function(x) round(mean(x),0))
Average<-orderdf(Average,ncol="age")
Average_pos <- aggregate(age ~ Pos., data = d, FUN = function(x) round(mean(x),0))
Average_pos<-orderdf(Average_pos,ncol="age")
#ggplot(Average,aes(x=country,y=age,fill=age))+geom_bar() +theme(axis.text.x=element_text(angle=-90)) +labs(title="Average age of World cup Teams") +facet_wrap(~group,scales="free_x",nrow=2)+theme(legend.position="none")+scale_colour_gradientn(colours=Kol)+scale_fill_gradientn(colours=Kol) + geom_text(aes(country,age,label=age),size=5)
#Yooungest team is netherlands
#Box plot
#p1<-ggplot(d,aes(x=country,y=age,fill=country))+geom_boxplot() +theme(axis.text.x=element_text(angle=-90)) +labs(title="Boxplot of age of World cup Teams") +facet_wrap(~group,scales="free_x",nrow=2)+theme(legend.position="none")+scale_colour_gradientn(colours=Kol)+ylab("Age(Years)")
#Home/foreign based players
library(reshape2)
df2 <- aggregate(Home ~ country+group, data = d, FUN = sum)
df3 <- aggregate(Foreign ~ country+group, data = d, FUN = sum)
df2<-merge(df2,df3,by=c("country","group"))
df2<-melt(df2, id.vars=c("country","group"))
names(df2)[3:4]<-c("Based","Number")
df2<-orderdf(df2,ncol="Number")
cols<-c("blue","pink")
#p2<-ggplot(df2,aes(x=country,y=Number,fill=factor(Based)))+geom_bar(position="dodge") +theme(axis.text.x=element_text(angle=-90)) +labs(title="Home/foreign based players") +facet_wrap(~group,scales="free_x",nrow=2)+scale_fill_manual(values=cols) +ylab("Number of Players")+theme(legend.position="bottom")
#countries with more foreign based players
#Clubs with most players
library(plyr)
df_club <- count(d, c('Club'))
df_club<-df_club[order(df_club$freq,decreasing=T),]
df_club<-df_club[which(df_club$freq>=5),]
Pos<-1:nrow(df_club)
df_club <- within(df_club,Pos <- factor(Pos,levels=names(sort(table(Pos),decreasing=TRUE))))
aKol<-colorRampPalette(c("red","blue"))(nrow(df_club))
## plot
#p3<-ggplot(df_club,aes(x=Pos,y=freq,fill=freq))+geom_bar(stat="identity",binwidth=1) +theme(axis.text.x=element_text(angle=-90)) +labs(title="Clubs with most players in the world cup (>4)")+scale_x_discrete(breaks=df_club$Pos, labels=df_club$Club)+theme(legend.position="none")+scale_fill_gradientn(colours=aKol)+xlab("Club") +ylab("Number")#+geom_text(aes(Pos,df2$freq,label=freq),size=5)
#leagues with most Players
df_league <- count(d, c('League'))
df_league<-df_league[order(df_league$freq,decreasing=T),]
df_league<-df_league[which(df_league$freq>=5),]
Pos<-1:nrow(df_league)
df_league <- within(df_league,Pos <- factor(Pos,levels=names(sort(table(Pos),decreasing=TRUE))))
aKol<-colorRampPalette(c("blue","cadetblue","lightblue","pink","red"))(nrow(df_league))
## plot
#p4<-ggplot(df_league,aes(x=Pos,y=freq,fill=freq))+geom_bar(stat="identity",binwidth=1) +theme(axis.text.x=element_text(angle=-90)) +labs(title="Leagues with most players in the world cup (>10)")+scale_x_discrete(breaks=df_league$Pos, labels=df_league$League)+theme(legend.position="none")+scale_fill_gradientn(colours=aKol) +xlab("Country")+ylab("Number")
library(XML)
library(lubridate)
#data_path = "../../data/"
#dates
allDates <- c("2014-06-05","2014-05-08","2014-04-10","2014-03-13","2014-02-13","2014-01-16","2013-12-19","2013-11-28","2013-10-17","2013-09-12","2013-08-08","2013-07-04","2013-06-06","2013-05-09","2013-04-11","2013-03-14","2013-02-14","2013-01-17","2012-12-19","2012-11-07","2012-10-03","2012-09-05","2012-08-08","2012-07-04","2012-06-06")
# scrape and compose data.frame of FIFA rankings
allFIFARankings <- lapply(allDates, function(d){
dd <- strftime(d, "%Y-%m-%d", "GMT")
#cat(dd, '\n')
u <- paste0("http://en.fifaranking.net/ranking/?d=", dd , "&rnkp=1")
tableNodes = getNodeSet(htmlParse(u), "//table")
df1 <- readHTMLTable(tableNodes[[1]],
colClasses = c("numeric","character","character","character"),
stringsAsFactors = FALSE)
u <- paste0("http://en.fifaranking.net/ranking/?d=", dd , "&rnkp=2")
tableNodes = getNodeSet(htmlParse(u), "//table")
df2 <- readHTMLTable(tableNodes[[1]],
colClasses = c("numeric","character","character","character"),
stringsAsFactors = FALSE)
df <- rbind(df1,df2)
df$Date <- d
df
})
historicalFIFARankings <- do.call(rbind, allFIFARankings)
historicalFIFARankings <- historicalFIFARankings[,c(1,3,4,10)]
names(historicalFIFARankings) <- c("rank","points","country","date")
historicalFIFARankings$points <- gsub(pattern= "pts", replacement="", x=historicalFIFARankings$points)
historicalFIFARankings$points <- as.numeric(historicalFIFARankings$points)
write.csv(historicalFIFARankings, file = paste0(data_path, "FIFA_Rankings.csv"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment