-
-
Save kivanio/f82dab7356f6f3968657da31955498b5 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(dplyr) | |
library(ggplot2) | |
library(httr) | |
# Work from the author's project folder when it exists. On any other machine
# stay in the current working directory instead of aborting on setwd().
project_dir = "C:/Dropbox/Projects/20160206_Soccer_Scores"
if (dir.exists(project_dir)) {
  setwd(project_dir)
}
# Announce that the cached results file is missing; the download step that
# follows performs the same check and rebuilds it.
if (!file.exists("20160206_Soccer_Scores.csv.gz")) {
  cat("must reread\n")
}
# Download the season result tables and cache them as a gzipped CSV.
# Skipped entirely when the cache file is already present.
if (!file.exists("20160206_Soccer_Scores.csv.gz")) {
  cat("Reading data from server\n")
  STARTYEAR = 1993
  STOPYEAR = 2015 #Stop at 2015 to get 2015-2016 data

  # Season codes are the last two digits of the start year followed by the
  # last two digits of the next year, e.g. 1993 -> "9394".
  years = STARTYEAR:STOPYEAR
  yearstrings = paste0(
    substr(as.character(years), start = 3, stop = 4),
    substr(as.character(years + 1), start = 3, stop = 4)
  )
  confstrings = c("E0", "E1", "E2", "E3", "EC")

  # One slot per season/division pair; nread counts the slots actually filled.
  reslist = vector(mode = "list",
                   length = length(yearstrings) * length(confstrings))
  nread = 0
  for (yearstring in yearstrings) {
    for (confstring in confstrings) {
      aurl = paste0(
        "http://www.football-data.co.uk/mmz4281/",
        yearstring,
        "/",
        confstring,
        ".csv"
      )
      if (http_status(GET(aurl))$category == "Success") {
        nread = nread + 1
        reslist[[nread]] = read.csv(url(aurl))
      } else {
        cat("Failed: ", aurl, "\n")
      }
    }
  }

  # Keep the first six columns of every table that was actually downloaded.
  # Iterating over nread (not the number of URLs tried) means a failed
  # download no longer leaves a NULL slot that breaks the column subsetting.
  reslist2 = lapply(reslist[seq_len(nread)], function(tab) tab[, 1:6])
  df = do.call(rbind, reslist2)
  # Drop rows that carry no full-time home goals value.
  df = subset(df, !is.na(FTHG))
  write.csv(df,
            file = gzfile("20160206_Soccer_Scores.csv.gz"),
            row.names = FALSE)
}
# Build the "score group" of a scoreline: the scoreline itself plus every
# scoreline obtained by moving exactly one of the two goal counts by one.
#   low  - the smaller goal count
#   high - the larger goal count; must not be below low (stops otherwise)
# Returns an ungrouped table with one row per distinct neighbor and two
# character columns: neighbor_group ("high-low") and neighbor ("b-a").
get_neighbors = function(low, high) {
  if (high < low)
    stop("high < low in get_neighbors")
  # Vary the low count with high fixed, then the high count with low fixed.
  # The unchanged scoreline shows up in both grids.
  candidates = rbind(
    expand.grid(a = (low - 1):(low + 1), b = high),
    expand.grid(a = low, b = (high - 1):(high + 1))
  )
  candidates %>%
    # no negative goal counts, and keep only pairs still ordered low <= high
    filter(a >= 0 &
             b >= 0 &
             b >= a) %>%
    mutate(neighbor = paste(b, a, sep = "-"),
           neighbor_group = paste(high, low, sep = "-")) %>%
    # an empty summarise() collapses the duplicated rows to one per neighbor
    group_by(neighbor_group, neighbor) %>%
    summarise() %>%
    # hand back a plain table so callers can bind and regroup freely
    ungroup()
}
#create the neighbor data frame ndf
# One score group for every (low, high) pair with 0 <= low <= high <= 11.
# expand.grid varies its first column fastest, so the pairs come out with
# low as the outer index and high as the inner one.
score_pairs = expand.grid(high = 0:11, low = 0:11)
score_pairs = score_pairs[score_pairs$high >= score_pairs$low, ]
reslist = Map(get_neighbors, score_pairs$low, score_pairs$high)
ndf = do.call('rbind', reslist)
# Number of neighbors in each score group, largest groups first.
ndf_summ = ndf %>%
  group_by(neighbor_group) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  ungroup()
# Load the cached results and add, per match, the smaller goal count (Low),
# the larger goal count (High) and their sum (Total). FTHG and FTAG are the
# two goal columns of the cached table.
df = read.csv("20160206_Soccer_Scores.csv.gz") %>%
  mutate(
    Low = pmin(FTHG, FTAG),
    High = pmax(FTHG, FTAG),
    Total = FTHG + FTAG
  )
#Total points scored hist
# Histogram of combined goals per match, saved as Total_Points_Scored.png.
# The match count shown in the title is computed from the data rather than
# typed in, so it stays correct when the cached results change.
n_matches = format(sum(!is.na(df$Total)), big.mark = ",")
# NOTE(review): stat = "bin" together with binwidth on geom_bar() looks like
# the pre-2.0 ggplot2 spelling -- confirm it still renders as intended with
# the installed ggplot2 (geom_histogram() is the likely replacement).
p = ggplot(df, aes(x = Total)) +
  geom_bar(
    stat = "bin",
    binwidth = 1,
    fill = "green",
    color = "black",
    width = 2,
    alpha = .2
  ) +
  # 14 breaks on the half-integers, labelled "", 0, 1, ..., 12
  scale_x_continuous(breaks = seq(-.5, 12.5, by = 1),
                     labels = c("", 0:12)) +
  coord_cartesian(xlim = c(0.1, 12)) +
  labs(
    x = "Total Goals",
    y = "Matches",
    title = paste0("Distribution of Total Goals / Match \n(N = ",
                   n_matches, " Matches)")
  ) +
  theme(panel.grid = element_blank())
p
# filename is spelled out; the original relied on partial matching of `file`.
ggsave(
  plot = p,
  filename = "Total_Points_Scored.png",
  width = 4,
  height = 4
)
# Lookup of five (Low, High) pairs flagged with Match = TRUE: 1-1, 2-1, 3-1,
# 2-0 and 2-2 written as High-Low. The pair (1, 2) occurs in both grids, so
# distinct() removes the repeat instead of dropping a row by position.
matcher = rbind(expand.grid(Low = 1, High = 1:3),
                expand.grid(Low = 0:2, High = 2)) %>%
  distinct() %>%
  mutate(Match = TRUE)
# Join on the two goal columns explicitly; unmatched matches get Match = NA.
# Score is the final result written "High-Low".
df = left_join(df, matcher, by = c("Low", "High")) %>%
  mutate(Score = paste(High, Low, sep = "-"))
# Per final score: number of matches, total goals, share of all matches, and
# the cumulative share when scores are ordered from most to least common.
df_summ = df %>%
  group_by(Score) %>%
  summarise(Count = n(),
            Goals = Low[1] + High[1]) %>%
  ungroup() %>%
  arrange(desc(Count)) %>%
  mutate(CumProp = cumsum(Count) / sum(Count),
         Prop = Count / sum(Count))
# Share of matches per total-goals value, plus the running total.
df_summ2 = df_summ %>%
  group_by(Goals) %>%
  summarise(Proportion = sum(Prop)) %>%
  ungroup() %>%
  mutate(TotalProp = cumsum(Proportion))
# Horizontal bar chart of the most common final scores, saved as
# Most_Common_Scores.png. Only scores whose cumulative share (most common
# first) is still below 95% are drawn.
common_scores = df_summ %>% filter(CumProp < .95)
p = ggplot(common_scores, aes(y = 100 * Prop, x = reorder(Score, Prop))) +
  geom_bar(
    stat = 'identity',
    fill = "green",
    color = "black",
    alpha = .2
  ) +
  coord_flip() +
  labs(x = "Score", y = "Percentage of Games", title = "Most Common Final Scores") +
  theme(legend.position = "bottom") +
  theme(panel.grid.major = element_blank())
p
# filename is spelled out; the original relied on partial matching of `file`.
ggsave(
  plot = p,
  filename = "Most_Common_Scores.png",
  width = 4,
  height = 4
)
###NOW JUST JOIN df_summ and NDF by x=Score y=neighbor
#Games within 1 point hist
# For every score group, add up the matches (and share of matches) of all
# scores in the group, then keep the groups covering more than 10% of matches.
plot_data = left_join(ndf, df_summ, by = c("neighbor" = "Score")) %>%
  select(-CumProp) %>%
  group_by(neighbor_group) %>%
  summarise(
    # neighbors that never occurred have NA counts after the join
    group_count = sum(Count, na.rm = TRUE),
    group_prop = sum(Prop, na.rm = TRUE),
    obs = n()
  ) %>%
  arrange(desc(group_count)) %>%
  ungroup() %>%
  filter(group_count > 0) %>%
  filter(group_prop > .1)
# Freeze the current row order (largest group first) as the factor order so
# the bars are drawn in that order.
plot_data$score = factor(plot_data$neighbor_group,
                         levels = as.character(plot_data$neighbor_group))
p = ggplot(plot_data, aes(x = score, y = 100 * group_prop)) +
  geom_bar(
    stat = "identity",
    fill = "green",
    color = "black",
    alpha = .2
  ) +
  labs(x = "Score", y = "% of Matches Within 1 Point of Score",
       title = "% of Matches Within 1 Point \nof Various Scores") +
  scale_y_continuous(breaks = seq(0, 55, by = 5)) +
  theme(panel.grid.major = element_blank())
p
# filename is spelled out; the original relied on partial matching of `file`.
ggsave(
  plot = p,
  filename = "Within_One_Point.png",
  width = 4,
  height = 4
)
#show all the games within 1 of a given score
# One facet per selected score group, each showing how many matches ended
# with every score belonging to that group. Saved as Breakdown.png.
focus_groups = c("2-1", "2-0", "1-0", "1-1")
plot_data = left_join(ndf, df_summ, by = c("neighbor" = "Score"))
plot_data = subset(plot_data, neighbor_group %in% focus_groups)
# The factor levels fix the top-to-bottom facet order.
plot_data$neighbor_group = factor(plot_data$neighbor_group, levels = focus_groups)
p = ggplot(plot_data, aes(x = neighbor, y = Count)) +
  geom_bar(
    stat = "identity",
    fill = "green",
    color = "black",
    alpha = .2
  ) +
  labs(x = "Scores in Score Group", y = "Number of Matches", title =
         "Frequency of Scores Within Score Groups") +
  facet_grid(neighbor_group ~ .)
p
# filename is spelled out; the original relied on partial matching of `file`.
ggsave(
  plot = p,
  filename = "Breakdown.png",
  width = 4,
  height = 4
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment