Created
August 22, 2016 00:23
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
setwd("~/Desktop/web scraping") | |
library(dplyr) | |
library(dygraphs) | |
library(reshape2) | |
library(ggplot2) | |
library(plotly) | |
library(car) | |
library(TSA) | |
library(forecast) | |
library(xts) | |
library(Rmisc) | |
library(NbClust) | |
library(cluster) | |
library(factoextra) | |
library(ggrepel) | |
library(corrplot) | |
library(PerformanceAnalytics) | |
output=read.csv('output.txt', header = T, stringsAsFactors = F) | |
output2012=filter(output, year==2012) | |
#summary(output) | |
#sapply(output[,c('Gold', 'Silver', 'Bronze', 'Total')], sd) | |
# pick the top countries by gold and total | |
output_bycountry=output%>% | |
dplyr::group_by(Country)%>% | |
summarise(totoalgold=sum(Gold), avggold=mean(Gold))%>% | |
arrange(desc(avggold)) | |
head(output_bycountry,20) | |
# Country totoalgold avggold | |
# <chr> <int> <dbl> | |
# 1 Unified Team 45 45.000000 | |
# 2 Soviet Union 395 43.888889 | |
# 3 United States 990 36.666667 (1) | |
# 4 East Germany 153 30.600000 | |
# 5 China 201 25.125000 (2) | |
# 6 Russia 134 19.142857 (3) | |
# 7 Germany 213 11.210526 (4) | |
# 8 West Germany 56 11.200000 | |
# 9 Great Britain 245 8.750000 (5) | |
# 10 Italy 211 8.115385 (6) | |
# 11 France 222 7.928571 (7) | |
# 12 Ukraine 33 6.600000 | |
# 13 Hungary 170 6.538462 (8) | |
# 14 Japan 130 6.500000 (9) | |
# 15 Sweden 144 5.538462 (10) | |
# 16 South Korea 81 5.400000 (11) | |
# 17 Australia 138 5.307692 (12) | |
# 18 Cuba 71 5.071429 | |
# 19 Romania 88 4.888889 | |
# 20 Finland 106 4.240000 | |
output_bycountry2=output%>% | |
dplyr::group_by(Country)%>% | |
summarise(totoalmetal=sum(Total), avgmetal=mean(Total))%>% | |
arrange(desc(avgmetal)) | |
head(output_bycountry2,20) | |
# Country totoalmetal avgmetal | |
# <chr> <int> <dbl> | |
# 1 Soviet Union 1010 112.22222 | |
# 2 Unified Team 112 112.00000 | |
# 3 United States 2425 89.81481 (1) | |
# 4 East Germany 409 81.80000 | |
# 5 China 473 59.12500 (2) | |
# 6 Russia 405 57.85714 (3) | |
# 7 West Germany 204 40.80000 | |
# 8 Germany 728 38.31579 (4) | |
# 9 Great Britain 811 28.96429 (5) | |
# 10 France 727 25.96429 (6) | |
# 11 Ukraine 115 23.00000 | |
# 12 Italy 577 22.19231 (7) | |
# 13 Japan 400 20.00000 (8) | |
# 14 Sweden 498 19.15385 (9) | |
# 15 Hungary 490 18.84615 (10) | |
# 16 Australia 475 18.26923 (11) | |
# 17 Romania 301 16.72222 | |
# 18 South Korea 243 16.20000 (12) | |
# 19 Belarus 75 15.00000 | |
# 20 Cuba 202 14.42857 | |
# pick the top countries | |
top10=dplyr::filter(output, Country %in% c('United States','China', 'Great Britain', | |
'Russia', 'South Korea', 'Germany', | |
'France', 'Italy', 'Hungary', | |
'Japan', 'Australia', 'Brazil', 'Sweden')) | |
# plot dygraphs | |
df2=dcast(top10[,c('Country', 'Total', 'year')], year~ Country,value.var="Total") | |
color=c('#dddfd4', '#fae596', '#3fb0ac', '#173e43', '#98dafc', '#e62739', '#9068be', | |
'#dbc3d0', '#6a5750', '#e05915', '#bccbde', '#300032', '#6534ff') | |
dygraph(df2) %>% | |
dyOptions(colors = color) %>% | |
dyRangeSelector() | |
# ggplot of boxplot | |
ggplot(data=top10, aes(x=reorder(Country,Total), y=Total, color=Country))+ | |
geom_boxplot()+ | |
xlab('Country')+ | |
ylab('Number of total medals')+ | |
ggtitle('Medal distribution by top leading countries of the 28 summer Olympics')+ | |
theme_bw()+ | |
theme(legend.position='none', axis.text.x = element_text(angle = 45, hjust = 1)) | |
# population | |
population=read.csv('population_both_gender.csv', header = T, stringsAsFactors = F) | |
summary(population) | |
population1_2012=population %>% | |
dplyr::filter(Place %in% output2012$Country)%>% | |
dplyr::select(Country=Place, Population=X2012) | |
population2_2012=population %>% | |
dplyr::filter(Place %in% c('China, Hong Kong SAR', | |
'United Kingdom', | |
'Iran (Islamic Republic of)', | |
'Republic of Moldova', | |
"Dem. People's Republic of Korea", | |
'Russian Federation', | |
'Republic of Korea', | |
'United States of America', | |
'Venezuela (Bolivarian Republic of)')) %>% | |
dplyr::select(Country=Place, Population=X2012) | |
population2_2012$Country=c('Hong Kong','North Korea','South Korea', | |
'Iran', 'Moldova', | |
'Russia', 'Great Britain', | |
'Venezuela', 'United States') | |
population_2012=rbind(population1_2012, population2_2012) | |
taibei=cbind(Country=c('Chinese Taipei'), Population=as.numeric(23234.940)) | |
population_2012=rbind(population_2012, taibei) | |
output2012_bypopulation=merge(x=output2012, y=population_2012, by='Country', all.x=T) | |
#GDP | |
GDP=read.csv('GDP.csv', header = T, stringsAsFactors = F) | |
GDP1_2012=GDP %>% | |
dplyr::filter(Country.Name %in% output2012$Country)%>% | |
dplyr::select(Country=Country.Name, GDP=X2012) | |
merge(x=output2012, y=GDP1_2012, by='Country', all.x=T) | |
addition=c('Bahamas, The', 'Egypt, Arab Rep.', 'United Kingdom', 'Hong Kong SAR, China', | |
'Iran, Islamic Rep.', 'Russian Federation', | |
'Slovak Republic', 'Korea, Rep.', 'Venezuela, RB') | |
GDP2_2012=GDP %>% | |
dplyr::filter(Country.Name %in% addition)%>% | |
dplyr::select(Country=Country.Name, GDP=X2012) | |
GDP2_2012$Country=c('Bahamas','Egypt','Great Britain', 'Hong Kong', | |
'Iran', 'South Korea', 'Russia', 'Slovakia', 'Venezuela') | |
GDP_2012=rbind(GDP1_2012, GDP2_2012) | |
taipei=cbind(Country=c('Chinese Taipei', 'North Korea'), GDP=as.numeric(42201.09, 1800)) | |
GDP_2012=rbind(GDP_2012, taipei) | |
output2012_bypopulationGDP=merge(x=output2012_bypopulation, y=GDP_2012, by='Country', all.x=T) | |
output2012_bypopulationGDP['Population']=as.numeric(output2012_bypopulationGDP$Population) | |
output2012_bypopulationGDP['GDP']=as.numeric(output2012_bypopulationGDP$GDP) | |
#GDP growth | |
GDPgrowth=read.csv('GDPgrowth.csv', header = T, stringsAsFactors = F) | |
GDPgrowth1_2012=GDPgrowth %>% | |
dplyr::filter(Country.Name %in% output2012$Country)%>% | |
dplyr::select(Country=Country.Name, GDPgrowth=X2012) | |
addition=c('Bahamas, The', 'Egypt, Arab Rep.', 'United Kingdom', 'Hong Kong SAR, China', | |
'Iran, Islamic Rep.', 'Russian Federation', | |
'Slovak Republic', 'Korea, Rep.', 'Venezuela, RB') | |
GDPgrowth2_2012=GDPgrowth %>% | |
dplyr::filter(Country.Name %in% addition)%>% | |
dplyr::select(Country=Country.Name, GDPgrowth=X2012) | |
GDPgrowth2_2012$Country=c('Bahamas','Egypt','Great Britain', 'Hong Kong', | |
'Iran','South Korea', 'Russia', 'Slovakia', 'Venezuela') | |
GDPgrowth_2012=rbind(GDPgrowth1_2012, GDPgrowth2_2012) | |
taipei1=cbind(Country=c('Chinese Taipei', 'North Korea'), GDPgrowth=as.numeric(2.06, 1.3)) | |
GDPgrowth_2012=rbind(GDPgrowth_2012, taipei1) | |
output2012_bypopulationGDP1=merge(x=output2012_bypopulationGDP, y=GDPgrowth_2012, by='Country', all.x=T) | |
output2012_bypopulationGDP1$GDPgrowth=as.numeric(output2012_bypopulationGDP1$GDPgrowth) | |
#life expectency | |
life=read.csv('life.csv', header = T, stringsAsFactors = F) | |
trim.leading <- function (x) sub("^\\s+", "", x) | |
life$Country=trim.leading(life$Country) | |
life_2012=life %>% | |
dplyr::filter(Country %in% output2012$Country)%>% | |
dplyr::select(Country, Life) | |
merge(x=output2012_bypopulationGDP1,y=life_2012, by='Country', all.x=T)%>% | |
dplyr::select(Country, Life) | |
addition=c('The Bahamas', 'United Kingdom') | |
life1_2012=life %>% | |
dplyr::filter(Country %in% addition)%>% | |
dplyr::select(Country, Life) | |
life1_2012$Country=c('Great Britain', 'Bahamas') | |
Taibei=cbind(Country=c('Chinese Taipei', 'Montenegro'), | |
Life=as.numeric(79.5, 74.65)) | |
life2_2012=rbind(life_2012,life1_2012,Taibei) | |
output2012_bypopulationGDP2=merge(x=output2012_bypopulationGDP1, | |
y=life2_2012, by='Country', all.x=T) | |
output2012_bypopulationGDP2$Life=as.numeric(output2012_bypopulationGDP2$Life) | |
summary(output2012_bypopulationGDP2) | |
#kmeans model | |
df=dplyr::select(output2012_bypopulationGDP2,Country,Total, Population,GDP, GDPgrowth, Life) | |
chart.Correlation(df[,-1], histogram=TRUE, pch=19) | |
df$Population=df$Population/1000 | |
df$GDP=df$GDP/1000 | |
ddf=scale(df[,-1]) | |
#d = dist(as.data.frame(scale(df[,-1]))) | |
#fit.average = hclust(d, method = "complete") | |
# par(mfrow = c(1, 1)) | |
# plot(fit.average, hang = -1, main = "Dendrogram of Average Linkage\n5 Clusters") | |
# rect.hclust(fit.average, k = 5) | |
set.seed(1234) | |
fviz_nbclust(ddf, kmeans, method = "silhouette") | |
# nc <- NbClust(ddf, min.nc=2, max.nc=15, method="kmeans") | |
# table(nc$Best.n[1,]) | |
# barplot(table(nc$Best.n[1,]), | |
# xlab="Numer of Clusters", ylab="Number of Criteria") | |
set.seed(1234) | |
fit.km <- kmeans(ddf, 5, nstart=25) | |
fviz_cluster(fit.km, data = ddf, geom = "point", | |
stand = FALSE, frame.type = "norm") | |
# fit.km$size | |
# fit.km$centers | |
# aggregate(df[,-1], by=list(cluster=fit.km$cluster), mean) | |
df$cluster <- as.factor(fit.km$cluster) | |
df1=df%>% | |
dplyr::filter(cluster==1)%>% | |
arrange(desc(Total)) | |
df2=df%>% | |
dplyr::filter(cluster==2)%>% | |
arrange(desc(Total)) | |
df3=df%>% | |
dplyr::filter(cluster==3)%>% | |
arrange(desc(Total)) | |
df4=df%>% | |
dplyr::filter(cluster==4)%>% | |
arrange(desc(Total)) | |
df5=df%>% | |
dplyr::filter(cluster==5)%>% | |
arrange(desc(Total)) | |
df8=rbind(df1, head(df4,5), head(df3, 5), | |
head(df2, 5), | |
df5) | |
g1=ggplot(df8, aes(Total,Population, color =cluster)) + | |
geom_point()+ | |
geom_text_repel(aes(label=Country))+ | |
theme_classic()+ | |
theme_bw() | |
g2=ggplot(df8, aes(Total,GDP, color =cluster)) + | |
geom_point()+ | |
geom_text_repel(aes(label=Country))+ | |
theme_classic()+ | |
theme_bw() | |
g7=ggplot(df8, aes(Total,GDPgrowth, color =cluster)) + | |
geom_point()+ | |
geom_text_repel(aes(label=Country))+ | |
theme_classic()+ | |
theme_bw() | |
g9=ggplot(df8, aes(Total,Life, color =cluster)) + | |
geom_point()+ | |
geom_text_repel(aes(label=Country))+ | |
theme_classic()+ | |
theme_bw() | |
multiplot(g1,g2, g7, g9,cols = 2) | |
g3=ggplot(df, aes(x=reorder(cluster, Total), y=Total, color=cluster))+ | |
geom_point()+ | |
geom_boxplot()+ | |
xlab('Cluster')+ | |
theme_bw() | |
g4=ggplot(df, aes(x=reorder(cluster, Population), y=Population, color=cluster))+ | |
geom_point()+ | |
geom_boxplot()+ | |
xlab('Cluster')+ | |
theme_bw() | |
g5=ggplot(df, aes(x=reorder(cluster, GDP), y=GDP, color=cluster))+ | |
geom_point()+ | |
geom_boxplot()+ | |
xlab('Cluster')+ | |
theme_bw() | |
g8=ggplot(df, aes(x=reorder(cluster, GDPgrowth), y=GDPgrowth, color=cluster))+ | |
geom_point()+ | |
geom_boxplot()+ | |
xlab('Cluster')+ | |
theme_bw() | |
g10=ggplot(df, aes(x=reorder(cluster, Life), y=Life, color=cluster))+ | |
geom_point()+ | |
geom_boxplot()+ | |
xlab('Cluster')+ | |
theme_bw() | |
multiplot(g3,g4,g5,g8,g10, cols=2) | |
# 2008 | |
output=read.csv('output.txt', header = T, stringsAsFactors = F) | |
output2008=filter(output, year==2008) | |
# population | |
population=read.csv('population_both_gender.csv', header = T, stringsAsFactors = F) | |
population1_2008=population %>% | |
dplyr::filter(Place %in% output2008$Country)%>% | |
select(Country=Place, Population=X2008) | |
population2_2008=population %>% | |
dplyr::filter(Place %in% c('China, Hong Kong SAR', | |
'United Kingdom', | |
'Iran (Islamic Republic of)', | |
'Republic of Moldova', | |
"Dem. People's Republic of Korea", | |
'Russian Federation', | |
'Republic of Korea', | |
'United States of America', | |
'Venezuela (Bolivarian Republic of)')) %>% | |
select(Country=Place, Population=X2008) | |
population2_2008$Country=c('Hong Kong','North Korea','South Korea', | |
'Iran', 'Moldova', | |
'Russia', 'Great Britain', | |
'Venezuela', 'United States') | |
population_2008=rbind(population1_2008, population2_2008) | |
taibei=cbind(Country=c('Chinese Taipei', 'Vietnam'), Population=as.numeric(23040, 85120)) | |
population_2008=rbind(population_2008, taibei) | |
output2008_bypopulation=merge(x=output2008, y=population_2008, by='Country', all.x=T) | |
#GDP | |
GDP=read.csv('GDP.csv', header = T, stringsAsFactors = F) | |
GDP1_2008=GDP %>% | |
dplyr::filter(Country.Name %in% output2008$Country)%>% | |
select(Country=Country.Name, GDP=X2008) | |
addition=c('Bahamas, The', 'Egypt, Arab Rep.', 'United Kingdom', 'Hong Kong SAR, China', | |
'Iran, Islamic Rep.', | |
'Venezuela, RB') | |
GDP2_2008=GDP %>% | |
dplyr::filter(Country.Name %in% addition)%>% | |
select(Country=Country.Name, GDP=X2008) | |
GDP2_2008$Country=c('Bahamas','Egypt','Great Britain', 'Hong Kong', | |
'Iran', 'Venezuela') | |
GDP_2008=rbind(GDP1_2008, GDP2_2008) | |
taipei=cbind(Country=c('Chinese Taipei', 'Kyrgyzstan', 'Russia', 'Slovakia', 'South Korea', | |
'North Korea'), | |
GDP=as.numeric(31900, 966.39, 11699.68, 18558.88, 20474.83, 1800)) | |
GDP_2008=rbind(GDP_2008, taipei) | |
output2008_bypopulationGDP=merge(x=output2008_bypopulation, y=GDP_2008, by='Country', all.x=T) | |
output2008_bypopulationGDP['Population']=as.numeric(output2008_bypopulationGDP$Population) | |
output2008_bypopulationGDP['GDP']=as.numeric(output2008_bypopulationGDP$GDP) | |
#GDP growth | |
GDPgrowth=read.csv('GDPgrowth.csv', header = T, stringsAsFactors = F) | |
GDPgrowth1_2008=GDPgrowth %>% | |
dplyr::filter(Country.Name %in% output2008$Country)%>% | |
select(Country=Country.Name, GDPgrowth=X2008) | |
addition=c('Bahamas, The', 'Egypt, Arab Rep.', 'United Kingdom', 'Hong Kong SAR, China', | |
'Iran, Islamic Rep.', | |
'Venezuela, RB') | |
GDPgrowth2_2008=GDPgrowth %>% | |
dplyr::filter(Country.Name %in% addition)%>% | |
select(Country=Country.Name, GDPgrowth=X2008) | |
GDPgrowth2_2008$Country=c('Bahamas','Egypt','Great Britain', 'Hong Kong', | |
'Iran', 'Venezuela') | |
GDPgrowth_2008=rbind(GDPgrowth1_2008, GDPgrowth2_2008) | |
taipei1=cbind(Country=c('Chinese Taipei', 'Russia', 'Kyrgyzstan','Slovakia', 'South Korea', 'North Korea'), | |
GDPgrowth=as.numeric(0.7, 8.4, 5.2, 5.4, 2.8, 3.1)) | |
GDPgrowth_2008=rbind(GDPgrowth_2008, taipei1) | |
output2008_bypopulationGDP1=merge(x=output2008_bypopulationGDP, y=GDPgrowth_2008, by='Country', all.x=T) | |
output2008_bypopulationGDP1$GDPgrowth=as.numeric(output2008_bypopulationGDP1$GDPgrowth) | |
#life expectency | |
life=read.csv('life1.csv', header = T, stringsAsFactors = F) | |
trim.leading <- function (x) sub("^\\s+", "", x) | |
life$Country=trim.leading(life$Country) | |
life_2008=life %>% | |
dplyr::filter(Country %in% output2008$Country)%>% | |
select(Country, Life) | |
merge(x=output2008_bypopulationGDP1,y=life_2008, by='Country', all.x=T)%>% | |
select(Country, Life) | |
addition=c('United Kingdom') | |
life1_2008=life %>% | |
dplyr::filter(Country %in% addition)%>% | |
select(Country, Life) | |
life1_2008$Country=c('Great Britain') | |
Taibei=cbind(Country=c('Chinese Taipei', 'Bahamas', 'Netherlands', | |
'North Korea', 'South Korea'), | |
Life=as.numeric(77.76, 74.25, 80.25, 68.49, 79.83)) | |
life2_2008=rbind(life_2008,life1_2008,Taibei) | |
output2008_bypopulationGDP2=merge(x=output2008_bypopulationGDP1, | |
y=life2_2008, by='Country', all.x=T) | |
output2008_bypopulationGDP2$Life=as.numeric(output2008_bypopulationGDP2$Life) | |
#summary(output2008_bypopulationGDP2) | |
df=select(output2008_bypopulationGDP2,Country, Total, Population,GDP, GDPgrowth, Life) | |
chart.Correlation(df[,-1], histogram=TRUE, pch=19) | |
# kmeans model | |
df$Population=df$Population/1000 | |
df$GDP=df$GDP/1000 | |
df=df[!df$Country=='Zimbabwe',] | |
ddf=scale(df[,-1]) | |
set.seed(1234) | |
fviz_nbclust(ddf, kmeans, method = "silhouette") | |
set.seed(1234) | |
fit.km <- kmeans(ddf, 5, nstart=25) | |
fviz_cluster(fit.km, data = ddf, geom = "point", | |
stand = FALSE, frame.type = "norm") | |
df$cluster <- as.factor(fit.km$cluster) | |
ddf6=merge(x=df, y=select(output2008_bypopulationGDP2[!output2008_bypopulationGDP2$Country=='Zimbabwe',], | |
Country, Total), by='Country', | |
all = T) | |
g3=ggplot(data=ddf6, aes(x=reorder(cluster, Total), y=Total, color=cluster))+ | |
geom_boxplot()+ | |
xlab('cluster')+ | |
ylab('total medals of each country')+ | |
ggtitle('cluster vs total medals')+ | |
theme_bw()+ | |
theme(legend.position='none') | |
g4=ggplot(ddf6, aes(x=reorder(cluster, Population), y=Population, color=cluster))+ | |
geom_point()+ | |
geom_boxplot()+ | |
xlab('Cluster')+ | |
theme_bw()+ | |
theme(legend.position='none') | |
g5=ggplot(ddf6, aes(x=reorder(cluster, GDP), y=GDP, color=cluster))+ | |
geom_point()+ | |
geom_boxplot()+ | |
xlab('Cluster')+ | |
theme_bw()+ | |
theme(legend.position='none') | |
g8=ggplot(ddf6, aes(x=reorder(cluster, GDPgrowth), y=GDPgrowth, color=cluster))+ | |
geom_point()+ | |
geom_boxplot()+ | |
xlab('Cluster')+ | |
theme_bw()+ | |
theme(legend.position='none') | |
g10=ggplot(ddf6, aes(x=reorder(cluster, Life), y=Life, color=cluster))+ | |
geom_point()+ | |
geom_boxplot()+ | |
xlab('Cluster')+ | |
theme_bw()+ | |
theme(legend.position='none') | |
multiplot(g3, g4,g5,g8,g10, cols=2) | |
ddf8=filter(df, Country %in% df8$Country) | |
g1=ggplot(ddf8, aes(Total,Population, color =cluster)) + | |
geom_point()+ | |
geom_text_repel(aes(label=Country))+ | |
theme_classic()+ | |
theme_bw() | |
g2=ggplot(ddf8, aes(Total,GDP, color =cluster)) + | |
geom_point()+ | |
geom_text_repel(aes(label=Country))+ | |
theme_classic()+ | |
theme_bw() | |
g7=ggplot(ddf8, aes(Total,GDPgrowth, color =cluster)) + | |
geom_point()+ | |
geom_text_repel(aes(label=Country))+ | |
theme_classic()+ | |
theme_bw() | |
g9=ggplot(ddf8, aes(Total,Life, color =cluster)) + | |
geom_point()+ | |
geom_text_repel(aes(label=Country))+ | |
theme_classic()+ | |
theme_bw() | |
multiplot(g1,g2, g7, g9,cols = 2) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment