shuozhang1985 · August 22, 2016 00:23
diff --git a/analysis of medal-winning b/analysis of medal-winning
 setwd("~/Desktop/web scraping")
 library(dplyr)
 library(dygraphs)
 library(reshape2)
 library(ggplot2)
 library(plotly)
 library(car)
 library(TSA)
 library(forecast)
 library(xts)
 library(Rmisc)
 library(NbClust)
 library(cluster)
 library(factoextra)
 library(ggrepel)
 library(corrplot)
 library(PerformanceAnalytics)
 output=read.csv('output.txt', header = T, stringsAsFactors = F)
 output2012=filter(output, year==2012)
 #summary(output)
 #sapply(output[,c('Gold', 'Silver', 'Bronze', 'Total')], sd)

 # pick the top countries by gold and total
 output_bycountry=output%>%
  dplyr::group_by(Country)%>%
  summarise(totoalgold=sum(Gold), avggold=mean(Gold))%>%
  arrange(desc(avggold))
 head(output_bycountry,20)
 # Country totoalgold   avggold
 # <chr>      <int>     <dbl>
 #   1   Unified Team         45 45.000000
 # 2   Soviet Union        395 43.888889
 # 3  United States        990 36.666667 (1)
 # 4   East Germany        153 30.600000
 # 5          China        201 25.125000 (2)
 # 6         Russia        134 19.142857 (3)
 # 7        Germany        213 11.210526 (4)
 # 8   West Germany         56 11.200000
 # 9  Great Britain        245  8.750000 (5)
 # 10         Italy        211  8.115385 (6)
 # 11        France        222  7.928571 (7)
 # 12       Ukraine         33  6.600000
 # 13       Hungary        170  6.538462 (8)
 # 14         Japan        130  6.500000 (9)
 # 15        Sweden        144  5.538462 (10)
 # 16   South Korea         81  5.400000 (11)
 # 17     Australia        138  5.307692 (12)
 # 18          Cuba         71  5.071429
 # 19       Romania         88  4.888889
 # 20       Finland        106  4.240000
 output_bycountry2=output%>%
  dplyr::group_by(Country)%>%
  summarise(totoalmetal=sum(Total), avgmetal=mean(Total))%>%
  arrange(desc(avgmetal))
 head(output_bycountry2,20)
 # Country totoalmetal  avgmetal
 # <chr>       <int>     <dbl>
 #   1   Soviet Union        1010 112.22222
 # 2   Unified Team         112 112.00000
 # 3  United States        2425  89.81481 (1)
 # 4   East Germany         409  81.80000
 # 5          China         473  59.12500  (2)
 # 6         Russia         405  57.85714 (3)
 # 7   West Germany         204  40.80000
 # 8        Germany         728  38.31579 (4)
 # 9  Great Britain         811  28.96429 (5)
 # 10        France         727  25.96429 (6)
 # 11       Ukraine         115  23.00000
 # 12         Italy         577  22.19231 (7)
 # 13         Japan         400  20.00000 (8)
 # 14        Sweden         498  19.15385 (9)
 # 15       Hungary         490  18.84615 (10)
 # 16     Australia         475  18.26923 (11)
 # 17       Romania         301  16.72222
 # 18   South Korea         243  16.20000 (12)
 # 19       Belarus          75  15.00000
 # 20          Cuba         202  14.42857

 # pick the top countries
 top10=dplyr::filter(output, Country %in% c('United States','China', 'Great Britain',
                                          'Russia', 'South Korea', 'Germany',
                                          'France', 'Italy', 'Hungary', 
                                          'Japan', 'Australia', 'Brazil', 'Sweden'))

 # plot dygraphs
 df2=dcast(top10[,c('Country', 'Total', 'year')], year~ Country,value.var="Total")
 color=c('#dddfd4', '#fae596', '#3fb0ac', '#173e43', '#98dafc', '#e62739', '#9068be',
        '#dbc3d0', '#6a5750', '#e05915', '#bccbde', '#300032', '#6534ff')
 dygraph(df2) %>%
  dyOptions(colors = color) %>%
  dyRangeSelector()

 # ggplot of  boxplot
 ggplot(data=top10, aes(x=reorder(Country,Total), y=Total, color=Country))+
  geom_boxplot()+  
  xlab('Country')+
  ylab('Number of total medals')+
  ggtitle('Medal distribution by top leading countries of the 28 summer Olympics')+
  theme_bw()+
  theme(legend.position='none', axis.text.x = element_text(angle = 45, hjust = 1))

 # population
 population=read.csv('population_both_gender.csv', header = T, stringsAsFactors = F)
 summary(population)

 population1_2012=population %>%
  dplyr::filter(Place %in% output2012$Country)%>%
  dplyr::select(Country=Place, Population=X2012)
 population2_2012=population %>%
  dplyr::filter(Place %in%          c('China, Hong Kong SAR',
                                    'United Kingdom',
                                    'Iran (Islamic Republic of)',
                                    'Republic of Moldova',
                                    "Dem. People's Republic of Korea",
                                    'Russian Federation',
                                    'Republic of Korea',
                                    'United States of America',
                                    'Venezuela (Bolivarian Republic of)')) %>%
  dplyr::select(Country=Place, Population=X2012)
 population2_2012$Country=c('Hong Kong','North Korea','South Korea',
                           'Iran', 'Moldova', 
                           'Russia', 'Great Britain',
                           'Venezuela', 'United States')
 population_2012=rbind(population1_2012, population2_2012)
 taibei=cbind(Country=c('Chinese Taipei'), Population=as.numeric(23234.940))
 population_2012=rbind(population_2012, taibei)
 output2012_bypopulation=merge(x=output2012, y=population_2012, by='Country', all.x=T)


 #GDP
 GDP=read.csv('GDP.csv', header = T, stringsAsFactors = F)
 GDP1_2012=GDP %>%
  dplyr::filter(Country.Name %in% output2012$Country)%>%
  dplyr::select(Country=Country.Name, GDP=X2012)

 merge(x=output2012, y=GDP1_2012, by='Country', all.x=T)
 addition=c('Bahamas, The', 'Egypt, Arab Rep.', 'United Kingdom', 'Hong Kong SAR, China',
  'Iran, Islamic Rep.',  'Russian Federation',
  'Slovak Republic', 'Korea, Rep.', 'Venezuela, RB')
 GDP2_2012=GDP %>%
  dplyr::filter(Country.Name %in% addition)%>%
  dplyr::select(Country=Country.Name, GDP=X2012)
 GDP2_2012$Country=c('Bahamas','Egypt','Great Britain', 'Hong Kong',
  'Iran', 'South Korea', 'Russia', 'Slovakia', 'Venezuela')
 GDP_2012=rbind(GDP1_2012, GDP2_2012)
 taipei=cbind(Country=c('Chinese Taipei', 'North Korea'), GDP=as.numeric(42201.09, 1800))	
 GDP_2012=rbind(GDP_2012, taipei)
 output2012_bypopulationGDP=merge(x=output2012_bypopulation, y=GDP_2012, by='Country', all.x=T)
 output2012_bypopulationGDP['Population']=as.numeric(output2012_bypopulationGDP$Population)
 output2012_bypopulationGDP['GDP']=as.numeric(output2012_bypopulationGDP$GDP)

 #GDP growth
 GDPgrowth=read.csv('GDPgrowth.csv', header = T, stringsAsFactors = F)
 GDPgrowth1_2012=GDPgrowth %>%
  dplyr::filter(Country.Name %in% output2012$Country)%>%
  dplyr::select(Country=Country.Name, GDPgrowth=X2012)
 addition=c('Bahamas, The', 'Egypt, Arab Rep.', 'United Kingdom', 'Hong Kong SAR, China',
           'Iran, Islamic Rep.', 'Russian Federation',
           'Slovak Republic', 'Korea, Rep.', 'Venezuela, RB')
 GDPgrowth2_2012=GDPgrowth %>%
  dplyr::filter(Country.Name %in% addition)%>%
  dplyr::select(Country=Country.Name, GDPgrowth=X2012)

 GDPgrowth2_2012$Country=c('Bahamas','Egypt','Great Britain', 'Hong Kong',
                    'Iran','South Korea', 'Russia', 'Slovakia', 'Venezuela')
 GDPgrowth_2012=rbind(GDPgrowth1_2012, GDPgrowth2_2012)
 taipei1=cbind(Country=c('Chinese Taipei', 'North Korea'), GDPgrowth=as.numeric(2.06, 1.3))	
 GDPgrowth_2012=rbind(GDPgrowth_2012, taipei1)
 output2012_bypopulationGDP1=merge(x=output2012_bypopulationGDP, y=GDPgrowth_2012, by='Country', all.x=T)
 output2012_bypopulationGDP1$GDPgrowth=as.numeric(output2012_bypopulationGDP1$GDPgrowth)

 #life expectency
 life=read.csv('life.csv', header = T, stringsAsFactors = F)
 trim.leading <- function (x)  sub("^\\s+", "", x)
 life$Country=trim.leading(life$Country)
 life_2012=life %>%
  dplyr::filter(Country %in% output2012$Country)%>%
  dplyr::select(Country, Life)
 merge(x=output2012_bypopulationGDP1,y=life_2012, by='Country', all.x=T)%>%
  dplyr::select(Country, Life)
 addition=c('The Bahamas', 'United Kingdom')
 life1_2012=life %>%
  dplyr::filter(Country %in% addition)%>%
  dplyr::select(Country, Life)
 life1_2012$Country=c('Great Britain', 'Bahamas')
 Taibei=cbind(Country=c('Chinese Taipei', 'Montenegro'), 
             Life=as.numeric(79.5, 74.65))
 life2_2012=rbind(life_2012,life1_2012,Taibei)
 output2012_bypopulationGDP2=merge(x=output2012_bypopulationGDP1,
                                  y=life2_2012, by='Country', all.x=T)
 output2012_bypopulationGDP2$Life=as.numeric(output2012_bypopulationGDP2$Life)
 summary(output2012_bypopulationGDP2)

 #kmeans model
 df=dplyr::select(output2012_bypopulationGDP2,Country,Total, Population,GDP, GDPgrowth, Life)
 chart.Correlation(df[,-1], histogram=TRUE, pch=19)
 df$Population=df$Population/1000
 df$GDP=df$GDP/1000
 ddf=scale(df[,-1])
 #d = dist(as.data.frame(scale(df[,-1])))
 #fit.average =  hclust(d, method = "complete")
 # par(mfrow = c(1, 1))
 # plot(fit.average, hang = -1, main = "Dendrogram of Average Linkage\n5 Clusters")
 # rect.hclust(fit.average, k = 5)
 set.seed(1234)
 fviz_nbclust(ddf, kmeans, method = "silhouette")
 # nc <- NbClust(ddf, min.nc=2, max.nc=15, method="kmeans")
 # table(nc$Best.n[1,])
 # barplot(table(nc$Best.n[1,]),
 #         xlab="Numer of Clusters", ylab="Number of Criteria")
 set.seed(1234)
 fit.km <- kmeans(ddf, 5, nstart=25)
 fviz_cluster(fit.km, data = ddf, geom = "point",
             stand = FALSE, frame.type = "norm")
 # fit.km$size
 # fit.km$centers
 # aggregate(df[,-1], by=list(cluster=fit.km$cluster), mean)

 df$cluster <- as.factor(fit.km$cluster)


 df1=df%>%
  dplyr::filter(cluster==1)%>%
  arrange(desc(Total))

 df2=df%>%
  dplyr::filter(cluster==2)%>%
  arrange(desc(Total))

 df3=df%>%
  dplyr::filter(cluster==3)%>%
  arrange(desc(Total))

 df4=df%>%
  dplyr::filter(cluster==4)%>%
  arrange(desc(Total))

 df5=df%>%
  dplyr::filter(cluster==5)%>%
  arrange(desc(Total))


 df8=rbind(df1, head(df4,5), head(df3, 5),
          head(df2, 5),
          df5)

 g1=ggplot(df8, aes(Total,Population, color =cluster)) +
  geom_point()+
  geom_text_repel(aes(label=Country))+
  theme_classic()+
  theme_bw()
 g2=ggplot(df8, aes(Total,GDP, color =cluster)) +
  geom_point()+
  geom_text_repel(aes(label=Country))+
  theme_classic()+
  theme_bw()
 g7=ggplot(df8, aes(Total,GDPgrowth, color =cluster)) +
  geom_point()+
  geom_text_repel(aes(label=Country))+
  theme_classic()+
  theme_bw()
 g9=ggplot(df8, aes(Total,Life, color =cluster)) +
  geom_point()+
  geom_text_repel(aes(label=Country))+
  theme_classic()+
  theme_bw()

 multiplot(g1,g2, g7, g9,cols = 2)

 g3=ggplot(df, aes(x=reorder(cluster, Total), y=Total, color=cluster))+
   geom_point()+
   geom_boxplot()+
   xlab('Cluster')+
   theme_bw()
 g4=ggplot(df, aes(x=reorder(cluster, Population), y=Population, color=cluster))+
  geom_point()+
  geom_boxplot()+
  xlab('Cluster')+
  theme_bw()
 g5=ggplot(df, aes(x=reorder(cluster, GDP), y=GDP, color=cluster))+
  geom_point()+
  geom_boxplot()+
  xlab('Cluster')+
  theme_bw()
 g8=ggplot(df, aes(x=reorder(cluster, GDPgrowth), y=GDPgrowth, color=cluster))+
  geom_point()+
  geom_boxplot()+
  xlab('Cluster')+
  theme_bw()
 g10=ggplot(df, aes(x=reorder(cluster, Life), y=Life, color=cluster))+
  geom_point()+
  geom_boxplot()+
  xlab('Cluster')+
  theme_bw()
 multiplot(g3,g4,g5,g8,g10, cols=2)

 # 2008
 output=read.csv('output.txt', header = T, stringsAsFactors = F)
 output2008=filter(output, year==2008)

 # population
 population=read.csv('population_both_gender.csv', header = T, stringsAsFactors = F)

 population1_2008=population %>%
  dplyr::filter(Place %in% output2008$Country)%>%
  select(Country=Place, Population=X2008)

 population2_2008=population %>%
  dplyr::filter(Place %in%          c('China, Hong Kong SAR',
                                      'United Kingdom',
                                      'Iran (Islamic Republic of)',
                                      'Republic of Moldova',
                                      "Dem. People's Republic of Korea",
                                      'Russian Federation',
                                      'Republic of Korea',
                                      'United States of America',
                                      'Venezuela (Bolivarian Republic of)')) %>%
  select(Country=Place, Population=X2008)
 population2_2008$Country=c('Hong Kong','North Korea','South Korea',
                           'Iran', 'Moldova', 
                           'Russia', 'Great Britain', 
                           'Venezuela', 'United States')
 population_2008=rbind(population1_2008, population2_2008)
 taibei=cbind(Country=c('Chinese Taipei', 'Vietnam'), Population=as.numeric(23040, 85120))
 population_2008=rbind(population_2008, taibei)
 output2008_bypopulation=merge(x=output2008, y=population_2008, by='Country', all.x=T)


 #GDP
 GDP=read.csv('GDP.csv', header = T, stringsAsFactors = F)

 GDP1_2008=GDP %>%
  dplyr::filter(Country.Name %in% output2008$Country)%>%
  select(Country=Country.Name, GDP=X2008)
 addition=c('Bahamas, The', 'Egypt, Arab Rep.', 'United Kingdom', 'Hong Kong SAR, China',
           'Iran, Islamic Rep.', 
             'Venezuela, RB')
 GDP2_2008=GDP %>%
  dplyr::filter(Country.Name %in% addition)%>%
  select(Country=Country.Name, GDP=X2008)

 GDP2_2008$Country=c('Bahamas','Egypt','Great Britain', 'Hong Kong',
                    'Iran', 'Venezuela')
 GDP_2008=rbind(GDP1_2008, GDP2_2008)
 taipei=cbind(Country=c('Chinese Taipei', 'Kyrgyzstan', 'Russia', 'Slovakia', 'South Korea',
                       'North Korea'), 
             GDP=as.numeric(31900, 966.39, 11699.68, 18558.88, 20474.83, 1800))	
 GDP_2008=rbind(GDP_2008, taipei)
 output2008_bypopulationGDP=merge(x=output2008_bypopulation, y=GDP_2008, by='Country', all.x=T)
 output2008_bypopulationGDP['Population']=as.numeric(output2008_bypopulationGDP$Population)
 output2008_bypopulationGDP['GDP']=as.numeric(output2008_bypopulationGDP$GDP)

 #GDP growth
 GDPgrowth=read.csv('GDPgrowth.csv', header = T, stringsAsFactors = F)

 GDPgrowth1_2008=GDPgrowth %>%
  dplyr::filter(Country.Name %in% output2008$Country)%>%
  select(Country=Country.Name, GDPgrowth=X2008)
 addition=c('Bahamas, The', 'Egypt, Arab Rep.', 'United Kingdom', 'Hong Kong SAR, China',
           'Iran, Islamic Rep.', 
             'Venezuela, RB')
 GDPgrowth2_2008=GDPgrowth %>%
  dplyr::filter(Country.Name %in% addition)%>%
  select(Country=Country.Name, GDPgrowth=X2008)

 GDPgrowth2_2008$Country=c('Bahamas','Egypt','Great Britain', 'Hong Kong',
                          'Iran',   'Venezuela')
 GDPgrowth_2008=rbind(GDPgrowth1_2008, GDPgrowth2_2008)
 taipei1=cbind(Country=c('Chinese Taipei', 'Russia', 'Kyrgyzstan','Slovakia', 'South Korea', 'North Korea'),
              GDPgrowth=as.numeric(0.7, 8.4, 5.2, 5.4, 2.8, 3.1))	
 GDPgrowth_2008=rbind(GDPgrowth_2008, taipei1)
 output2008_bypopulationGDP1=merge(x=output2008_bypopulationGDP, y=GDPgrowth_2008, by='Country', all.x=T)
 output2008_bypopulationGDP1$GDPgrowth=as.numeric(output2008_bypopulationGDP1$GDPgrowth)

 #life expectency
 life=read.csv('life1.csv', header = T, stringsAsFactors = F)
 trim.leading <- function (x)  sub("^\\s+", "", x)
 life$Country=trim.leading(life$Country)
 life_2008=life %>%
  dplyr::filter(Country %in% output2008$Country)%>%
  select(Country, Life)
 merge(x=output2008_bypopulationGDP1,y=life_2008, by='Country', all.x=T)%>%
  select(Country, Life)
 addition=c('United Kingdom')
 life1_2008=life %>%
  dplyr::filter(Country %in% addition)%>%
  select(Country, Life)
 life1_2008$Country=c('Great Britain')
 Taibei=cbind(Country=c('Chinese Taipei', 'Bahamas', 'Netherlands',
                       'North Korea', 'South Korea'),
             Life=as.numeric(77.76, 74.25, 80.25, 68.49, 79.83))
 life2_2008=rbind(life_2008,life1_2008,Taibei)
 output2008_bypopulationGDP2=merge(x=output2008_bypopulationGDP1,
                                  y=life2_2008, by='Country', all.x=T)
 output2008_bypopulationGDP2$Life=as.numeric(output2008_bypopulationGDP2$Life)
 #summary(output2008_bypopulationGDP2)

 df=select(output2008_bypopulationGDP2,Country, Total, Population,GDP, GDPgrowth, Life)
 chart.Correlation(df[,-1], histogram=TRUE, pch=19)

 # kmeans model
 df$Population=df$Population/1000
 df$GDP=df$GDP/1000
 df=df[!df$Country=='Zimbabwe',]
 ddf=scale(df[,-1])
 set.seed(1234)
 fviz_nbclust(ddf, kmeans, method = "silhouette")
 set.seed(1234)
 fit.km <- kmeans(ddf, 5, nstart=25)
 fviz_cluster(fit.km, data = ddf, geom = "point",
             stand = FALSE, frame.type = "norm")
 df$cluster <- as.factor(fit.km$cluster)
 ddf6=merge(x=df, y=select(output2008_bypopulationGDP2[!output2008_bypopulationGDP2$Country=='Zimbabwe',],
                          Country, Total), by='Country', 
          all = T)
 g3=ggplot(data=ddf6, aes(x=reorder(cluster, Total), y=Total, color=cluster))+
  geom_boxplot()+
  xlab('cluster')+
  ylab('total medals of each country')+
  ggtitle('cluster vs total medals')+
  theme_bw()+
  theme(legend.position='none')

 g4=ggplot(ddf6, aes(x=reorder(cluster, Population), y=Population, color=cluster))+
  geom_point()+
  geom_boxplot()+
  xlab('Cluster')+
  theme_bw()+
  theme(legend.position='none')
 g5=ggplot(ddf6, aes(x=reorder(cluster, GDP), y=GDP, color=cluster))+
  geom_point()+
  geom_boxplot()+
  xlab('Cluster')+
  theme_bw()+
  theme(legend.position='none')
 g8=ggplot(ddf6, aes(x=reorder(cluster, GDPgrowth), y=GDPgrowth, color=cluster))+
  geom_point()+
  geom_boxplot()+
  xlab('Cluster')+
  theme_bw()+
  theme(legend.position='none')
 g10=ggplot(ddf6, aes(x=reorder(cluster, Life), y=Life, color=cluster))+
  geom_point()+
  geom_boxplot()+
  xlab('Cluster')+
  theme_bw()+
  theme(legend.position='none')
 multiplot(g3, g4,g5,g8,g10, cols=2)

 ddf8=filter(df, Country %in% df8$Country)
 g1=ggplot(ddf8, aes(Total,Population, color =cluster)) +
  geom_point()+
  geom_text_repel(aes(label=Country))+
  theme_classic()+
  theme_bw()
 g2=ggplot(ddf8, aes(Total,GDP, color =cluster)) +
  geom_point()+
  geom_text_repel(aes(label=Country))+
  theme_classic()+
  theme_bw()
 g7=ggplot(ddf8, aes(Total,GDPgrowth, color =cluster)) +
  geom_point()+
  geom_text_repel(aes(label=Country))+
  theme_classic()+
  theme_bw()
 g9=ggplot(ddf8, aes(Total,Life, color =cluster)) +
  geom_point()+
  geom_text_repel(aes(label=Country))+
  theme_classic()+
  theme_bw()

 multiplot(g1,g2, g7, g9,cols = 2)
	setwd("~/Desktop/web scraping")
	library(dplyr)
	library(dygraphs)
	library(reshape2)
	library(ggplot2)
	library(plotly)
	library(car)
	library(TSA)
	library(forecast)
	library(xts)
	library(Rmisc)
	library(NbClust)
	library(cluster)
	library(factoextra)
	library(ggrepel)
	library(corrplot)
	library(PerformanceAnalytics)
	output=read.csv('output.txt', header = T, stringsAsFactors = F)
	output2012=filter(output, year==2012)
	#summary(output)
	#sapply(output[,c('Gold', 'Silver', 'Bronze', 'Total')], sd)

	# pick the top countries by gold and total
	output_bycountry=output%>%
	dplyr::group_by(Country)%>%
	summarise(totoalgold=sum(Gold), avggold=mean(Gold))%>%
	arrange(desc(avggold))
	head(output_bycountry,20)
	# Country totoalgold avggold
	# <chr> <int> <dbl>
	# 1 Unified Team 45 45.000000
	# 2 Soviet Union 395 43.888889
	# 3 United States 990 36.666667 (1)
	# 4 East Germany 153 30.600000
	# 5 China 201 25.125000 (2)
	# 6 Russia 134 19.142857 (3)
	# 7 Germany 213 11.210526 (4)
	# 8 West Germany 56 11.200000
	# 9 Great Britain 245 8.750000 (5)
	# 10 Italy 211 8.115385 (6)
	# 11 France 222 7.928571 (7)
	# 12 Ukraine 33 6.600000
	# 13 Hungary 170 6.538462 (8)
	# 14 Japan 130 6.500000 (9)
	# 15 Sweden 144 5.538462 (10)
	# 16 South Korea 81 5.400000 (11)
	# 17 Australia 138 5.307692 (12)
	# 18 Cuba 71 5.071429
	# 19 Romania 88 4.888889
	# 20 Finland 106 4.240000
	output_bycountry2=output%>%
	dplyr::group_by(Country)%>%
	summarise(totoalmetal=sum(Total), avgmetal=mean(Total))%>%
	arrange(desc(avgmetal))
	head(output_bycountry2,20)
	# Country totoalmetal avgmetal
	# <chr> <int> <dbl>
	# 1 Soviet Union 1010 112.22222
	# 2 Unified Team 112 112.00000
	# 3 United States 2425 89.81481 (1)
	# 4 East Germany 409 81.80000
	# 5 China 473 59.12500 (2)
	# 6 Russia 405 57.85714 (3)
	# 7 West Germany 204 40.80000
	# 8 Germany 728 38.31579 (4)
	# 9 Great Britain 811 28.96429 (5)
	# 10 France 727 25.96429 (6)
	# 11 Ukraine 115 23.00000
	# 12 Italy 577 22.19231 (7)
	# 13 Japan 400 20.00000 (8)
	# 14 Sweden 498 19.15385 (9)
	# 15 Hungary 490 18.84615 (10)
	# 16 Australia 475 18.26923 (11)
	# 17 Romania 301 16.72222
	# 18 South Korea 243 16.20000 (12)
	# 19 Belarus 75 15.00000
	# 20 Cuba 202 14.42857

	# pick the top countries
	top10=dplyr::filter(output, Country %in% c('United States','China', 'Great Britain',
	'Russia', 'South Korea', 'Germany',
	'France', 'Italy', 'Hungary',
	'Japan', 'Australia', 'Brazil', 'Sweden'))

	# plot dygraphs
	df2=dcast(top10[,c('Country', 'Total', 'year')], year~ Country,value.var="Total")
	color=c('#dddfd4', '#fae596', '#3fb0ac', '#173e43', '#98dafc', '#e62739', '#9068be',
	'#dbc3d0', '#6a5750', '#e05915', '#bccbde', '#300032', '#6534ff')
	dygraph(df2) %>%
	dyOptions(colors = color) %>%
	dyRangeSelector()

	# ggplot of boxplot
	ggplot(data=top10, aes(x=reorder(Country,Total), y=Total, color=Country))+
	geom_boxplot()+
	xlab('Country')+
	ylab('Number of total medals')+
	ggtitle('Medal distribution by top leading countries of the 28 summer Olympics')+
	theme_bw()+
	theme(legend.position='none', axis.text.x = element_text(angle = 45, hjust = 1))

	# population
	population=read.csv('population_both_gender.csv', header = T, stringsAsFactors = F)
	summary(population)

	population1_2012=population %>%
	dplyr::filter(Place %in% output2012$Country)%>%
	dplyr::select(Country=Place, Population=X2012)
	population2_2012=population %>%
	dplyr::filter(Place %in% c('China, Hong Kong SAR',
	'United Kingdom',
	'Iran (Islamic Republic of)',
	'Republic of Moldova',
	"Dem. People's Republic of Korea",
	'Russian Federation',
	'Republic of Korea',
	'United States of America',
	'Venezuela (Bolivarian Republic of)')) %>%
	dplyr::select(Country=Place, Population=X2012)
	population2_2012$Country=c('Hong Kong','North Korea','South Korea',
	'Iran', 'Moldova',
	'Russia', 'Great Britain',
	'Venezuela', 'United States')
	population_2012=rbind(population1_2012, population2_2012)
	taibei=cbind(Country=c('Chinese Taipei'), Population=as.numeric(23234.940))
	population_2012=rbind(population_2012, taibei)
	output2012_bypopulation=merge(x=output2012, y=population_2012, by='Country', all.x=T)


	#GDP
	GDP=read.csv('GDP.csv', header = T, stringsAsFactors = F)
	GDP1_2012=GDP %>%
	dplyr::filter(Country.Name %in% output2012$Country)%>%
	dplyr::select(Country=Country.Name, GDP=X2012)

	merge(x=output2012, y=GDP1_2012, by='Country', all.x=T)
	addition=c('Bahamas, The', 'Egypt, Arab Rep.', 'United Kingdom', 'Hong Kong SAR, China',
	'Iran, Islamic Rep.', 'Russian Federation',
	'Slovak Republic', 'Korea, Rep.', 'Venezuela, RB')
	GDP2_2012=GDP %>%
	dplyr::filter(Country.Name %in% addition)%>%
	dplyr::select(Country=Country.Name, GDP=X2012)
	GDP2_2012$Country=c('Bahamas','Egypt','Great Britain', 'Hong Kong',
	'Iran', 'South Korea', 'Russia', 'Slovakia', 'Venezuela')
	GDP_2012=rbind(GDP1_2012, GDP2_2012)
	taipei=cbind(Country=c('Chinese Taipei', 'North Korea'), GDP=as.numeric(42201.09, 1800))
	GDP_2012=rbind(GDP_2012, taipei)
	output2012_bypopulationGDP=merge(x=output2012_bypopulation, y=GDP_2012, by='Country', all.x=T)
	output2012_bypopulationGDP['Population']=as.numeric(output2012_bypopulationGDP$Population)
	output2012_bypopulationGDP['GDP']=as.numeric(output2012_bypopulationGDP$GDP)

	#GDP growth
	GDPgrowth=read.csv('GDPgrowth.csv', header = T, stringsAsFactors = F)
	GDPgrowth1_2012=GDPgrowth %>%
	dplyr::filter(Country.Name %in% output2012$Country)%>%
	dplyr::select(Country=Country.Name, GDPgrowth=X2012)
	addition=c('Bahamas, The', 'Egypt, Arab Rep.', 'United Kingdom', 'Hong Kong SAR, China',
	'Iran, Islamic Rep.', 'Russian Federation',
	'Slovak Republic', 'Korea, Rep.', 'Venezuela, RB')
	GDPgrowth2_2012=GDPgrowth %>%
	dplyr::filter(Country.Name %in% addition)%>%
	dplyr::select(Country=Country.Name, GDPgrowth=X2012)

	GDPgrowth2_2012$Country=c('Bahamas','Egypt','Great Britain', 'Hong Kong',
	'Iran','South Korea', 'Russia', 'Slovakia', 'Venezuela')
	GDPgrowth_2012=rbind(GDPgrowth1_2012, GDPgrowth2_2012)
	taipei1=cbind(Country=c('Chinese Taipei', 'North Korea'), GDPgrowth=as.numeric(2.06, 1.3))
	GDPgrowth_2012=rbind(GDPgrowth_2012, taipei1)
	output2012_bypopulationGDP1=merge(x=output2012_bypopulationGDP, y=GDPgrowth_2012, by='Country', all.x=T)
	output2012_bypopulationGDP1$GDPgrowth=as.numeric(output2012_bypopulationGDP1$GDPgrowth)

	#life expectency
	life=read.csv('life.csv', header = T, stringsAsFactors = F)
	trim.leading <- function (x) sub("^\\s+", "", x)
	life$Country=trim.leading(life$Country)
	life_2012=life %>%
	dplyr::filter(Country %in% output2012$Country)%>%
	dplyr::select(Country, Life)
	merge(x=output2012_bypopulationGDP1,y=life_2012, by='Country', all.x=T)%>%
	dplyr::select(Country, Life)
	addition=c('The Bahamas', 'United Kingdom')
	life1_2012=life %>%
	dplyr::filter(Country %in% addition)%>%
	dplyr::select(Country, Life)
	life1_2012$Country=c('Great Britain', 'Bahamas')
	Taibei=cbind(Country=c('Chinese Taipei', 'Montenegro'),
	Life=as.numeric(79.5, 74.65))
	life2_2012=rbind(life_2012,life1_2012,Taibei)
	output2012_bypopulationGDP2=merge(x=output2012_bypopulationGDP1,
	y=life2_2012, by='Country', all.x=T)
	output2012_bypopulationGDP2$Life=as.numeric(output2012_bypopulationGDP2$Life)
	summary(output2012_bypopulationGDP2)

	#kmeans model
	df=dplyr::select(output2012_bypopulationGDP2,Country,Total, Population,GDP, GDPgrowth, Life)
	chart.Correlation(df[,-1], histogram=TRUE, pch=19)
	df$Population=df$Population/1000
	df$GDP=df$GDP/1000
	ddf=scale(df[,-1])
	#d = dist(as.data.frame(scale(df[,-1])))
	#fit.average = hclust(d, method = "complete")
	# par(mfrow = c(1, 1))
	# plot(fit.average, hang = -1, main = "Dendrogram of Average Linkage\n5 Clusters")
	# rect.hclust(fit.average, k = 5)
	set.seed(1234)
	fviz_nbclust(ddf, kmeans, method = "silhouette")
	# nc <- NbClust(ddf, min.nc=2, max.nc=15, method="kmeans")
	# table(nc$Best.n[1,])
	# barplot(table(nc$Best.n[1,]),
	# xlab="Numer of Clusters", ylab="Number of Criteria")
	set.seed(1234)
	fit.km <- kmeans(ddf, 5, nstart=25)
	fviz_cluster(fit.km, data = ddf, geom = "point",
	stand = FALSE, frame.type = "norm")
	# fit.km$size
	# fit.km$centers
	# aggregate(df[,-1], by=list(cluster=fit.km$cluster), mean)

	df$cluster <- as.factor(fit.km$cluster)


	df1=df%>%
	dplyr::filter(cluster==1)%>%
	arrange(desc(Total))

	df2=df%>%
	dplyr::filter(cluster==2)%>%
	arrange(desc(Total))

	df3=df%>%
	dplyr::filter(cluster==3)%>%
	arrange(desc(Total))

	df4=df%>%
	dplyr::filter(cluster==4)%>%
	arrange(desc(Total))

	df5=df%>%
	dplyr::filter(cluster==5)%>%
	arrange(desc(Total))


	df8=rbind(df1, head(df4,5), head(df3, 5),
	head(df2, 5),
	df5)

	g1=ggplot(df8, aes(Total,Population, color =cluster)) +
	geom_point()+
	geom_text_repel(aes(label=Country))+
	theme_classic()+
	theme_bw()
	g2=ggplot(df8, aes(Total,GDP, color =cluster)) +
	geom_point()+
	geom_text_repel(aes(label=Country))+
	theme_classic()+
	theme_bw()
	g7=ggplot(df8, aes(Total,GDPgrowth, color =cluster)) +
	geom_point()+
	geom_text_repel(aes(label=Country))+
	theme_classic()+
	theme_bw()
	g9=ggplot(df8, aes(Total,Life, color =cluster)) +
	geom_point()+
	geom_text_repel(aes(label=Country))+
	theme_classic()+
	theme_bw()

	multiplot(g1,g2, g7, g9,cols = 2)

	g3=ggplot(df, aes(x=reorder(cluster, Total), y=Total, color=cluster))+
	geom_point()+
	geom_boxplot()+
	xlab('Cluster')+
	theme_bw()
	g4=ggplot(df, aes(x=reorder(cluster, Population), y=Population, color=cluster))+
	geom_point()+
	geom_boxplot()+
	xlab('Cluster')+
	theme_bw()
	g5=ggplot(df, aes(x=reorder(cluster, GDP), y=GDP, color=cluster))+
	geom_point()+
	geom_boxplot()+
	xlab('Cluster')+
	theme_bw()
	g8=ggplot(df, aes(x=reorder(cluster, GDPgrowth), y=GDPgrowth, color=cluster))+
	geom_point()+
	geom_boxplot()+
	xlab('Cluster')+
	theme_bw()
	g10=ggplot(df, aes(x=reorder(cluster, Life), y=Life, color=cluster))+
	geom_point()+
	geom_boxplot()+
	xlab('Cluster')+
	theme_bw()
	multiplot(g3,g4,g5,g8,g10, cols=2)

	# 2008
	output=read.csv('output.txt', header = T, stringsAsFactors = F)
	output2008=filter(output, year==2008)

	# population
	population=read.csv('population_both_gender.csv', header = T, stringsAsFactors = F)

	population1_2008=population %>%
	dplyr::filter(Place %in% output2008$Country)%>%
	select(Country=Place, Population=X2008)

	population2_2008=population %>%
	dplyr::filter(Place %in% c('China, Hong Kong SAR',
	'United Kingdom',
	'Iran (Islamic Republic of)',
	'Republic of Moldova',
	"Dem. People's Republic of Korea",
	'Russian Federation',
	'Republic of Korea',
	'United States of America',
	'Venezuela (Bolivarian Republic of)')) %>%
	select(Country=Place, Population=X2008)
	population2_2008$Country=c('Hong Kong','North Korea','South Korea',
	'Iran', 'Moldova',
	'Russia', 'Great Britain',
	'Venezuela', 'United States')
	population_2008=rbind(population1_2008, population2_2008)
	taibei=cbind(Country=c('Chinese Taipei', 'Vietnam'), Population=as.numeric(23040, 85120))
	population_2008=rbind(population_2008, taibei)
	output2008_bypopulation=merge(x=output2008, y=population_2008, by='Country', all.x=T)


	#GDP
	GDP=read.csv('GDP.csv', header = T, stringsAsFactors = F)

	GDP1_2008=GDP %>%
	dplyr::filter(Country.Name %in% output2008$Country)%>%
	select(Country=Country.Name, GDP=X2008)
	addition=c('Bahamas, The', 'Egypt, Arab Rep.', 'United Kingdom', 'Hong Kong SAR, China',
	'Iran, Islamic Rep.',
	'Venezuela, RB')
	GDP2_2008=GDP %>%
	dplyr::filter(Country.Name %in% addition)%>%
	select(Country=Country.Name, GDP=X2008)

	GDP2_2008$Country=c('Bahamas','Egypt','Great Britain', 'Hong Kong',
	'Iran', 'Venezuela')
	GDP_2008=rbind(GDP1_2008, GDP2_2008)
	taipei=cbind(Country=c('Chinese Taipei', 'Kyrgyzstan', 'Russia', 'Slovakia', 'South Korea',
	'North Korea'),
	GDP=as.numeric(31900, 966.39, 11699.68, 18558.88, 20474.83, 1800))
	GDP_2008=rbind(GDP_2008, taipei)
	output2008_bypopulationGDP=merge(x=output2008_bypopulation, y=GDP_2008, by='Country', all.x=T)
	output2008_bypopulationGDP['Population']=as.numeric(output2008_bypopulationGDP$Population)
	output2008_bypopulationGDP['GDP']=as.numeric(output2008_bypopulationGDP$GDP)

	#GDP growth
	GDPgrowth=read.csv('GDPgrowth.csv', header = T, stringsAsFactors = F)

	GDPgrowth1_2008=GDPgrowth %>%
	dplyr::filter(Country.Name %in% output2008$Country)%>%
	select(Country=Country.Name, GDPgrowth=X2008)
	addition=c('Bahamas, The', 'Egypt, Arab Rep.', 'United Kingdom', 'Hong Kong SAR, China',
	'Iran, Islamic Rep.',
	'Venezuela, RB')
	GDPgrowth2_2008=GDPgrowth %>%
	dplyr::filter(Country.Name %in% addition)%>%
	select(Country=Country.Name, GDPgrowth=X2008)

	GDPgrowth2_2008$Country=c('Bahamas','Egypt','Great Britain', 'Hong Kong',
	'Iran', 'Venezuela')
	GDPgrowth_2008=rbind(GDPgrowth1_2008, GDPgrowth2_2008)
	taipei1=cbind(Country=c('Chinese Taipei', 'Russia', 'Kyrgyzstan','Slovakia', 'South Korea', 'North Korea'),
	GDPgrowth=as.numeric(0.7, 8.4, 5.2, 5.4, 2.8, 3.1))
	GDPgrowth_2008=rbind(GDPgrowth_2008, taipei1)
	output2008_bypopulationGDP1=merge(x=output2008_bypopulationGDP, y=GDPgrowth_2008, by='Country', all.x=T)
	output2008_bypopulationGDP1$GDPgrowth=as.numeric(output2008_bypopulationGDP1$GDPgrowth)

	#life expectency
	life=read.csv('life1.csv', header = T, stringsAsFactors = F)
	trim.leading <- function (x) sub("^\\s+", "", x)
	life$Country=trim.leading(life$Country)
	life_2008=life %>%
	dplyr::filter(Country %in% output2008$Country)%>%
	select(Country, Life)
	merge(x=output2008_bypopulationGDP1,y=life_2008, by='Country', all.x=T)%>%
	select(Country, Life)
	addition=c('United Kingdom')
	life1_2008=life %>%
	dplyr::filter(Country %in% addition)%>%
	select(Country, Life)
	life1_2008$Country=c('Great Britain')
	Taibei=cbind(Country=c('Chinese Taipei', 'Bahamas', 'Netherlands',
	'North Korea', 'South Korea'),
	Life=as.numeric(77.76, 74.25, 80.25, 68.49, 79.83))
	life2_2008=rbind(life_2008,life1_2008,Taibei)
	output2008_bypopulationGDP2=merge(x=output2008_bypopulationGDP1,
	y=life2_2008, by='Country', all.x=T)
	output2008_bypopulationGDP2$Life=as.numeric(output2008_bypopulationGDP2$Life)
	#summary(output2008_bypopulationGDP2)

	df=select(output2008_bypopulationGDP2,Country, Total, Population,GDP, GDPgrowth, Life)
	chart.Correlation(df[,-1], histogram=TRUE, pch=19)

	# kmeans model
	df$Population=df$Population/1000
	df$GDP=df$GDP/1000
	df=df[!df$Country=='Zimbabwe',]
	ddf=scale(df[,-1])
	set.seed(1234)
	fviz_nbclust(ddf, kmeans, method = "silhouette")
	set.seed(1234)
	fit.km <- kmeans(ddf, 5, nstart=25)
	fviz_cluster(fit.km, data = ddf, geom = "point",
	stand = FALSE, frame.type = "norm")
	df$cluster <- as.factor(fit.km$cluster)
	ddf6=merge(x=df, y=select(output2008_bypopulationGDP2[!output2008_bypopulationGDP2$Country=='Zimbabwe',],
	Country, Total), by='Country',
	all = T)
	g3=ggplot(data=ddf6, aes(x=reorder(cluster, Total), y=Total, color=cluster))+
	geom_boxplot()+
	xlab('cluster')+
	ylab('total medals of each country')+
	ggtitle('cluster vs total medals')+
	theme_bw()+
	theme(legend.position='none')

	g4=ggplot(ddf6, aes(x=reorder(cluster, Population), y=Population, color=cluster))+
	geom_point()+
	geom_boxplot()+
	xlab('Cluster')+
	theme_bw()+
	theme(legend.position='none')
	g5=ggplot(ddf6, aes(x=reorder(cluster, GDP), y=GDP, color=cluster))+
	geom_point()+
	geom_boxplot()+
	xlab('Cluster')+
	theme_bw()+
	theme(legend.position='none')
	g8=ggplot(ddf6, aes(x=reorder(cluster, GDPgrowth), y=GDPgrowth, color=cluster))+
	geom_point()+
	geom_boxplot()+
	xlab('Cluster')+
	theme_bw()+
	theme(legend.position='none')
	g10=ggplot(ddf6, aes(x=reorder(cluster, Life), y=Life, color=cluster))+
	geom_point()+
	geom_boxplot()+
	xlab('Cluster')+
	theme_bw()+
	theme(legend.position='none')
	multiplot(g3, g4,g5,g8,g10, cols=2)

	ddf8=filter(df, Country %in% df8$Country)
	g1=ggplot(ddf8, aes(Total,Population, color =cluster)) +
	geom_point()+
	geom_text_repel(aes(label=Country))+
	theme_classic()+
	theme_bw()
	g2=ggplot(ddf8, aes(Total,GDP, color =cluster)) +
	geom_point()+
	geom_text_repel(aes(label=Country))+
	theme_classic()+
	theme_bw()
	g7=ggplot(ddf8, aes(Total,GDPgrowth, color =cluster)) +
	geom_point()+
	geom_text_repel(aes(label=Country))+
	theme_classic()+
	theme_bw()
	g9=ggplot(ddf8, aes(Total,Life, color =cluster)) +
	geom_point()+
	geom_text_repel(aes(label=Country))+
	theme_classic()+
	theme_bw()

	multiplot(g1,g2, g7, g9,cols = 2)