-
-
Save ramnathv/546738 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
######################################################## | |
##### Author: Diego Valle Jones | |
##### Website: www.diegovalle.net | |
##### Date Created: Mon Mar 01 18:51:27 2010 | |
######################################################## | |
#1. For what foods are Americans and Mexicans outliers | |
#2. Partition the data around medoids to classify the | |
#countries of the world according to what they eat | |
library(ggplot2) | |
library(maps) | |
library(maptools) | |
library(RColorBrewer) | |
library(Cairo) | |
library(cluster) | |
######################################################## | |
#Outliers | |
######################################################## | |
#diet<-read.csv("diet-forcsv-partial.csv") | |
diet<-read.csv("http://spreadsheets.google.com/pub?key=tdzqfp-_ypDqUNYnJEq8sgg&single=true&gid=0&output=csv") | |
#Plot the distributions of all the different foods | |
plotDensities <- function(df, name, color) { | |
cntrynum <- which(df$Countries == name) | |
#Only food columns | |
clmns <- 4:52 | |
mdiet <- melt(df[, c(1,clmns)], id = "Countries") | |
mdiet$variable <- sub(".day...2003", "", mdiet$variable) | |
us <- data.frame(value = unlist(df[cntrynum, clmns]), | |
variable = unique(mdiet$variable)) | |
ggplot(mdiet, aes(value)) + | |
geom_density() + | |
geom_vline(data =us, aes(xintercept = value), color = color) + | |
facet_wrap(~variable, scale = "free") + | |
xlab("") + opts(title = name) | |
} | |
plotDensities(diet, "United States of America", "blue") | |
#dev.print(png, "usa.png", width = 960, height = 600) | |
plotDensities(diet, "Mexico", "darkgreen") | |
#dev.print(png, "mx.png", width = 960, height = 600) | |
#Now get the outliers | |
diet.st <- diet | |
clmns <- 2:(ncol(diet.st)-1) | |
diet.st[ ,clmns] <- sapply(diet.st[ ,clmns], | |
function(x) (x - mean(x, na.action=na.omit)) / sd(x)) | |
diet.mx <- subset(diet.st, Countries=="Mexico" | | |
Countries=="United States of America") | |
diet.mx <- data.frame(t(diet.mx[2:(ncol(diet.mx)-1)])) | |
colnames(diet.mx) <- c("Mexico","US") | |
diet.mx$dif <- diet.mx$US - diet.mx$Mexico | |
outliers <- subset(diet.mx, US >= 2 | | |
Mexico >= 2 | | |
dif >= 2 | | |
US <= -2 | | |
Mexico <= -2 | | |
dif <= -2) | |
######################################################## | |
#CLuster Analysis | |
######################################################## | |
cleanMap <- function(df) { | |
world.map <- map_data("world") | |
world.map <- subset(world.map, region != "Antarctica") | |
world.map <- world.map[-grep("Sea|Lake", world.map$region),] | |
world.map <- world.map[-grep("Sea|Lake", world.map$subregion),] | |
df$country <- as.character(df$country) | |
df[which(df$country == "United States of America"),]$country <- "USA" | |
df[which(df$country == "United Kingdom"),]$country <- "UK" | |
df[which(df$country == "Russian Federation"),]$country <- "USSR" | |
p.map <- merge(world.map, df[c("country", "cluster")], | |
by.x = "region", | |
by.y ="country", | |
all.x = TRUE) | |
p.map <- p.map[order(p.map$order), ] | |
p.map | |
} | |
onlyFood <- function(vec){ | |
#col 52 is the last having to do with food | |
vec <- vec[ , 4:52] | |
vec | |
} | |
plotMap <- function(df, nclust){ | |
df.food <- onlyFood(df) | |
df.food$cluster <- pam(df.food, nclust)$cluster | |
df.food$country <- df$Countries | |
p.map <- cleanMap(df.food) | |
clr.map <- brewer.pal(nclust, "Set2") | |
ggplot(p.map, aes(long, lat)) + | |
geom_polygon(aes(group = group, fill = factor(cluster)), | |
color = "#C4C4C4", size = .2) + | |
scale_y_continuous(breaks = NA) + | |
scale_x_continuous(breaks = NA) + xlab("") + ylab("") + | |
scale_fill_manual("Cluster", values = c(clr.map, "#FEFEFE")) + | |
opts(panel.background = theme_rect(fill = "#e0f2ff", | |
colour = "white")) + | |
opts(title = "Cluster analysis of what the world eats") | |
} | |
nclust <- 6 | |
plotMap(diet.st, nclust) | |
#dev.print(png, "worlddiet.png", width = 900, height = 550) | |
cl <- pam(onlyFood(diet.st), nclust) | |
clusplot(pam(onlyFood(diet.st), nclust)) | |
med <- data.frame(t(cl$medoids)) | |
#write.csv(med, "centers.csv") | |
#Cairo(file="worlddiet.png", width = 900, height = 550) | |
# | |
#dev.off() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment