Last active
October 24, 2022 06:51
-
-
Save wraseman/f4be0414b1c791bd341082e2ef53b295 to your computer and use it in GitHub Desktop.
Code for figures used in my blog post on multivariate distance calculations: https://waterprogramming.wordpress.com/2018/07/23/multivariate-distances-mahalanobis-vs-euclidean/
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- | |
title: 'Multivariate Distances: Mahalanobis vs. Euclidean' | |
author: "Billy Raseman" | |
date: "July 12, 2018" | |
output: html_document | |
--- | |
Below is the code used to create the plots for https://waterprogramming.wordpress.com/2018/07/23/multivariate-distances-mahalanobis-vs-euclidean/. My apologies that it is not entirely reproducible. I found it a bit easier to post-process a few of the plots (particularly for the PCA part)! | |
```{r} | |
## initial setup | |
# clear environment | |
rm(list=ls()) | |
# load packages | |
library(tidyverse) | |
# save shortcut to remove axes ticks and text | |
axes_labels_only <- theme(axis.line=element_blank(), # remove axis | |
axis.text.x=element_blank(), | |
axis.text.y=element_blank(), | |
axis.ticks=element_blank(), panel.grid.major = element_blank(), | |
panel.grid.minor = element_blank() | |
) | |
## cars data analysis and visualization | |
# consider only 4wd cars | |
cars <- filter(mpg, str_detect(model, '4wd')) %>% | |
select(displ, hwy) %>% | |
unique | |
# examine mpg: dataset about cars | |
p <- ggplot(data = cars, mapping = aes(x = displ, y = hwy)) + | |
xlab("Displacement") + ylab("Gas mileage (highway)") + | |
geom_point() | |
# let's visually compare my hypothetical Jeep and the rest of 'em | |
my.car <- filter(mpg, model=="grand cherokee 4wd", year==1999, cyl==8) %>% | |
select(displ, hwy) | |
# plot values | |
p + geom_point(data=my.car, color="red", size=2) + | |
geom_text(data=my.car, label="dream car!", color="red", vjust=-1) | |
# plot values without axes (visually standardize) | |
p + geom_point(data=my.car, color="red", size=2) + | |
geom_text(data=my.car, label="dream car!", color="red", vjust=-1) + | |
axes_labels_only | |
``` | |
## Plot standardized values | |
```{r} | |
# calculate Euclidean distance between my dream car and the other cars | |
other.cars <- filter(mpg, model!="grand cherokee 4wd" | year!=1999 | cyl!=8) %>% | |
filter(str_detect(model, '4wd')) %>% | |
select(displ, hwy) | |
# compute Euclidean distance between cars | |
compare.cars <- rbind(my.car, other.cars) %>% select(hwy, displ) %>% | |
unique | |
hwy.mean <- mean(compare.cars$hwy) | |
hwy.sd <- sd(compare.cars$hwy) | |
displ.mean <- mean(compare.cars$displ) | |
displ.sd <- sd(compare.cars$displ) | |
compare.cars <- mutate(compare.cars, | |
hwy.standardized = (hwy-hwy.mean)/hwy.sd, | |
displ.standardized = (displ-displ.mean)/displ.sd) | |
compare.cars <- cbind(id=1:nrow(compare.cars), compare.cars) | |
# as you can see, all that has changed is the scale of the axes | |
ggplot(data = compare.cars, mapping = aes(x = displ.standardized, y = hwy.standardized)) + | |
geom_point() + | |
xlab("Standardized Displacement") + ylab("Standardized Gas mileage (highway)") | |
``` | |
## Plot Euclidean Distance | |
```{r} | |
# calculate Euclidean distance | |
compare.cars$euc.dist <- (select(compare.cars, | |
hwy.standardized, | |
displ.standardized) %>% | |
dist %>% # calculate Euclidean distance between all cars | |
as.matrix)[,1] # just use first column (distance between my cars and rest) | |
compare.cars$euc.dist.order <- (rank(compare.cars$euc.dist) - 1) %>% as.integer | |
# Euclidean distance plot | |
ggplot(data=compare.cars, aes(x = displ.standardized, y = hwy.standardized, color = euc.dist)) + | |
geom_point() + | |
geom_text(aes(label = euc.dist.order), size = 3, hjust=0, vjust=1.5) + | |
xlab("Displacement") + ylab("Gas mileage (highway)") + | |
axes_labels_only + | |
labs(color = "Euclidean distance") + | |
NULL | |
``` | |
## Plot Mahalanobis Distance | |
```{r} | |
# calculate Mahalanobis distance | |
x.mah <- select(compare.cars[-1,], hwy, displ) %>% as.matrix | |
center.mah <- select(compare.cars[1,], hwy, displ) %>% as.matrix | |
compare.cars$mah.dist <- c(0, mahalanobis(x = x.mah, center = center.mah, cov = cov(rbind(center.mah, x.mah)))) | |
compare.cars$mah.dist.order <- (rank(compare.cars$mah.dist) - 1) %>% as.integer | |
# Mahalanobis distance plot | |
ggplot(data=compare.cars, aes(x = displ.standardized, y = hwy.standardized, color = mah.dist)) + | |
geom_point() + | |
geom_text(aes(label = mah.dist.order), size = 3, hjust=0, vjust=1.5) + | |
xlab("Displacement") + ylab("Gas mileage (highway)") + | |
axes_labels_only + | |
labs(color = "Mahalanobis distance") + | |
NULL | |
``` | |
## Mahalanobis:Euclid #1 | |
```{r} | |
# standardize metrics | |
compare.cars$mah.dist.standardized <- scale(compare.cars$mah.dist) %>% as.vector | |
compare.cars$euc.dist.standardized <- scale(compare.cars$euc.dist) %>% as.vector | |
# sapply(compare.cars, typeof) | |
# as.tibble(compare.cars) | |
# get distance from line | |
compare.cars <- as.tibble(compare.cars) %>% | |
mutate(mah.euc = (mah.dist.standardized-euc.dist.standardized)/sqrt(2)) # dot product to find distance from line | |
# compare distance metrics | |
ggplot(compare.cars, aes(x=euc.dist.standardized, mah.dist.standardized)) + | |
geom_point(aes(color = mah.euc)) + | |
xlab('Euclidean distance (standardized)') + | |
ylab('Mahalanobis distance (standardized)') + | |
scale_color_gradient2(name="Mahal. : Euclid.") + | |
geom_abline(intercept=0, slope=1) + | |
annotate("text", x=-0.5, y=1.25, label="Mahalanobis > Euclidean distance", color=scales::muted("blue")) + | |
annotate("text", x=1.5, y=-0.5, label="Euclidean > Mahalanobis distance", color=scales::muted("red")) + | |
NULL | |
``` | |
## Mahalanobis:Euclid #2 | |
```{r} | |
# compare distance metrics | |
ggplot(compare.cars, aes(x=displ, y=hwy)) + | |
geom_point(aes(color = mah.euc)) + | |
scale_color_gradient2() + | |
geom_point(data=my.car, color="red", size=2) + | |
geom_text(data=my.car, label="dream car!", color="red", vjust=-1) + | |
labs(color = "Mahal. : Euclid.") + | |
xlab("Displacement") + ylab("Gas mileage (highway)") + | |
NULL | |
``` |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment