|
# 参考 |
|
# http://langstat.blogspot.jp/2012/01/r.html |
|
|
|
# One-time setup (run manually if the package is not installed yet):
# install.packages("proxy")
library("proxy")

# List every proximity measure registered in the pr_DB registry.
# The names below are reproduced exactly as printed.
summary(pr_DB)
# * Similarity measures:
# Braun-Blanquet, Chi-squared, correlation, cosine, Cramer, Dice, eJaccard,
# Fager, Faith, fJaccard, Gower, Hamman, Jaccard, Kulczynski1, Kulczynski2,
# Michael, Mountford, Mozley, Ochiai, Pearson, Phi, Phi-squared, Russel,
# simple matching, Simpson, Stiles, Tanimoto, Tschuprow, Yule, Yule2
#
# * Distance measures:
# Bhjattacharyya, Bray, Canberra, Chord, divergence, Euclidean, Geodesic,
# Hellinger, Kullback, Levenshtein, Mahalanobis, Manhattan, Minkowski, Podani,
# Soergel, supremum, Wave, Whittaker
|
|
|
# Look up a single measure in the registry to see its aliases, formula,
# data type and literature reference.
pr_DB$get_entry("Jaccard")
#       names Jaccard, binary, Reyssac, Roux
#         FUN R_bjaccard
#    distance FALSE
#      PREFUN pr_Jaccard_prefun
#     POSTFUN NA
#     convert pr_simil2dist
#        type binary
#        loop FALSE
#       C_FUN TRUE
#     PACKAGE proxy
#        abcd FALSE
#     formula a / (a + b + c)
#   reference Jaccard, P. (1908). Nouvelles recherches sur la distribution
#             florale. Bull. Soc. Vaud. Sci. Nat., 44, pp. 223--270.
# description The Jaccard Similarity (C implementation) for binary data. It is
#             the proportion of (TRUE, TRUE) pairs, but not considering
#             (FALSE, FALSE) pairs. So it compares the intersection with the
#             union of object sets.

# Same lookup for two more measures (output not recorded here).
pr_DB$get_entry("cosine")
pr_DB$get_entry("Simpson")
|
|
|
# Working sample: the first five rows and first four columns of iris.
dat <- iris[seq_len(5), seq_len(4)]
dat
#   Sepal.Length Sepal.Width Petal.Length Petal.Width
# 1          5.1         3.5          1.4         0.2
# 2          4.9         3.0          1.4         0.2
# 3          4.7         3.2          1.3         0.2
# 4          4.6         3.1          1.5         0.2
# 5          5.0         3.6          1.4         0.2

# Pairwise distances between the rows of dat; the values below are the
# Euclidean distances (e.g. rows 1 and 2: sqrt(0.2^2 + 0.5^2) = 0.5385165).
dist(dat)
#           1         2         3         4
# 2 0.5385165
# 3 0.5099020 0.3000000
# 4 0.6480741 0.3316625 0.2449490
# 5 0.1414214 0.6082763 0.5099020 0.6480741
|
|
|
# Cosine similarity between every pair of rows of dat.
simil(dat, method = "cosine")
#           1         2         3         4
# 2 0.9985792
# 3 0.9999873 0.9987915
# 4 0.9991006 0.9987939 0.9992170
# 5 0.9997577 0.9972408 0.9996681 0.9987170
|
|
|
# Check the cosine-similarity formula by hand:
#   sum(x * y) / (sqrt(sum(x^2)) * sqrt(sum(y^2)))
# Formula reference:
# http://wikiwiki.jp/cattail/?%CE%E0%BB%F7%C5%D9%A4%C8%B5%F7%CE%A5
x <- c(2, 3, 1)
y <- c(4, 6, 1)
# NOTE: this binds a number to the name `cos`; calls such as cos(1) still reach
# the base function because R skips non-function bindings in call position.
cos <- local({
  dot_xy <- sum(x * y)
  norm_x <- sqrt(sum(x^2))
  norm_y <- sqrt(sum(y^2))
  dot_xy / (norm_x * norm_y)
})
cos
# [1] 0.9912012
|
# Cosine similarity of two numeric vectors.
#
# Args:
#   x, y: numeric vectors with the same number of elements. One-row (or
#         one-column) data frames of numeric columns are accepted too; they
#         are flattened with unlist() first.
#
# Returns:
#   A single number: sum(x * y) / (sqrt(sum(x^2)) * sqrt(sum(y^2))).
#   The result is NaN (0 / 0) when either input contains only zeros or both
#   inputs are empty.
#
# Raises:
#   An error when x and y do not have the same number of elements. Without
#   this guard R would silently recycle the shorter input and return a
#   meaningless value.
cos_simil <- function(x, y) {
  x <- unlist(x, use.names = FALSE)
  y <- unlist(y, use.names = FALSE)
  if (length(x) != length(y)) {
    stop("x and y must have the same number of elements", call. = FALSE)
  }
  # sum(x * y) can also be written as x %*% y
  sum(x * y) / (sqrt(sum(x^2)) * sqrt(sum(y^2)))
}
|
|
|
# Cross-check cos_simil() against simil(dat, method = "cosine").
# simil() compares the ROWS of dat, so the vectors to compare are rows 1 and 2.
# Taking columns instead (dat[, 1] and dat[, 2]) compares Sepal.Length with
# Sepal.Width and yields 0.9986185, which is why the result used to look
# "slightly off" -- the formula itself is the same.
a <- unlist(dat[1, ], use.names = FALSE)
b <- unlist(dat[2, ], use.names = FALSE)
cos_simil(a, b) # equals the (2, 1) entry of simil(dat, method = "cosine")
# [1] 0.9985792
|
|
|
|
|
# Jaccard-type similarity between the rows of dat.
# The plain "Jaccard" measure is for binary data, so it is left disabled:
# simil(dat, method = "Jaccard")
# "eJaccard" is the variant for real-valued data.
simil(dat, method = "eJaccard")
#           1         2         3         4
# 2 0.9923240
# 3 0.9930276 0.9973973
# 4 0.9886086 0.9967732 0.9982138
# 5 0.9995015 0.9901779 0.9929995 0.9885621
|
|
|
# Simpson similarity between the rows of dat.
# NOTE(review): every entry is 1 -- presumably because Simpson is a measure
# for binary data and dat contains no zero values; confirm with
# pr_DB$get_entry("Simpson").
simil(dat, method = "Simpson")
#   1 2 3 4
# 2 1
# 3 1 1
# 4 1 1 1
# 5 1 1 1 1