yuu-ito · June 29, 2016 01:29
diff --git a/_共起尺度.md b/_共起尺度.md
diff --git a/proxy_package_test.R b/proxy_package_test.R
 # Reference:
 # http://langstat.blogspot.jp/2012/01/r.html

 # One-time setup: install.packages("proxy")
 library(proxy)

 # List every proximity measure registered in pr_DB.
 summary(pr_DB)
 # * Similarity measures:
 #   Braun-Blanquet, Chi-squared, correlation, cosine, Cramer, Dice, eJaccard, Fager, Faith, fJaccard, Gower, Hamman, Jaccard,
 # Kulczynski1, Kulczynski2, Michael, Mountford, Mozley, Ochiai, Pearson, Phi, Phi-squared, Russel, simple matching, Simpson,
 # Stiles, Tanimoto, Tschuprow, Yule, Yule2
 #
 # * Distance measures:
 #   Bhjattacharyya, Bray, Canberra, Chord, divergence, Euclidean, Geodesic, Hellinger, Kullback, Levenshtein, Mahalanobis,
 # Manhattan, Minkowski, Podani, Soergel, supremum, Wave, Whittaker

 # Look up the registry entry of a single measure to see its details.
 pr_DB$get_entry("Jaccard")
 #       names Jaccard, binary, Reyssac, Roux
 #         FUN R_bjaccard
 #    distance FALSE
 #      PREFUN pr_Jaccard_prefun
 #     POSTFUN NA
 #     convert pr_simil2dist
 #        type binary
 #        loop FALSE
 #       C_FUN TRUE
 #     PACKAGE proxy
 #        abcd FALSE
 #     formula a / (a + b + c)
 #   reference Jaccard, P. (1908). Nouvelles recherches sur la distribution florale. Bull. Soc. Vaud. Sci. Nat., 44, pp.
 #             223--270.
 # description The Jaccard Similarity (C implementation) for binary data. It is the proportion of (TRUE, TRUE) pairs, but not
 #             considering (FALSE, FALSE) pairs. So it compares the intersection with the union of object sets.

 # Same lookup for the other two measures used further down in this script.
 pr_DB$get_entry("cosine")
 pr_DB$get_entry("Simpson")

 # Working sample: the first five rows and first four columns of iris.
 dat <- iris[seq_len(5), seq_len(4)]
 dat
 #   Sepal.Length Sepal.Width Petal.Length Petal.Width
 # 1          5.1         3.5          1.4         0.2
 # 2          4.9         3.0          1.4         0.2
 # 3          4.7         3.2          1.3         0.2
 # 4          4.6         3.1          1.5         0.2
 # 5          5.0         3.6          1.4         0.2

 # Pairwise distances between the rows of dat (the recorded values are the
 # Euclidean distances).
 dist(dat)
 #           1         2         3         4
 # 2 0.5385165
 # 3 0.5099020 0.3000000
 # 4 0.6480741 0.3316625 0.2449490
 # 5 0.1414214 0.6082763 0.5099020 0.6480741

 # Cosine similarity between every pair of rows of dat.
 simil(dat, method = "cosine")
 #           1         2         3         4
 # 2 0.9985792
 # 3 0.9999873 0.9987915
 # 4 0.9991006 0.9987939 0.9992170
 # 5 0.9997577 0.9972408 0.9996681 0.9987170

 # Verify the cosine-similarity formula by hand on two small vectors.
 # http://wikiwiki.jp/cattail/?%CE%E0%BB%F7%C5%D9%A4%C8%B5%F7%CE%A5
 x <- c(2, 3, 1)
 y <- c(4, 6, 1)
 # Dot product divided by the product of the two Euclidean norms. The
 # intermediates live inside local() so only `cos` is added to the workspace.
 # Note: the result is stored under the same name as the base function cos().
 cos <- local({
   dot_product <- sum(x * y)
   norm_x <- sqrt(sum(x^2))
   norm_y <- sqrt(sum(y^2))
   dot_product / (norm_x * norm_y)
 })
 cos
 # [1] 0.9912012
 # Cosine similarity of two vectors.
 #
 # Args:
 #   x, y: vectors of the same length.
 # Returns:
 #   sum(x * y) divided by the product of the Euclidean norms of x and y.
 #   The result is NaN when either vector has zero norm (0 / 0).
 # Raises:
 #   An error when x and y differ in length; without this guard R would
 #   silently recycle the shorter vector and return a meaningless value.
 cos_simil <- function(x, y) {
   if (length(x) != length(y)) {
     stop("x and y must have the same length", call. = FALSE)
   }
   # sum(x * y) is the dot product; it can also be written as x %*% y
   sum(x * y) / (sqrt(sum(x^2)) * sqrt(sum(y^2)))
 }

 # simil(dat, ...) compares the rows of dat (its result is a 5 x 5 triangle for
 # the 5 rows), so the manual check must use rows 1 and 2, not columns.
 # Using the columns dat[, 1] and dat[, 2] instead yields 0.9986185, which is
 # the similarity between Sepal.Length and Sepal.Width and therefore differs
 # slightly from the simil() matrix -- the formula itself is the same.
 a <- unlist(dat[1, ], use.names = FALSE)
 b <- unlist(dat[2, ], use.names = FALSE)
 cos_simil(a, b) # equals the (2, 1) entry of simil(dat, method = "cosine")
 # [1] 0.9985792


 # The plain Jaccard measure is for binary data, so it stays disabled here:
 # simil(dat, method = "Jaccard")

 # eJaccard is the variant for real-valued data.
 simil(dat, method = "eJaccard")
 #           1         2         3         4
 # 2 0.9923240
 # 3 0.9930276 0.9973973
 # 4 0.9886086 0.9967732 0.9982138
 # 5 0.9995015 0.9901779 0.9929995 0.9885621

 # Simpson similarity; every pair of rows scores 1 on this sample.
 # NOTE(review): presumably because Simpson is a binary measure and dat has no
 # zero entries -- confirm with pr_DB$get_entry("Simpson").
 simil(dat, method = "Simpson")
 #    1  2  3  4
 # 2  1
 # 3  1  1
 # 4  1  1  1
 # 5  1  1  1  1
	# Reference:
	# http://langstat.blogspot.jp/2012/01/r.html

	# One-time setup: install.packages("proxy")
	library(proxy)

	# List every proximity measure registered in pr_DB.
	summary(pr_DB)
	# * Similarity measures:
	#   Braun-Blanquet, Chi-squared, correlation, cosine, Cramer, Dice, eJaccard, Fager, Faith, fJaccard, Gower, Hamman, Jaccard,
	# Kulczynski1, Kulczynski2, Michael, Mountford, Mozley, Ochiai, Pearson, Phi, Phi-squared, Russel, simple matching, Simpson,
	# Stiles, Tanimoto, Tschuprow, Yule, Yule2
	#
	# * Distance measures:
	#   Bhjattacharyya, Bray, Canberra, Chord, divergence, Euclidean, Geodesic, Hellinger, Kullback, Levenshtein, Mahalanobis,
	# Manhattan, Minkowski, Podani, Soergel, supremum, Wave, Whittaker

	# Look up the registry entry of a single measure to see its details.
	pr_DB$get_entry("Jaccard")
	#       names Jaccard, binary, Reyssac, Roux
	#         FUN R_bjaccard
	#    distance FALSE
	#      PREFUN pr_Jaccard_prefun
	#     POSTFUN NA
	#     convert pr_simil2dist
	#        type binary
	#        loop FALSE
	#       C_FUN TRUE
	#     PACKAGE proxy
	#        abcd FALSE
	#     formula a / (a + b + c)
	#   reference Jaccard, P. (1908). Nouvelles recherches sur la distribution florale. Bull. Soc. Vaud. Sci. Nat., 44, pp.
	#             223--270.
	# description The Jaccard Similarity (C implementation) for binary data. It is the proportion of (TRUE, TRUE) pairs, but not
	#             considering (FALSE, FALSE) pairs. So it compares the intersection with the union of object sets.

	# Same lookup for the other two measures used further down in this script.
	pr_DB$get_entry("cosine")
	pr_DB$get_entry("Simpson")

	# Working sample: the first five rows and first four columns of iris.
	dat <- iris[seq_len(5), seq_len(4)]
	dat
	#   Sepal.Length Sepal.Width Petal.Length Petal.Width
	# 1          5.1         3.5          1.4         0.2
	# 2          4.9         3.0          1.4         0.2
	# 3          4.7         3.2          1.3         0.2
	# 4          4.6         3.1          1.5         0.2
	# 5          5.0         3.6          1.4         0.2

	# Pairwise distances between the rows of dat (the recorded values are the
	# Euclidean distances).
	dist(dat)
	#           1         2         3         4
	# 2 0.5385165
	# 3 0.5099020 0.3000000
	# 4 0.6480741 0.3316625 0.2449490
	# 5 0.1414214 0.6082763 0.5099020 0.6480741

	# Cosine similarity between every pair of rows of dat.
	simil(dat, method = "cosine")
	#           1         2         3         4
	# 2 0.9985792
	# 3 0.9999873 0.9987915
	# 4 0.9991006 0.9987939 0.9992170
	# 5 0.9997577 0.9972408 0.9996681 0.9987170

	# Verify the cosine-similarity formula by hand on two small vectors.
	# http://wikiwiki.jp/cattail/?%CE%E0%BB%F7%C5%D9%A4%C8%B5%F7%CE%A5
	x <- c(2, 3, 1)
	y <- c(4, 6, 1)
	# Dot product divided by the product of the two Euclidean norms. The `*`
	# operators are required: without them `xy` is an undefined name and
	# `sqrt(...)sqrt(...)` does not parse.
	# Note: the result is stored under the same name as the base function cos().
	cos <- sum(x * y) / (sqrt(sum(x^2)) * sqrt(sum(y^2)))
	(cos)
	# [1] 0.9912012
	# Cosine similarity of two vectors.
	#
	# Args:
	#   x, y: vectors of the same length.
	# Returns:
	#   sum(x * y) divided by the product of the Euclidean norms of x and y.
	#   The result is NaN when either vector has zero norm (0 / 0).
	# Raises:
	#   An error when x and y differ in length; without this guard R would
	#   silently recycle the shorter vector and return a meaningless value.
	cos_simil <- function(x, y) {
	  if (length(x) != length(y)) {
	    stop("x and y must have the same length", call. = FALSE)
	  }
	  # sum(x * y) is the dot product; it can also be written as x %*% y
	  sum(x * y) / (sqrt(sum(x^2)) * sqrt(sum(y^2)))
	}

	# simil(dat, ...) compares the rows of dat (its result is a 5 x 5 triangle for
	# the 5 rows), so the manual check must use rows 1 and 2, not columns.
	# Using the columns dat[, 1] and dat[, 2] instead yields 0.9986185, which is
	# the similarity between Sepal.Length and Sepal.Width and therefore differs
	# slightly from the simil() matrix -- the formula itself is the same.
	a <- unlist(dat[1, ], use.names = FALSE)
	b <- unlist(dat[2, ], use.names = FALSE)
	cos_simil(a, b) # equals the (2, 1) entry of simil(dat, method = "cosine")
	# [1] 0.9985792


	# The plain Jaccard measure is for binary data, so it stays disabled here:
	# simil(dat, method = "Jaccard")

	# eJaccard is the variant for real-valued data.
	simil(dat, method = "eJaccard")
	#           1         2         3         4
	# 2 0.9923240
	# 3 0.9930276 0.9973973
	# 4 0.9886086 0.9967732 0.9982138
	# 5 0.9995015 0.9901779 0.9929995 0.9885621

	# Simpson similarity; every pair of rows scores 1 on this sample.
	# NOTE(review): presumably because Simpson is a binary measure and dat has no
	# zero entries -- confirm with pr_DB$get_entry("Simpson").
	simil(dat, method = "Simpson")
	#    1  2  3  4
	# 2  1
	# 3  1  1
	# 4  1  1  1
	# 5  1  1  1  1