Skip to content

Instantly share code, notes, and snippets.

@y-mitsui
Last active January 25, 2016 01:46
Show Gist options
  • Select an option

  • Save y-mitsui/0ea1317fe74eef14583b to your computer and use it in GitHub Desktop.

Select an option

Save y-mitsui/0ea1317fe74eef14583b to your computer and use it in GitHub Desktop.
多重コレスポンデンス分析ライブラリ(https://github.com/esafak/mca)の使用例
#! /usr/bin/env python
# -*- coding:utf-8 -*-
"""
多重コレスポンデンス分析ライブラリ(https://github.com/esafak/mca)の使用例
"""
import numpy as np
import numpy.random as random
import matplotlib.pyplot as plt
import pandas as pd
import mca
def sampling(num_sample):
# x:個体
# y1 = x + e, e ~ N(0,1)
# y2 = y1 + e, e ~ N(0,1)
# noise ~ N(0,1)
x = np.linspace(0,10,num_sample)
y1 = x + random.randn(num_sample)
y2 = y1 + random.randn(num_sample)
noise = random.randn(num_sample)
return (x,y1,y2,noise)
# sampling
x, y1, y2, noise = sampling(10000)
# dicsrite
sample = pd.DataFrame({"x":x, "y1":y1,"y2":y2, "noise":noise})
sample["x"] = sample["x"].apply(lambda row:0 if row < 3. else 1 if row < 6. else 2)
sample["y1"] = sample["y1"].apply(lambda row:int(row < 6.))
sample["y2"] = sample["y2"].apply(lambda row:int(row < 6.))
sample["noise"] = sample["noise"].apply(lambda row:int(row < 0.))
# 相互情報量を求める。必要であればこれを元に特徴選択をする
x_prob = sample.x.value_counts() / sample.x.shape[0]
print "================== Mutual Information of features =================="
for feature in ["y1","y2","noise"]:
cross_prob = pd.crosstab(sample.x, sample[feature]) / sample.x.shape[0]
feature_prob = sample[feature].value_counts() / sample[feature].shape[0]
mi = 0.
for x_idx in x_prob.index:
for feature_idx in feature_prob.index:
prob2d = 1e-5 if cross_prob.loc[x_idx,feature_idx]==0. else cross_prob.loc[x_idx,feature_idx]
mi += prob2d * np.log(prob2d / (x_prob[x_idx] * feature_prob[feature_idx]))
print "%s:%f"%(feature,mi)
print "===================================================================\n"
# make cross table
crossed = pd.DataFrame(columns=["y1_0","y1_1","y2_0","y2_1","noise_0","noise_1"])
for i in range(3):
row = [0] * 6
indivisual_sample = sample[sample.x==i]
stat = indivisual_sample.y1.value_counts()
row[0] = stat.get(0) if stat.get(0) != None else 0
row[1] = stat.get(1) if stat.get(1) != None else 0
stat = indivisual_sample.y2.value_counts()
row[2] = stat.get(0) if stat.get(0) != None else 0
row[3] = stat.get(1) if stat.get(1) != None else 0
stat = indivisual_sample.noise.value_counts()
row[4] = stat.get(0) if stat.get(0) != None else 0
row[5] = stat.get(1) if stat.get(1) != None else 0
df_addition_row = pd.DataFrame([row])
df_addition_row.columns =["y1_0","y1_1","y2_0","y2_1","noise_0","noise_1"]
df_addition_row.index =["indivisual" + str(i)]
crossed = pd.concat([crossed,df_addition_row],axis=0)
print "========================== Cross Table ============================"
print crossed
print "===================================================================\n"
# mca
mca_counts = mca.MCA(crossed,benzecri=False)
indivisuals = mca_counts.fs_r(N=2)
features = mca_counts.fs_c(N=2)
# plot. Be careful difference of x range and y range
# この例では実質的に第1軸目しか使われなかった為,第1軸目の距離だけに着目すれば良い
plt.scatter(indivisuals[:,0], indivisuals[:,1], c='g',marker='o')
labels = crossed.index
for label,x,y in zip(labels,indivisuals[:,0],indivisuals[:,1]):
plt.annotate(label,xy = (x, y))
plt.scatter(features[:,0], features[:,1], c='r',marker='^')
labels = crossed.columns
for label,x,y in zip(labels,features[:,0],features[:,1]):
plt.annotate(label,xy = (x, y))
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment