Fixed code from http://refcardz.dzone.com/refcardz/data-mining-discovering-and
#%% DATA IMPORTING AND VISUALIZATION
from urllib2 import urlopen
from contextlib import closing
from os.path import basename
def download_file(url, name=None):
    # write in binary mode so the file is saved byte-for-byte
    with closing(urlopen(url)) as u, open(name if name else basename(url), 'wb') as f:
        f.write(u.read())
download_file('http://aima.cs.berkeley.edu/data/iris.csv')
download_file('http://networkdata.ics.uci.edu/data/lesmis/lesmis.gml')
from numpy import genfromtxt, zeros
# read the first 4 columns
data = genfromtxt('iris.csv', delimiter=',', usecols=(0, 1, 2, 3))
# read the fifth column
target = genfromtxt('iris.csv', delimiter=',', usecols=(4,), dtype=str)
print data.shape
# (150, 4)
print target.shape
# (150,)
print set(target)  # build a collection of unique elements
# set(['setosa', 'versicolor', 'virginica'])
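# A quick count of the samples per class (a standard-library sketch, not part
# of the original refcard); each of the three classes has 50 samples:
from collections import Counter
print Counter(target)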
#%% Scatter plot:
from pylab import plot, show
plot(data[target == 'setosa', 0], data[target == 'setosa', 2], 'bo')
plot(data[target == 'versicolor', 0], data[target == 'versicolor', 2], 'ro')
plot(data[target == 'virginica', 0], data[target == 'virginica', 2], 'go')
show(block=False)
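# Optional: label the axes of the scatter plot above (column 0 is the sepal
# length and column 2 the petal length in iris.csv); these pylab calls act on
# the current figure and are not part of the original refcard.
from pylab import xlabel, ylabel
xlabel('sepal length')
ylabel('petal length')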
#%% Histograms:
from pylab import figure, subplot, hist, xlim, show
xmin = min(data[:, 0])
xmax = max(data[:, 0])
figure()
subplot(411)  # distribution of the setosa class (1st, on the top)
hist(data[target == 'setosa', 0], color='b', alpha=.7)
xlim(xmin, xmax)
subplot(412)  # distribution of the versicolor class (2nd)
hist(data[target == 'versicolor', 0], color='r', alpha=.7)
xlim(xmin, xmax)
subplot(413)  # distribution of the virginica class (3rd)
hist(data[target == 'virginica', 0], color='g', alpha=.7)
xlim(xmin, xmax)
subplot(414)  # global histogram (4th, on the bottom)
hist(data[:, 0], color='y', alpha=.7)
xlim(xmin, xmax)
show(block=False)
#%% Classification:
t = zeros(len(target))
t[target == 'setosa'] = 1
t[target == 'versicolor'] = 2
t[target == 'virginica'] = 3
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(data, t)  # training on the iris dataset
# verify one value:
print classifier.predict(data[0]), t[0]
# the whole lot:
print classifier.predict(data) - t
print classifier.score(data, t)
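# The score above is the accuracy on the training data itself; a quick sanity
# check computes the same number by hand (not in the original refcard):
from numpy import mean
print mean(classifier.predict(data) == t)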
#%% Verifying classifier:
# It is important to evaluate the classifier on a wider range of samples and to
# test it with data not used in the training process. To this end we split the
# data into a train set and a test set, picking samples at random from the
# original dataset. We use the first set to train the classifier and the second
# one to test it.
from sklearn import cross_validation
train, test, t_train, t_test = cross_validation.train_test_split(
    data, t, test_size=0.4, random_state=0)
classifier.fit(train, t_train)  # train
print classifier.score(test, t_test)  # test
# 0.933333333333 = 93.33% accuracy
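# With test_size=0.4, 60% of the 150 samples (90) go to the training set and
# the remaining 40% (60) to the test set:
print train.shape, test.shape
# (90, 4) (60, 4)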
#%% Confusion Matrix:
# Another tool to estimate the performance of a classifier is the confusion
# matrix. With the argument order used below (predictions passed first), each
# row represents the instances in a predicted class, while each column
# represents the instances in an actual class; sklearn's own convention is
# confusion_matrix(y_true, y_pred), which would give the transposed matrix.
# Using the metrics module it is pretty easy to compute and print the matrix:
from sklearn.metrics import confusion_matrix
print confusion_matrix(classifier.predict(test), t_test)
# [[16  0  0]
#  [ 0 23  4]
#  [ 0  0 17]]
# A function that gives us a complete report on the performance of the
# classifier is also available:
from sklearn.metrics import classification_report
print classification_report(classifier.predict(test), t_test,
                            target_names=['setosa', 'versicolor', 'virginica'])
# (as above, the predictions are passed first; with the (y_true, y_pred) order
# the precision and recall columns would be swapped)
# Precision: the proportion of the predicted positive cases that were correct
# Recall (also called the true positive rate): the proportion of positive
# cases that were correctly identified
# F1-Score: the harmonic mean of precision and recall
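# For example, with precision P and recall R the F1-Score works out to
# (illustrative numbers, not taken from the report above):
P, R = 0.85, 0.92
print 2 * P * R / (P + R)
# ~0.8836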
#%% Cross Validation:
# The support is just the number of elements of the given class used for the
# test. However, by splitting the data we reduce the number of samples
# available for training, and the result of the evaluation may depend on the
# particular random choice of the (train set, test set) pair. To properly
# evaluate a classifier and compare it with others, we have to use a more
# sophisticated evaluation method such as cross validation. The idea behind it
# is simple: the data is split into train and test sets several consecutive
# times, and the average of the prediction scores obtained with the different
# splits is the evaluation of the classifier. sklearn provides a function to
# run this procedure for us:
from sklearn.cross_validation import cross_val_score
# cross validation with 6 iterations
scores = cross_val_score(classifier, data, t, cv=6)
print scores
# [ 0.84  0.96  1.    1.    1.    0.96]
from numpy import mean
print mean(scores)
# 0.96
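# A rough sketch of what cross_val_score does under the hood, using the
# era-appropriate KFold iterator (note that for classifiers cross_val_score
# actually uses stratified folds, so the exact numbers can differ):
from sklearn.cross_validation import KFold
fold_scores = []
for train_idx, test_idx in KFold(len(data), n_folds=6):
    classifier.fit(data[train_idx], t[train_idx])
    fold_scores.append(classifier.score(data[test_idx], t[test_idx]))
print mean(fold_scores)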
#%% CLUSTERING (unsupervised learning):
from sklearn.cluster import KMeans
# n_clusters is the number of clusters k:
kmeans = KMeans(n_clusters=3, init='random')  # initialization
kmeans.fit(data)  # actual execution
c = kmeans.predict(data)
from sklearn.metrics import completeness_score, homogeneity_score
print completeness_score(t, c)
print homogeneity_score(t, c)
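# Completeness is 1.0 when all members of a given class are assigned to the
# same cluster; homogeneity is 1.0 when each cluster contains only members of
# a single class.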
figure()
subplot(211)  # top figure with the real classes
plot(data[t == 1, 0], data[t == 1, 2], 'bo')
plot(data[t == 2, 0], data[t == 2, 2], 'ro')
plot(data[t == 3, 0], data[t == 3, 2], 'go')
subplot(212)  # bottom figure with the classes assigned automatically
plot(data[c == 1, 0], data[c == 1, 2], 'bo', alpha=.7)
plot(data[c == 2, 0], data[c == 2, 2], 'go', alpha=.7)
plot(data[c == 0, 0], data[c == 0, 2], 'mo', alpha=.7)
show(block=False)
#%% REGRESSION:
# To apply linear regression we build a synthetic dataset in which the
# dependent variable is a cubic function of the explanatory variable plus
# some uniform noise:
from numpy.random import rand, seed
seed(0)
x = rand(40, 1)  # explanatory variable
y = x * x * x + rand(40, 1) / 5  # dependent variable
from sklearn.linear_model import LinearRegression
linreg = LinearRegression()
linreg.fit(x, y)
from numpy import linspace, matrix
xx = linspace(0, 1, 40)
figure()
plot(x, y, 'o', xx, linreg.predict(matrix(xx).T), '--r')
show(block=False)
# Evaluate the accuracy of the fit:
from sklearn.metrics import mean_squared_error
print mean_squared_error(linreg.predict(x), y)
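# The fitted line itself can be inspected through the model's attributes
# (standard LinearRegression attributes; not in the original refcard):
print linreg.coef_, linreg.intercept_
print linreg.score(x, y)  # R^2 of the fit on the same data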
#%% CORRELATION:
from numpy import corrcoef
corr = corrcoef(data.T)  # .T gives the transpose
print corr
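# Each entry corr[i, j] is the Pearson correlation coefficient (between -1 and
# 1) of variables i and j; for example, sepal length vs petal length:
print corr[0, 2]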
from pylab import pcolor, colorbar, xticks, yticks
from numpy import arange
figure()
pcolor(corr)
colorbar()  # add the colorbar on the side
# arranging the names of the variables on the axes
labels = ['sepal length', 'sepal width', 'petal length', 'petal width']
xticks(arange(0.5, 4.5), labels, rotation=-20)
yticks(arange(0.5, 4.5), labels, rotation=-20)
show(block=False)
#%% DIMENSIONALITY REDUCTION:
# using Principal Component Analysis (PCA)
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pcad = pca.fit_transform(data)
figure()
plot(pcad[target == 'setosa', 0], pcad[target == 'setosa', 1], 'bo')
plot(pcad[target == 'versicolor', 0], pcad[target == 'versicolor', 1], 'ro')
plot(pcad[target == 'virginica', 0], pcad[target == 'virginica', 1], 'go')
show(block=False)
# PCA projects the data into a space where the variance is maximized, and we
# can determine how much information is stored in each principal component by
# looking at the explained variance ratio:
print pca.explained_variance_ratio_
# We can also print how much information we lost during the transformation:
print 1 - sum(pca.explained_variance_ratio_)
# Apply the inverse transformation to get an approximation of the original data:
data_inv = pca.inverse_transform(pcad)
# Data loss:
print abs(sum(sum(data - data_inv)))
# Try with a different number of components:
for i in range(1, 5):
    pca = PCA(n_components=i)
    pca.fit(data)
    print sum(pca.explained_variance_ratio_) * 100, '%'
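# The same loop can also report the reconstruction error, using the same
# abs-of-sum measure as above (a small extension, not in the original refcard):
for i in range(1, 5):
    pca = PCA(n_components=i)
    data_i = pca.inverse_transform(pca.fit_transform(data))
    print i, abs(sum(sum(data - data_i)))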
#%% NETWORK MINING:
import networkx as nx
G = nx.read_gml('lesmis.gml', relabel=True)
figure()
nx.draw(G, node_size=0, edge_color='b', alpha=.2, font_size=7)
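# A quick check of the size of the loaded graph (standard networkx calls):
print G.number_of_nodes(), G.number_of_edges()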
deg = nx.degree(G)
from numpy import percentile, mean, median
print min(deg.values())
print percentile(deg.values(), 25)  # computes the 1st quartile
print median(deg.values())
print percentile(deg.values(), 75)  # computes the 3rd quartile
print max(deg.values())
# keep only the nodes with a degree higher than 10:
Gt = G.copy()
dn = nx.degree(Gt)
for n in Gt.nodes():
    if dn[n] <= 10:
        Gt.remove_node(n)
figure()
nx.draw(Gt, node_size=0, edge_color='b', alpha=.2, font_size=12)
show(block=False)
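# A related network-mining measure is betweenness centrality; a short sketch
# finding the most central character (standard networkx call, not in the
# original refcard):
bc = nx.betweenness_centrality(G)
print max(bc, key=bc.get)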
#%% Finding Cliques:
# A clique is a group of nodes in which every node is connected to all the others.
from networkx import find_cliques
cliques = list(find_cliques(G))
print max(cliques, key=len)  # the largest clique
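# How many maximal cliques were found, and how large the biggest one is
# (a quick follow-up, not in the original refcard):
print len(cliques), len(max(cliques, key=len))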