Fixed code from http://refcardz.dzone.com/refcardz/data-mining-discovering-and
#%% DATA IMPORTING AND VISUALIZATION
from urllib2 import urlopen
from contextlib import closing
from os.path import basename
def download_file(url, name=None):
    # write in binary mode so the file is saved byte-for-byte
    with closing(urlopen(url)) as u, open(name if name else basename(url), 'wb') as f:
        f.write(u.read())
download_file('http://aima.cs.berkeley.edu/data/iris.csv')
download_file('http://networkdata.ics.uci.edu/data/lesmis/lesmis.gml')
from numpy import genfromtxt, zeros
# read the first 4 columns
data = genfromtxt('iris.csv', delimiter=',', usecols=(0, 1, 2, 3))
# read the fifth column
target = genfromtxt('iris.csv', delimiter=',', usecols=(4,), dtype=str)
print data.shape
# (150, 4)
print target.shape
# (150,)
print set(target)  # build a collection of unique elements
# set(['setosa', 'versicolor', 'virginica'])
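# A quick count of the samples per class (a standard-library sketch, not part
# of the original refcard); each of the three classes has 50 samples:
from collections import Counter
print Counter(target)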
#%% Scatter plot:
from pylab import plot, show
plot(data[target == 'setosa', 0], data[target == 'setosa', 2], 'bo')
plot(data[target == 'versicolor', 0], data[target == 'versicolor', 2], 'ro')
plot(data[target == 'virginica', 0], data[target == 'virginica', 2], 'go')
show(block=False)
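# Optional: label the axes of the scatter plot above (column 0 is the sepal
# length and column 2 the petal length in iris.csv); these pylab calls act on
# the current figure and are not part of the original refcard.
from pylab import xlabel, ylabel
xlabel('sepal length')
ylabel('petal length')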
#%% Histograms:
from pylab import figure, subplot, hist, xlim, show
xmin = min(data[:, 0])
xmax = max(data[:, 0])
figure()
subplot(411)  # distribution of the setosa class (1st, on the top)
hist(data[target == 'setosa', 0], color='b', alpha=.7)
xlim(xmin, xmax)
subplot(412)  # distribution of the versicolor class (2nd)
hist(data[target == 'versicolor', 0], color='r', alpha=.7)
xlim(xmin, xmax)
subplot(413)  # distribution of the virginica class (3rd)
hist(data[target == 'virginica', 0], color='g', alpha=.7)
xlim(xmin, xmax)
subplot(414)  # global histogram (4th, on the bottom)
hist(data[:, 0], color='y', alpha=.7)
xlim(xmin, xmax)
show(block=False)
#%% Classification:
t = zeros(len(target))
t[target == 'setosa'] = 1
t[target == 'versicolor'] = 2
t[target == 'virginica'] = 3
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(data, t)  # training on the iris dataset
# verify one value:
print classifier.predict(data[0]), t[0]
# the whole lot:
print classifier.predict(data) - t
print classifier.score(data, t)
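# The score above is the accuracy on the training data itself; a quick sanity
# check computes the same number by hand (not in the original refcard):
from numpy import mean
print mean(classifier.predict(data) == t)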
#%% Verifying classifier:
# It is important to evaluate the classifier on a wider range of samples and to
# test it with data not used in the training process. To this end we split the
# data into a train set and a test set, picking samples at random from the
# original dataset. We use the first set to train the classifier and the second
# one to test it.
from sklearn import cross_validation
train, test, t_train, t_test = cross_validation.train_test_split(
    data, t, test_size=0.4, random_state=0)
classifier.fit(train, t_train)  # train
print classifier.score(test, t_test)  # test
# 0.933333333333 = 93.33% accuracy
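# With test_size=0.4, 60% of the 150 samples (90) go to the training set and
# the remaining 40% (60) to the test set:
print train.shape, test.shape
# (90, 4) (60, 4)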
#%% Confusion Matrix:
# Another tool to estimate the performance of a classifier is the confusion
# matrix. With the argument order used below (predictions passed first), each
# row represents the instances in a predicted class, while each column
# represents the instances in an actual class; sklearn's own convention is
# confusion_matrix(y_true, y_pred), which would give the transposed matrix.
# Using the metrics module it is pretty easy to compute and print the matrix:
from sklearn.metrics import confusion_matrix
print confusion_matrix(classifier.predict(test), t_test)
# [[16  0  0]
#  [ 0 23  4]
#  [ 0  0 17]]
# A function that gives us a complete report on the performance of the
# classifier is also available:
from sklearn.metrics import classification_report
print classification_report(classifier.predict(test), t_test,
                            target_names=['setosa', 'versicolor', 'virginica'])
# (as above, the predictions are passed first; with the (y_true, y_pred) order
# the precision and recall columns would be swapped)
# Precision: the proportion of the predicted positive cases that were correct
# Recall (also called the true positive rate): the proportion of positive
# cases that were correctly identified
# F1-Score: the harmonic mean of precision and recall
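# For example, with precision P and recall R the F1-Score works out to
# (illustrative numbers, not taken from the report above):
P, R = 0.85, 0.92
print 2 * P * R / (P + R)
# ~0.8836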
#%% Cross Validation:
# The support is just the number of elements of the given class used for the
# test. However, by splitting the data we reduce the number of samples
# available for training, and the result of the evaluation may depend on the
# particular random choice of the (train set, test set) pair. To properly
# evaluate a classifier and compare it with others, we have to use a more
# sophisticated evaluation method such as cross validation. The idea behind it
# is simple: the data is split into train and test sets several consecutive
# times, and the average of the prediction scores obtained with the different
# splits is the evaluation of the classifier. sklearn provides a function to
# run this procedure for us:
from sklearn.cross_validation import cross_val_score
# cross validation with 6 iterations
scores = cross_val_score(classifier, data, t, cv=6)
print scores
# [ 0.84  0.96  1.    1.    1.    0.96]
from numpy import mean
print mean(scores)
# 0.96
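# A rough sketch of what cross_val_score does under the hood, using the
# era-appropriate KFold iterator (note that for classifiers cross_val_score
# actually uses stratified folds, so the exact numbers can differ):
from sklearn.cross_validation import KFold
fold_scores = []
for train_idx, test_idx in KFold(len(data), n_folds=6):
    classifier.fit(data[train_idx], t[train_idx])
    fold_scores.append(classifier.score(data[test_idx], t[test_idx]))
print mean(fold_scores)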
#%% CLUSTERING (unsupervised learning):
from sklearn.cluster import KMeans
# n_clusters is the number of clusters k:
kmeans = KMeans(n_clusters=3, init='random')  # initialization
kmeans.fit(data)  # actual execution
c = kmeans.predict(data)
from sklearn.metrics import completeness_score, homogeneity_score
print completeness_score(t, c)
print homogeneity_score(t, c)
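# Completeness is 1.0 when all members of a given class are assigned to the
# same cluster; homogeneity is 1.0 when each cluster contains only members of
# a single class.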
figure()
subplot(211)  # top figure with the real classes
plot(data[t == 1, 0], data[t == 1, 2], 'bo')
plot(data[t == 2, 0], data[t == 2, 2], 'ro')
plot(data[t == 3, 0], data[t == 3, 2], 'go')
subplot(212)  # bottom figure with the classes assigned automatically
plot(data[c == 1, 0], data[c == 1, 2], 'bo', alpha=.7)
plot(data[c == 2, 0], data[c == 2, 2], 'go', alpha=.7)
plot(data[c == 0, 0], data[c == 0, 2], 'mo', alpha=.7)
show(block=False)
#%% REGRESSION:
# To apply linear regression we build a synthetic dataset in which the
# dependent variable is a cubic function of the explanatory variable plus
# some uniform noise:
from numpy.random import rand, seed
seed(0)
x = rand(40, 1)  # explanatory variable
y = x * x * x + rand(40, 1) / 5  # dependent variable
from sklearn.linear_model import LinearRegression
linreg = LinearRegression()
linreg.fit(x, y)
from numpy import linspace, matrix
xx = linspace(0, 1, 40)
figure()
plot(x, y, 'o', xx, linreg.predict(matrix(xx).T), '--r')
show(block=False)
# Evaluate the accuracy of the fit:
from sklearn.metrics import mean_squared_error
print mean_squared_error(linreg.predict(x), y)
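# The fitted line itself can be inspected through the model's attributes
# (standard LinearRegression attributes; not in the original refcard):
print linreg.coef_, linreg.intercept_
print linreg.score(x, y)  # R^2 of the fit on the same data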
#%% CORRELATION:
from numpy import corrcoef
corr = corrcoef(data.T)  # .T gives the transpose
print corr
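# Each entry corr[i, j] is the Pearson correlation coefficient (between -1 and
# 1) of variables i and j; for example, sepal length vs petal length:
print corr[0, 2]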
from pylab import pcolor, colorbar, xticks, yticks
from numpy import arange
figure()
pcolor(corr)
colorbar()  # add the colorbar on the side
# arranging the names of the variables on the axes
labels = ['sepal length', 'sepal width', 'petal length', 'petal width']
xticks(arange(0.5, 4.5), labels, rotation=-20)
yticks(arange(0.5, 4.5), labels, rotation=-20)
show(block=False)
#%% DIMENSIONALITY REDUCTION:
# using Principal Component Analysis (PCA)
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pcad = pca.fit_transform(data)
figure()
plot(pcad[target == 'setosa', 0], pcad[target == 'setosa', 1], 'bo')
plot(pcad[target == 'versicolor', 0], pcad[target == 'versicolor', 1], 'ro')
plot(pcad[target == 'virginica', 0], pcad[target == 'virginica', 1], 'go')
show(block=False)
# PCA projects the data into a space where the variance is maximized, and we
# can determine how much information is stored in each principal component by
# looking at the explained variance ratio:
print pca.explained_variance_ratio_
# We can also print how much information we lost during the transformation:
print 1 - sum(pca.explained_variance_ratio_)
# Apply the inverse transformation to get an approximation of the original data:
data_inv = pca.inverse_transform(pcad)
# Data loss:
print abs(sum(sum(data - data_inv)))
# Try with a different number of components:
for i in range(1, 5):
    pca = PCA(n_components=i)
    pca.fit(data)
    print sum(pca.explained_variance_ratio_) * 100, '%'
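# The same loop can also report the reconstruction error, using the same
# abs-of-sum measure as above (a small extension, not in the original refcard):
for i in range(1, 5):
    pca = PCA(n_components=i)
    data_i = pca.inverse_transform(pca.fit_transform(data))
    print i, abs(sum(sum(data - data_i)))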
#%% NETWORK MINING:
import networkx as nx
G = nx.read_gml('lesmis.gml', relabel=True)
figure()
nx.draw(G, node_size=0, edge_color='b', alpha=.2, font_size=7)
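# A quick check of the size of the loaded graph (standard networkx calls):
print G.number_of_nodes(), G.number_of_edges()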
deg = nx.degree(G)
from numpy import percentile, mean, median
print min(deg.values())
print percentile(deg.values(), 25)  # computes the 1st quartile
print median(deg.values())
print percentile(deg.values(), 75)  # computes the 3rd quartile
print max(deg.values())
# keep only the nodes with a degree higher than 10:
Gt = G.copy()
dn = nx.degree(Gt)
for n in Gt.nodes():
    if dn[n] <= 10:
        Gt.remove_node(n)
figure()
nx.draw(Gt, node_size=0, edge_color='b', alpha=.2, font_size=12)
show(block=False)
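# A related network-mining measure is betweenness centrality; a short sketch
# finding the most central character (standard networkx call, not in the
# original refcard):
bc = nx.betweenness_centrality(G)
print max(bc, key=bc.get)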
#%% Finding Cliques:
# A clique is a group of nodes in which every node is connected to all the others.
from networkx import find_cliques
cliques = list(find_cliques(G))
print max(cliques, key=len)  # the largest clique
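# How many maximal cliques were found, and how large the biggest one is
# (a quick follow-up, not in the original refcard):
print len(cliques), len(max(cliques, key=len))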