#%% DATA IMPORTING AND VISUALIZATION
from urllib2 import urlopen
from contextlib import closing
from os.path import basename
def download_file(url, name=None):
    with closing(urlopen(url)) as u, open(name if name else basename(url), 'w') as f:
        f.write(u.read())
download_file('http://aima.cs.berkeley.edu/data/iris.csv')
download_file('http://networkdata.ics.uci.edu/data/lesmis/lesmis.gml')
from numpy import genfromtxt, zeros
# read the first 4 columns
data = genfromtxt('iris.csv', delimiter=',', usecols=(0, 1, 2, 3))
# read the fifth column
target = genfromtxt('iris.csv', delimiter=',', usecols=(4,), dtype=str)
print data.shape
#(150, 4)
print target.shape
#(150,)
print set(target) # build a collection of unique elements
# set(['setosa', 'versicolor', 'virginica'])
#%% Scatter plot:
from pylab import plot, show
plot(data[target == 'setosa', 0], data[target == 'setosa', 2], 'bo')
plot(data[target == 'versicolor', 0], data[target == 'versicolor', 2], 'ro')
plot(data[target == 'virginica', 0], data[target == 'virginica', 2], 'go')
show(block=False)
#%% Histograms:
from pylab import figure, subplot, hist, xlim, show
xmin = min(data[:, 0])
xmax = max(data[:, 0])
figure()
subplot(411) # distribution of the setosa class (1st, on the top)
hist(data[target == 'setosa', 0], color='b', alpha=.7)
xlim(xmin, xmax)
subplot(412) # distribution of the versicolor class (2nd)
hist(data[target == 'versicolor', 0], color='r', alpha=.7)
xlim(xmin, xmax)
subplot(413) # distribution of the virginica class (3rd)
hist(data[target == 'virginica', 0], color='g', alpha=.7)
xlim(xmin, xmax)
subplot(414) # global histogram (4th, on the bottom)
hist(data[:, 0], color='y', alpha=.7)
xlim(xmin, xmax)
show(block=False)
#%% Classification:
t = zeros(len(target))
t[target == 'setosa'] = 1
t[target == 'versicolor'] = 2
t[target == 'virginica'] = 3
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(data, t) # training on the iris dataset
# verify one value:
print classifier.predict(data[0]), t[0]
# the whole lot:
print classifier.predict(data) - t
print classifier.score(data, t)
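# A quick sanity check (added here as an illustration): for classifiers,
# score() is the mean accuracy, so it should match the fraction of correct
# predictions computed by hand:
from numpy import mean
print mean(classifier.predict(data) == t)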
#%% Verifying classifier:
# It is important to evaluate the classifier on a wider range of samples and to
# test it with data not used in the training process. To this end we split the
# data into a train set and a test set, picking samples at random from the
# original dataset. We use the first set to train the classifier and the second
# one to test it.
from sklearn import cross_validation
train, test, t_train, t_test = cross_validation.train_test_split(
    data, t, test_size=0.4, random_state=0)
classifier.fit(train, t_train) # train
print classifier.score(test, t_test) # test
# 0.933333333333 = 93.33% accuracy
#%% Confusion Matrix:
# Another tool to estimate the performance of a classifier is the confusion
# matrix. Since we pass the predictions as the first argument below, each row
# of the matrix represents the instances in a predicted class, while each
# column represents the instances in an actual class. Using the metrics module
# it is pretty easy to compute and print the matrix:
from sklearn.metrics import confusion_matrix
print confusion_matrix(classifier.predict(test), t_test)
# [[16 0 0]
# [ 0 23 4]
# [ 0 0 17]]
# A function that gives us a complete report on the performance of the
# classifier is also available:
from sklearn.metrics import classification_report
print classification_report(classifier.predict(test), t_test,
                            target_names=['setosa', 'versicolor', 'virginica'])
# Precision: the proportion of the predicted positive cases that were correct
# Recall (also known as the true positive rate): the proportion of positive
# cases that were correctly identified
# F1-Score: the harmonic mean of precision and recall
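# To make these definitions concrete, here is a small sketch (added as an
# illustration, not part of the original gist) that recomputes precision and
# recall per class directly from the confusion matrix. With the argument order
# used above (predictions passed first), rows are predicted classes and columns
# are actual classes. Note that classification_report expects the true labels
# as its first argument, so the report above effectively swaps the two measures.
cm = confusion_matrix(classifier.predict(test), t_test)
for i, name in enumerate(['setosa', 'versicolor', 'virginica']):
    precision = float(cm[i, i]) / cm[i, :].sum()  # correct / all predicted as this class
    recall = float(cm[i, i]) / cm[:, i].sum()     # correct / all samples actually in this class
    print name, precision, recall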
#%% Cross Validation:
# The support is just the number of elements of the given class used in the
# test set. However, by splitting the data we reduce the number of samples
# available for training, and the results of the evaluation may depend on the
# particular random choice of the (train set, test set) pair. To properly
# evaluate a classifier and compare it with others, we have to use a more
# sophisticated evaluation model like cross validation. The idea behind the
# model is simple: the data is split into train and test sets several
# consecutive times, and the averaged value of the prediction scores obtained
# with the different sets is the evaluation of the classifier. This time,
# sklearn provides us with a function to run the model:
from sklearn.cross_validation import cross_val_score
# cross validation with 6 iterations
scores = cross_val_score(classifier, data, t, cv=6)
print scores
# [ 0.84 0.96 1. 1. 1. 0.96]
from numpy import mean
print mean(scores)
# 0.96
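# cross_val_score hides the splitting loop; the following is a minimal sketch
# of the same idea using KFold from the same (old) cross_validation module.
# The shuffling and random_state here are my own choices, and cross_val_score
# uses stratified folds for classifiers, so the numbers will not match exactly:
from sklearn.cross_validation import KFold
fold_scores = []
for train_idx, test_idx in KFold(len(data), n_folds=6, shuffle=True, random_state=0):
    classifier.fit(data[train_idx], t[train_idx])   # train on 5/6 of the data
    fold_scores.append(classifier.score(data[test_idx], t[test_idx]))  # test on the rest
print mean(fold_scores)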
#%% CLUSTERING (unsupervised learning):
from sklearn.cluster import KMeans
# n_clusters is the number of clusters k to look for:
kmeans = KMeans(n_clusters=3, init='random') # initialization
kmeans.fit(data) # actual execution
c = kmeans.predict(data)
from sklearn.metrics import completeness_score, homogeneity_score
print completeness_score(t, c)
print homogeneity_score(t, c)
figure()
subplot(211) # top figure with the real classes
plot(data[t == 1, 0], data[t == 1, 2], 'bo')
plot(data[t == 2, 0], data[t == 2, 2], 'ro')
plot(data[t == 3, 0], data[t == 3, 2], 'go')
subplot(212) # bottom figure with classes assigned automatically
plot(data[c == 1, 0], data[c == 1, 2], 'bo', alpha=.7)
plot(data[c == 2, 0], data[c == 2, 2], 'go', alpha=.7)
plot(data[c == 0, 0], data[c == 0, 2], 'mo', alpha=.7)
show(block=False)
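# Note (added for clarity): the cluster labels assigned by KMeans (0, 1, 2) are
# arbitrary and do not have to line up with the class codes in t (1, 2, 3),
# which is why the colors in the two subplots may not correspond. A quick look
# at what the algorithm actually learned:
print kmeans.cluster_centers_  # coordinates of the three centroids in feature space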
#%% REGRESSION:
# In order to apply linear regression we build a synthetic dataset where the
# dependent variable is (roughly) a cubic function of the explanatory variable
# plus some noise:
from numpy.random import rand, seed
seed(0)
x = rand(40, 1) # explanatory variable
y = x * x * x + rand(40, 1) / 5 # dependent variable
from sklearn.linear_model import LinearRegression
linreg = LinearRegression()
linreg.fit(x, y)
from numpy import linspace, matrix
xx = linspace(0, 1, 40)
figure()
plot(x, y, 'o', xx, linreg.predict(matrix(xx).T), '--r')
show(block=False)
# Evaluate accuracy:
from sklearn.metrics import mean_squared_error
print mean_squared_error(linreg.predict(x), y)
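# The fitted model is just a straight line; printing its parameters (added as
# an illustration, not part of the original gist):
print linreg.coef_, linreg.intercept_  # slope and intercept of the fitted line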
#%% CORRELATION:
from numpy import corrcoef
corr = corrcoef(data.T) # .T gives the transpose
print corr
from pylab import pcolor, colorbar, xticks, yticks
from numpy import arange
figure()
pcolor(corr)
colorbar() # add a colorbar
# arranging the names of the variables on the axes
xticks(arange(0.5, 4.5),
       ['sepal length', 'sepal width', 'petal length', 'petal width'],
       rotation=-20)
yticks(arange(0.5, 4.5),
       ['sepal length', 'sepal width', 'petal length', 'petal width'],
       rotation=-20)
show(block=False)
#%% DIMENSIONALITY REDUCTION:
# using Principal Component Analysis (PCA)
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pcad = pca.fit_transform(data)
figure()
plot(pcad[target == 'setosa', 0], pcad[target == 'setosa', 1], 'bo')
plot(pcad[target == 'versicolor', 0], pcad[target == 'versicolor', 1], 'ro')
plot(pcad[target == 'virginica', 0], pcad[target == 'virginica', 1], 'go')
show(block=False)
# PCA projects the data into a space where the variance is maximized, and we
# can determine how much information is stored in each principal component by
# looking at the explained variance ratio:
print pca.explained_variance_ratio_
# We can also print how much information we lost during the transformation
# process:
print 1 - sum(pca.explained_variance_ratio_)
# Apply the inverse transformation to map the reduced data back to the original space:
data_inv = pca.inverse_transform(pcad)
# Data loss:
print abs(sum(sum(data - data_inv)))
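# The signed differences above can cancel each other out; a sketch of a more
# informative figure (my addition, not from the original) is the mean squared
# reconstruction error:
print ((data - data_inv) ** 2).mean()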
# Try with different number of components:
for i in range(1, 5):
    pca = PCA(n_components=i)
    pca.fit(data)
    print sum(pca.explained_variance_ratio_) * 100, '%'
#%% NETWORK MINING:
import networkx as nx
G = nx.read_gml('lesmis.gml', relabel=True)
figure()
nx.draw(G, node_size=0, edge_color='b', alpha=.2, font_size=7)
deg = nx.degree(G)
from numpy import percentile, mean, median
print min(deg.values())
print percentile(deg.values(), 25) # computes the 1st quartile
print median(deg.values())
print percentile(deg.values(), 75) # computes the 3rd quartile
print max(deg.values())
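# An illustrative addition: the five most connected characters by degree
# (in this version of networkx, nx.degree returns a dict of node -> degree):
print sorted(deg.items(), key=lambda item: item[1], reverse=True)[:5]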
# Keep only the nodes with a degree higher than 10:
Gt = G.copy()
dn = nx.degree(Gt)
for n in Gt.nodes():
    if dn[n] <= 10:
        Gt.remove_node(n)
figure()
nx.draw(Gt, node_size=0, edge_color='b', alpha=.2, font_size=12)
show(block=False)
#%% Finding Cliques:
# A clique is a group of nodes in which every node is connected to all the others.
from networkx import find_cliques
cliques = list(find_cliques(G))
print max(cliques, key=len)