Ladsgroup · November 2, 2015 13:32
diff --git a/result b/result
 [0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1
 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 1 0 0 1 0 1
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0
 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 1 0
 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0
 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0]
diff --git a/source_code.py b/source_code.py
 import codecs
 import math
 import sklearn.cluster
 import matplotlib.pyplot as plt
 from collections import defaultdict
 from scipy.stats import halfnorm

 x = set()
 c = 0  # number of rows kept, counted before the set removes duplicates
 path = '/home/amir/sigclust/enwiki_data/data2.tsv'

 # Read the tab-separated feature file.  'True'/'False' cells become 1/0 and
 # every other cell is parsed as a float.  Rows whose last column is not 1 are
 # skipped; the first and last columns are dropped from the stored tuple.
 with codecs.open(path, 'r', 'utf-8') as f:
    for line in f:
        # codecs.open does no newline translation, so strip '\r' as well as
        # '\n'; otherwise a CRLF file leaves 'True\r' in the last cell.
        line = line.rstrip('\r\n')
        if not line:
            # float('') raises ValueError, so blank lines are ignored.
            continue
        features = []
        for feature in line.split('\t'):
            if feature == 'False':
                features.append(0)
            elif feature == 'True':
                features.append(1)
            else:
                features.append(float(feature))
        if features[-1] != 1:
            continue
        c += 1
        x.add(tuple(features[1:-1]))

 print(len(x))
 def mean_func(gen):
    """Return the arithmetic mean of the numbers yielded by ``gen``.

    ``gen`` may be a sequence or a one-shot iterator/generator.  A falsy
    argument (for example an empty list) and an iterable that yields
    nothing both give 0 instead of raising ZeroDivisionError.
    """
    if not gen:
        return 0
    total = 0
    count = 0
    for case in gen:
        count += 1
        total += case
    # A generator object is always truthy, so emptiness can only be detected
    # after iterating.
    if not count:
        return 0
    return total / float(count)

 def std(gen, mean=None):
    """Return the population standard deviation of the numbers in ``gen``.

    ``gen`` may be a sequence or a one-shot iterator/generator.  ``mean`` is
    the pre-computed mean of the values; when it is ``None`` the mean is
    computed here.  A mean of 0 is a valid value and is used as given.
    A falsy argument or an iterable that yields nothing gives 0.
    """
    if not gen:
        return 0
    # Materialise the values so a generator can be walked for both the mean
    # and the variance.
    values = list(gen)
    if not values:
        return 0
    count = float(len(values))
    if mean is None:
        mean = sum(values) / count
    variance = sum((value - mean) ** 2 for value in values)
    return math.sqrt(variance / count)

 # Group the values of every feature position across all rows:
 # x_for_scaling[i] is the list of values found at index i.
 x_for_scaling = {}
 for case in x:
    for i, value in enumerate(case):
        # Append in place; rebuilding the list for every row is quadratic.
        x_for_scaling.setdefault(i, []).append(value)

 # mean_and_std[i] = (mean, standard deviation) of feature i.
 mean_and_std = {}

 for i in x_for_scaling:
    mean = mean_func(x_for_scaling[i])
    std_var = std(x_for_scaling[i], mean)
    mean_and_std[i] = (mean, std_var)

 # Standardise every row: subtract the feature mean and divide by the
 # feature standard deviation.
 training_set = set()
 for case in x:
    new_case = []
    for i, value in enumerate(case):
        center, spread = mean_and_std[i]
        # A feature that is constant over all rows has zero spread; dividing
        # by it raises ZeroDivisionError, so divide by 1 and leave it at 0.
        new_case.append((value - center) / (spread or 1.0))
    training_set.add(tuple(new_case))

 # cost_function[n] is the mean distance from each sample to its closest
 # cluster centre when KMeans is run with n clusters; res_for_plot holds the
 # same values in order of n for plotting.
 cost_function = {}
 res_for_plot = []
 # Convert once, outside the loop: the conversion does not depend on n.
 training_set = list(training_set)
 for n in range(1, 12):
    classi = sklearn.cluster.KMeans(n_clusters=n)
    # fit_transform yields one row per sample holding its distance to each
    # of the n cluster centres, so the row minimum is the distance to the
    # centre the sample is closest to.
    res = classi.fit_transform(training_set)
    cost_temp = sum(min(distances) for distances in res)
    if n == 2:
        print(classi.labels_)
    cost_function[n] = cost_temp / len(res)
    res_for_plot.append(cost_function[n])

 print(cost_function)
 # Print the change in cost between every pair of consecutive cluster counts
 # that was actually computed, instead of a hard-coded number of pairs.
 cluster_counts = sorted(cost_function)
 for previous, current in zip(cluster_counts, cluster_counts[1:]):
    print(current, '-', previous, ':', cost_function[current] - cost_function[previous])
 plt.plot(cluster_counts, res_for_plot)
 plt.ylabel('Cost function')
 plt.xlabel('Number of clusters')
 # The wiki language code is the first two letters of the path component that
 # names the wiki (e.g. 'enwiki_data' -> 'en').  Taking it from the file name
 # 'data2.tsv' produced 'da'.  Fall back to the file name when no component
 # mentions a wiki.
 path_parts = path.split('/')
 wiki_name = next((part for part in reversed(path_parts) if 'wiki' in part), path_parts[-1])
 plt.title('Cost function per number of clusters in reverted edits in %s.wp' % wiki_name[:2])
 plt.show()
	[0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
	0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1
	0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
	0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
	0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 1 0 0 1 0 1
	0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
	0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
	0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0
	0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0
	0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 1 0
	1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
	0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
	0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
	0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 1 0
	0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0
	0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
	0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
	0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
	0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
	0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
	0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
	0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0]
	import codecs
	import math
	import sklearn.cluster
	import matplotlib.pyplot as plt
	from collections import defaultdict
	from scipy.stats import halfnorm

	x = set()
	c = 0  # number of rows kept, counted before the set removes duplicates
	path = '/home/amir/sigclust/enwiki_data/data2.tsv'

	# Read the tab-separated feature file.  'True'/'False' cells become 1/0 and
	# every other cell is parsed as a float.  Rows whose last column is not 1 are
	# skipped; the first and last columns are dropped from the stored tuple.
	with codecs.open(path, 'r', 'utf-8') as f:
	    for line in f:
	        # codecs.open does no newline translation, so strip '\r' as well
	        # as '\n'; otherwise a CRLF file leaves 'True\r' in the last cell.
	        line = line.rstrip('\r\n')
	        if not line:
	            # float('') raises ValueError, so blank lines are ignored.
	            continue
	        features = []
	        for feature in line.split('\t'):
	            if feature == 'False':
	                features.append(0)
	            elif feature == 'True':
	                features.append(1)
	            else:
	                features.append(float(feature))
	        if features[-1] != 1:
	            continue
	        c += 1
	        x.add(tuple(features[1:-1]))

	print(len(x))
	def mean_func(gen):
	    """Return the arithmetic mean of the numbers yielded by ``gen``.

	    ``gen`` may be a sequence or a one-shot iterator/generator.  A falsy
	    argument (for example an empty list) and an iterable that yields
	    nothing both give 0 instead of raising ZeroDivisionError.
	    """
	    if not gen:
	        return 0
	    total = 0
	    count = 0
	    for case in gen:
	        count += 1
	        total += case
	    # A generator object is always truthy, so emptiness can only be
	    # detected after iterating.
	    if not count:
	        return 0
	    return total / float(count)

	def std(gen, mean=None):
	    """Return the population standard deviation of the numbers in ``gen``.

	    ``gen`` may be a sequence or a one-shot iterator/generator.  ``mean``
	    is the pre-computed mean of the values; when it is ``None`` the mean
	    is computed here.  A mean of 0 is a valid value and is used as given.
	    A falsy argument or an iterable that yields nothing gives 0.
	    """
	    if not gen:
	        return 0
	    # Materialise the values so a generator can be walked for both the
	    # mean and the variance.
	    values = list(gen)
	    if not values:
	        return 0
	    count = float(len(values))
	    if mean is None:
	        mean = sum(values) / count
	    variance = sum((value - mean) ** 2 for value in values)
	    return math.sqrt(variance / count)

	# Group the values of every feature position across all rows:
	# x_for_scaling[i] is the list of values found at index i.
	x_for_scaling = {}
	for case in x:
	    for i, value in enumerate(case):
	        # Append in place; rebuilding the list for every row is quadratic.
	        x_for_scaling.setdefault(i, []).append(value)

	# mean_and_std[i] = (mean, standard deviation) of feature i.
	mean_and_std = {}

	for i in x_for_scaling:
	    mean = mean_func(x_for_scaling[i])
	    std_var = std(x_for_scaling[i], mean)
	    mean_and_std[i] = (mean, std_var)

	# Standardise every row: subtract the feature mean and divide by the
	# feature standard deviation.
	training_set = set()
	for case in x:
	    new_case = []
	    for i, value in enumerate(case):
	        center, spread = mean_and_std[i]
	        # A feature that is constant over all rows has zero spread;
	        # dividing by it raises ZeroDivisionError, so divide by 1 and
	        # leave it at 0.
	        new_case.append((value - center) / (spread or 1.0))
	    training_set.add(tuple(new_case))

	# cost_function[n] is the mean distance from each sample to its closest
	# cluster centre when KMeans is run with n clusters; res_for_plot holds the
	# same values in order of n for plotting.
	cost_function = {}
	res_for_plot = []
	# Convert once, outside the loop: the conversion does not depend on n.
	training_set = list(training_set)
	for n in range(1, 12):
	    classi = sklearn.cluster.KMeans(n_clusters=n)
	    # fit_transform yields one row per sample holding its distance to
	    # each of the n cluster centres, so the row minimum is the distance
	    # to the centre the sample is closest to.
	    res = classi.fit_transform(training_set)
	    cost_temp = sum(min(distances) for distances in res)
	    if n == 2:
	        print(classi.labels_)
	    cost_function[n] = cost_temp / len(res)
	    res_for_plot.append(cost_function[n])

	print(cost_function)
	# Print the change in cost between every pair of consecutive cluster counts
	# that was actually computed, instead of a hard-coded number of pairs.
	cluster_counts = sorted(cost_function)
	for previous, current in zip(cluster_counts, cluster_counts[1:]):
	    print(current, '-', previous, ':', cost_function[current] - cost_function[previous])
	plt.plot(cluster_counts, res_for_plot)
	plt.ylabel('Cost function')
	plt.xlabel('Number of clusters')
	# The wiki language code is the first two letters of the path component
	# that names the wiki (e.g. 'enwiki_data' -> 'en').  Taking it from the
	# file name 'data2.tsv' produced 'da'.  Fall back to the file name when no
	# component mentions a wiki.
	path_parts = path.split('/')
	wiki_name = next((part for part in reversed(path_parts) if 'wiki' in part), path_parts[-1])
	plt.title('Cost function per number of clusters in reverted edits in %s.wp' % wiki_name[:2])
	plt.show()