-
-
Save larsmans/4952848 to your computer and use it in GitHub Desktop.
#!/usr/bin/python | |
# | |
# K-means clustering using Lloyd's algorithm in pure Python. | |
# Written by Lars Buitinck. This code is in the public domain. | |
# | |
# The main program runs the clustering algorithm on a bunch of text documents | |
# specified as command-line arguments. These documents are first converted to | |
# sparse vectors, represented as lists of (index, value) pairs. | |
from collections import defaultdict | |
from math import sqrt | |
import random | |
def densify(x, n): | |
"""Convert a sparse vector to a dense one.""" | |
d = [0] * n | |
for i, v in x: | |
d[i] = v | |
return d | |
def dist(x, c): | |
"""Euclidean distance between sample x and cluster center c. | |
Inputs: x, a sparse vector | |
c, a dense vector | |
""" | |
sqdist = 0. | |
for i, v in x: | |
sqdist += (v - c[i]) ** 2 | |
return sqrt(sqdist) | |
def mean(xs, l): | |
"""Mean (as a dense vector) of a set of sparse vectors of length l.""" | |
c = [0.] * l | |
n = 0 | |
for x in xs: | |
for i, v in x: | |
c[i] += v | |
n += 1 | |
for i in xrange(l): | |
c[i] /= n | |
return c | |
def kmeans(k, xs, l, n_iter=10): | |
# Initialize from random points. | |
centers = [densify(xs[i], l) for i in random.sample(xrange(len(xs)), k)] | |
cluster = [None] * len(xs) | |
for _ in xrange(n_iter): | |
for i, x in enumerate(xs): | |
cluster[i] = min(xrange(k), key=lambda j: dist(xs[i], centers[j])) | |
for j, c in enumerate(centers): | |
members = (x for i, x in enumerate(xs) if cluster[i] == j) | |
centers[j] = mean(members, l) | |
return cluster | |
if __name__ == '__main__': | |
# Cluster a bunch of text documents. | |
import re | |
import sys | |
def usage(): | |
print("usage: %s k docs..." % sys.argv[0]) | |
print(" The number of documents must be >= k.") | |
sys.exit(1) | |
try: | |
k = int(sys.argv[1]) | |
except ValueError(): | |
usage() | |
if len(sys.argv) < 1 + k: | |
usage() | |
vocab = {} | |
xs = [] | |
args = sys.argv[2:] | |
for a in args: | |
x = defaultdict(float) | |
with open(a) as f: | |
for w in re.findall(r"\w+", f.read()): | |
vocab.setdefault(w, len(vocab)) | |
x[vocab[w]] += 1 | |
xs.append(x.items()) | |
cluster_ind = kmeans(k, xs, len(vocab)) | |
clusters = [set() for _ in xrange(k)] | |
for i, j in enumerate(cluster_ind): | |
clusters[j].add(i) | |
for j, c in enumerate(clusters): | |
print("cluster %d:" % j) | |
for i in c: | |
print("\t%s" % args[i]) |
@bobbyyit Looks like it wants the number of documents to cluster entered as the first argument, it will then loop as many times asking for the documents themselves.
No, the number of clusters k is the first argument.
Hello,
so from my understanding you need k documents? what if all the data is in one file?
how can i use it for a data like
documents = ["highly recommend series anyone yearning watch grown up television Complex characters plots keep one totally involved Thank Amazon Prime Mysteries interesting tension between Robson tall blond good always believable seemed uncomfortable beats watching blank screen do n't seem tune comedy today many episodes series pretty-much skip through try find description something think 's kind crap shoot whether 'll entertained hey 're sitting around trying kill 20-30 minutes 're waiting something 's worth shot"]
New to python so dnt knw exactly how to execute your code for my data. please help stuck at this step for long. its really urgent
pls send full source code to [email protected]
hello, i dont know what to put in the argument when running the function.
I tried:
python kmeans.py test.txt
help!