sente · August 2, 2017 01:53
diff --git a/clustering.py b/clustering.py
 import numpy as np
 from sklearn.cluster import MeanShift, estimate_bandwidth
 import os
 import stat

 # Group the files of one archive directory into clusters of similar
 # modification time and write one "<cluster>.txt" listing per cluster.
 archive_dir = 'archives/2017-07/'

 # Only the first os.walk() entry (the directory itself) is needed, so take it
 # with next() instead of materialising the whole tree; the default tuple
 # covers a missing directory, for which os.walk() yields nothing.
 files = [os.path.join(archive_dir, f)
          for f in next(os.walk(archive_dir), (archive_dir, [], []))[2]]
 if not files:
    # Without files there is no reference mtime and nothing to cluster.
    raise SystemExit('no files found in {}'.format(archive_dir))

 # [mtime, path] pairs, then the same pairs with the mtime rebased to the
 # mtime of the first listed file (offsets may be negative).
 ar = [[int(os.stat(f).st_mtime), f] for f in files]
 arr = [[mtime - ar[0][0], path] for mtime, path in ar]

 nums = [int(a[0]) for a in arr]
 # Samples are (offset, 0) pairs: the second column is all zeros, so only the
 # offset contributes. The np.int alias was removed from NumPy; the builtin
 # int is what it stood for.
 X = np.array(list(zip(nums, np.zeros(len(nums)))), dtype=int)

 bandwidth = estimate_bandwidth(X, quantile=0.05)

 ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
 ms.fit(X)
 labels = ms.labels_
 labels_unique = np.unique(labels)
 n_clusters_ = len(labels_unique)

 res = []
 groups = {}
 for k in range(n_clusters_):
    my_members = labels == k
    print ( "cluster {0}: {1}".format(k, X[my_members, 0]))
    res.append(X[my_members, 0].tolist())
    # Pick members by position rather than by matching offset values, so two
    # files with the same mtime are each listed exactly once.
    groups[k] = [arr[i] for i, label in enumerate(labels) if label == k]

 for g, tups in groups.items():
    with open('{}.txt'.format(g), 'w') as ofile:
        for t in tups:
            ofile.write('{}\n'.format(t[1]))

diff --git a/clusteringcli.py b/clusteringcli.py
 import sys
 import numpy as np
 import io as _io
 import fileinput
 import argparse
 import itertools
 import operator
 import collections
 from scipy.cluster.vq import kmeans, vq
 from sklearn.cluster import MeanShift, estimate_bandwidth



 def cluster2(nlist,num_groups):
    """Split nlist into consecutive runs that share a k-means cluster.

    The values are converted to floats, clustered with scipy's ``kmeans``
    into at most ``num_groups`` clusters and labelled with ``vq``. The input
    order is kept: a new group starts whenever the label changes from one
    element to the next, so a cluster that re-appears later in the sequence
    produces a separate group.

    Args:
        nlist: sequence of numbers or numeric strings.
        num_groups: number of clusters requested from ``kmeans``.

    Returns:
        A list of lists holding the original elements of ``nlist`` (not the
        float conversions). The result is also printed.
    """
    if len(nlist) == 0:
        # kmeans() cannot be run without observations.
        print ([])
        return []

    y = np.array(nlist, dtype=float)
    codebook, _ = kmeans(y, num_groups)
    cluster_indices, _ = vq(y, codebook)

    # groupby() on unsorted input yields one group per run of equal labels.
    labelled = zip(cluster_indices.tolist(), nlist)
    res = [[value for label, value in run]
           for key, run in itertools.groupby(labelled, key=operator.itemgetter(0))]
    print (res)
    return res

 def cluster(nlist, quantile):
    """Cluster integer values with MeanShift and print the grouping.

    The bandwidth is obtained from ``estimate_bandwidth`` with the given
    quantile and printed, followed by one "cluster k: [...]" line per
    cluster. Finally one ``<group>\\t<value>`` line per value is written to
    stdout, where groups are numbered after sorting the clusters (as lists)
    in ascending order.

    Args:
        nlist: iterable of values accepted by ``int()``.
        quantile: quantile forwarded to ``estimate_bandwidth``.

    Returns:
        None. Nothing is printed for an empty input.
    """
    nlist = [int(x) for x in nlist]
    if not nlist:
        # There is nothing to estimate a bandwidth from.
        return

    # Samples are (value, 0) pairs: the second column stays zero. The np.int
    # alias was removed from NumPy; the builtin int is what it stood for.
    X = np.zeros((len(nlist), 2), dtype=int)
    X[:, 0] = nlist

    bandwidth = estimate_bandwidth(X, quantile=quantile)

    print (bandwidth)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(X)
    labels = ms.labels_
    n_clusters_ = len(np.unique(labels))

    res = []
    for k in range(n_clusters_):
        my_members = labels == k
        print ( "cluster {0}: {1}".format(k, X[my_members, 0]))
        res.append(X[my_members, 0].tolist())

    for i, r in enumerate(sorted(res)):
        for el in r:
            sys.stdout.write('{}\t{}\n'.format(i, el))




 if __name__ == '__main__':
    # Command-line entry point: read one value per line from the given files
    # (or stdin) and hand them to the selected clustering routine.
    parser = argparse.ArgumentParser()
    parser.add_argument('--dummy', help='dummy argument')
    # type= lets argparse reject malformed numbers up front with a usage error.
    parser.add_argument('-q', '--quantile', default='.1', type=float, dest='quantile', help='quantile')
    # choices= turns an unknown type into an error instead of a silent no-op.
    parser.add_argument('-t', '--type', default='cluster', dest='type',
                        choices=('cluster', 'cluster2'), help='type')
    parser.add_argument('-n', '--num', default=5, type=int, dest='num_groups', help='num_groups')
    parser.add_argument('files', metavar='FILE', nargs='*', help='files to read, if empty, stdin is used')
    args = parser.parse_args()
    print(args)
    # If you would call fileinput.input() without files it would try to process all arguments.
    # We pass '-' as only file when argparse got no files which will cause fileinput to read from stdin
    ar = []
    for line in fileinput.input(files=args.files if len(args.files) > 0 else ('-', )):
        value = line.strip()
        # Blank lines (e.g. a trailing newline) cannot be parsed as numbers.
        if value:
            ar.append(value)

    if args.type == 'cluster':
        cluster(ar, args.quantile)
    elif args.type == 'cluster2':
        cluster2(ar, args.num_groups)

 #parser = argparse.ArgumentParser()
 #parser.add_argument('infile', nargs='?', type=argparse.FileType('r'),
 #                    default=sys.stdin)
 #parser.add_argument('outfile', nargs='?', type=argparse.FileType('w'),
 #                    default=sys.stdout)
 #parser.parse_args(['input.txt', 'output.txt'])
 #Namespace(infile=<_io.TextIOWrapper name='input.txt' encoding='UTF-8'>,
 #          outfile=<_io.TextIOWrapper name='output.txt' encoding='UTF-8'>)
 #parser.parse_args([])
 #Namespace(infile=<_io.TextIOWrapper name='<stdin>' encoding='UTF-8'>,
 #          outfile=<_io.TextIOWrapper name='<stdout>' encoding='UTF-8'>)
 #
	import numpy as np
	from sklearn.cluster import MeanShift, estimate_bandwidth
	import os
	import stat

	# Group the files of one archive directory into clusters of similar
	# modification time and write one "<cluster>.txt" listing per cluster.
	# (The nesting of this copy had been flattened; it is restored here.)
	archive_dir = 'archives/2017-07/'

	# Only the first os.walk() entry (the directory itself) is needed, so take it
	# with next() instead of materialising the whole tree; the default tuple
	# covers a missing directory, for which os.walk() yields nothing.
	files = [os.path.join(archive_dir, f)
	         for f in next(os.walk(archive_dir), (archive_dir, [], []))[2]]
	if not files:
	    # Without files there is no reference mtime and nothing to cluster.
	    raise SystemExit('no files found in {}'.format(archive_dir))

	# [mtime, path] pairs, then the same pairs with the mtime rebased to the
	# mtime of the first listed file (offsets may be negative).
	ar = [[int(os.stat(f).st_mtime), f] for f in files]
	arr = [[mtime - ar[0][0], path] for mtime, path in ar]

	nums = [int(a[0]) for a in arr]
	# Samples are (offset, 0) pairs: the second column is all zeros, so only the
	# offset contributes. The np.int alias was removed from NumPy; the builtin
	# int is what it stood for.
	X = np.array(list(zip(nums, np.zeros(len(nums)))), dtype=int)

	bandwidth = estimate_bandwidth(X, quantile=0.05)

	ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
	ms.fit(X)
	labels = ms.labels_
	labels_unique = np.unique(labels)
	n_clusters_ = len(labels_unique)

	res = []
	groups = {}
	for k in range(n_clusters_):
	    my_members = labels == k
	    print ( "cluster {0}: {1}".format(k, X[my_members, 0]))
	    res.append(X[my_members, 0].tolist())
	    # Pick members by position rather than by matching offset values, so two
	    # files with the same mtime are each listed exactly once.
	    groups[k] = [arr[i] for i, label in enumerate(labels) if label == k]

	for g, tups in groups.items():
	    with open('{}.txt'.format(g), 'w') as ofile:
	        for t in tups:
	            ofile.write('{}\n'.format(t[1]))
	import sys
	import numpy as np
	import io as _io
	import fileinput
	import argparse
	import itertools
	import operator
	import collections
	from scipy.cluster.vq import kmeans, vq
	from sklearn.cluster import MeanShift, estimate_bandwidth



	def cluster2(nlist,num_groups):
	    """Split nlist into consecutive runs that share a k-means cluster.

	    The values are converted to floats, clustered with scipy's ``kmeans``
	    into at most ``num_groups`` clusters and labelled with ``vq``. The input
	    order is kept: a new group starts whenever the label changes from one
	    element to the next, so a cluster that re-appears later in the sequence
	    produces a separate group.

	    Args:
	        nlist: sequence of numbers or numeric strings.
	        num_groups: number of clusters requested from ``kmeans``.

	    Returns:
	        A list of lists holding the original elements of ``nlist`` (not the
	        float conversions). The result is also printed.
	    """
	    if len(nlist) == 0:
	        # kmeans() cannot be run without observations.
	        print ([])
	        return []

	    y = np.array(nlist, dtype=float)
	    codebook, _ = kmeans(y, num_groups)
	    cluster_indices, _ = vq(y, codebook)

	    # groupby() on unsorted input yields one group per run of equal labels.
	    labelled = zip(cluster_indices.tolist(), nlist)
	    res = [[value for label, value in run]
	           for key, run in itertools.groupby(labelled, key=operator.itemgetter(0))]
	    print (res)
	    return res

	def cluster(nlist, quantile):
	    """Cluster integer values with MeanShift and print the grouping.

	    The bandwidth is obtained from ``estimate_bandwidth`` with the given
	    quantile and printed, followed by one "cluster k: [...]" line per
	    cluster. Finally one ``<group>\\t<value>`` line per value is written to
	    stdout, where groups are numbered after sorting the clusters (as lists)
	    in ascending order.

	    Args:
	        nlist: iterable of values accepted by ``int()``.
	        quantile: quantile forwarded to ``estimate_bandwidth``.

	    Returns:
	        None. Nothing is printed for an empty input.
	    """
	    nlist = [int(x) for x in nlist]
	    if not nlist:
	        # There is nothing to estimate a bandwidth from.
	        return

	    # Samples are (value, 0) pairs: the second column stays zero. The np.int
	    # alias was removed from NumPy; the builtin int is what it stood for.
	    X = np.zeros((len(nlist), 2), dtype=int)
	    X[:, 0] = nlist

	    bandwidth = estimate_bandwidth(X, quantile=quantile)

	    print (bandwidth)
	    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
	    ms.fit(X)
	    labels = ms.labels_
	    n_clusters_ = len(np.unique(labels))

	    res = []
	    for k in range(n_clusters_):
	        my_members = labels == k
	        print ( "cluster {0}: {1}".format(k, X[my_members, 0]))
	        res.append(X[my_members, 0].tolist())

	    for i, r in enumerate(sorted(res)):
	        for el in r:
	            sys.stdout.write('{}\t{}\n'.format(i, el))




	if __name__ == '__main__':
	    # Command-line entry point: read one value per line from the given files
	    # (or stdin) and hand them to the selected clustering routine.
	    parser = argparse.ArgumentParser()
	    parser.add_argument('--dummy', help='dummy argument')
	    # type= lets argparse reject malformed numbers up front with a usage error.
	    parser.add_argument('-q', '--quantile', default='.1', type=float, dest='quantile', help='quantile')
	    # choices= turns an unknown type into an error instead of a silent no-op.
	    parser.add_argument('-t', '--type', default='cluster', dest='type',
	                        choices=('cluster', 'cluster2'), help='type')
	    parser.add_argument('-n', '--num', default=5, type=int, dest='num_groups', help='num_groups')
	    parser.add_argument('files', metavar='FILE', nargs='*', help='files to read, if empty, stdin is used')
	    args = parser.parse_args()
	    print(args)
	    # If you would call fileinput.input() without files it would try to process all arguments.
	    # We pass '-' as only file when argparse got no files which will cause fileinput to read from stdin
	    ar = []
	    for line in fileinput.input(files=args.files if len(args.files) > 0 else ('-', )):
	        value = line.strip()
	        # Blank lines (e.g. a trailing newline) cannot be parsed as numbers.
	        if value:
	            ar.append(value)

	    if args.type == 'cluster':
	        cluster(ar, args.quantile)
	    elif args.type == 'cluster2':
	        cluster2(ar, args.num_groups)

	#parser = argparse.ArgumentParser()
	#parser.add_argument('infile', nargs='?', type=argparse.FileType('r'),
	# default=sys.stdin)
	#parser.add_argument('outfile', nargs='?', type=argparse.FileType('w'),
	# default=sys.stdout)
	#parser.parse_args(['input.txt', 'output.txt'])
	#Namespace(infile=<_io.TextIOWrapper name='input.txt' encoding='UTF-8'>,
	# outfile=<_io.TextIOWrapper name='output.txt' encoding='UTF-8'>)
	#parser.parse_args([])
	#Namespace(infile=<_io.TextIOWrapper name='<stdin>' encoding='UTF-8'>,
	# outfile=<_io.TextIOWrapper name='<stdout>' encoding='UTF-8'>)
	#