Last active
August 2, 2017 01:53
-
-
Save sente/c43b5e4216100f9cd87bd0e78c2c1323 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
from sklearn.cluster import MeanShift, estimate_bandwidth | |
import os | |
import stat | |
# Group the files of one archive directory by modification time and write the
# file names of every mean-shift cluster to "<cluster index>.txt" in the
# current working directory.

# Only the top level of the directory is listed; a missing directory yields no
# entries here instead of an IndexError.
_, _, names = next(os.walk('archives/2017-07/'), (None, [], []))
files = ['archives/2017-07/{}'.format(f) for f in names]
if not files:
    raise SystemExit('no files found in archives/2017-07/')

# Pair each file with its modification time in whole seconds, then rebase the
# times on the first file listed so the clustered values are small offsets.
ar = [[int(os.stat(f).st_mtime), f] for f in files]
arr = [[mtime - ar[0][0], f] for mtime, f in ar]
nums = [offset for offset, _ in arr]

# The estimators take a 2-D sample matrix: offsets go in column 0 and column 1
# stays zero. The builtin ``int`` replaces the ``np.int`` alias, which newer
# NumPy releases no longer provide.
X = np.zeros((len(nums), 2), dtype=int)
X[:, 0] = nums

bandwidth = estimate_bandwidth(X, quantile=0.05)
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(X)
labels = ms.labels_
n_clusters_ = len(np.unique(labels))

for k in range(n_clusters_):
    print ( "cluster {0}: {1}".format(k, X[labels == k, 0]))

# labels[i] is the cluster of arr[i], so group by position. Matching the
# clustered values back to files by equality would list a file several times
# whenever two files share the same offset.
groups = {k: [] for k in range(n_clusters_)}
for tup, label in zip(arr, labels):
    groups.setdefault(int(label), []).append(tup)

for g, tups in groups.items():
    with open('{}.txt'.format(g), 'w') as ofile:
        ofile.writelines('{}\n'.format(t[1]) for t in tups)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import numpy as np | |
import io as _io | |
import fileinput | |
import argparse | |
import itertools | |
import operator | |
import collections | |
from scipy.cluster.vq import kmeans, vq | |
from sklearn.cluster import MeanShift, estimate_bandwidth | |
def cluster2(nlist, num_groups):
    """Split a sequence of numbers into consecutive runs using k-means labels.

    The values are converted to floats, ``num_groups`` centroids are requested
    from ``kmeans`` and every value is assigned to a centroid with ``vq``.
    A new group starts each time the assigned label differs from the label of
    the previous item, so input order is preserved and one cluster can
    produce several groups.

    Args:
        nlist: Sequence of numbers or numeric strings, in the order to group.
        num_groups: Number of centroids requested from k-means.

    Returns:
        A list of lists containing the original items of ``nlist``. The same
        list is printed before it is returned. Empty input gives ``[]``.
    """
    if len(nlist) == 0:
        # k-means has nothing to work with when there are no observations.
        res = []
    else:
        y = np.array(nlist, dtype=float)
        codebook, _ = kmeans(y, num_groups)
        cluster_indices, _ = vq(y, codebook)
        # groupby only merges adjacent items with equal keys, which is exactly
        # the "start a new group when the label changes" rule.
        labelled = zip(cluster_indices.tolist(), nlist)
        res = [
            [item for _, item in run]
            for _, run in itertools.groupby(labelled, key=operator.itemgetter(0))
        ]
    print (res)
    return res
def cluster(nlist, quantile):
    """Cluster integer values with mean shift and print them grouped by cluster.

    Args:
        nlist: Iterable of values accepted by ``int()``, for example numeric
            strings read from a file.
        quantile: Forwarded to ``estimate_bandwidth`` as its ``quantile``
            argument.

    Output on stdout: the estimated bandwidth, one ``cluster k: [...]`` line
    per cluster, then one tab-separated ``group<TAB>value`` line per value,
    where groups are numbered in the sorted order of their member lists.
    Returns ``None``; nothing is printed for empty input.
    """
    values = [int(x) for x in nlist]
    if not values:
        # Nothing to cluster, so skip the estimators entirely.
        return

    # 2-D sample matrix: the values sit in column 0 and column 1 stays zero.
    # The builtin ``int`` replaces the ``np.int`` alias, which newer NumPy
    # releases no longer provide.
    X = np.zeros((len(values), 2), dtype=int)
    X[:, 0] = values

    bandwidth = estimate_bandwidth(X, quantile=quantile)
    print (bandwidth)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(X)
    labels = ms.labels_

    res = []
    for k in range(len(np.unique(labels))):
        members = X[labels == k, 0]
        print ( "cluster {0}: {1}".format(k, members))
        res.append(members.tolist())

    # Sorting the member lists orders the groups by their contents rather
    # than by the label numbers mean shift happened to assign.
    for i, r in enumerate(sorted(res)):
        for el in r:
            sys.stdout.write('{}\t{}\n'.format(i, el))
if __name__ == '__main__':
    # Command-line entry point: read one number per line from the given files
    # (or stdin) and hand them to the selected clustering function.
    parser = argparse.ArgumentParser()
    parser.add_argument('--dummy', help='dummy argument')
    # Conversions are done by argparse so a malformed value produces a usage
    # error instead of a traceback.
    parser.add_argument('-q', '--quantile', default='.1', dest='quantile', type=float, help='quantile')
    # Restricting the choices rejects an unknown type instead of silently
    # doing nothing.
    parser.add_argument('-t', '--type', default='cluster', dest='type', choices=('cluster', 'cluster2'), help='type')
    parser.add_argument('-n', '--num', default=5, dest='num_groups', type=int, help='num_groups')
    parser.add_argument('files', metavar='FILE', nargs='*', help='files to read, if empty, stdin is used')
    args = parser.parse_args()
    print(args)
    # If you would call fileinput.input() without files it would try to process all arguments.
    # We pass '-' as only file when argparse got no files which will cause fileinput to read from stdin
    with fileinput.input(files=args.files or ('-', )) as lines:
        # Blank lines (such as a trailing newline) are skipped; they cannot be
        # converted to numbers by the clustering functions.
        ar = [line.strip() for line in lines if line.strip()]
    if args.type == 'cluster':
        cluster(ar, args.quantile)
    elif args.type == 'cluster2':
        cluster2(ar, args.num_groups)
#parser = argparse.ArgumentParser() | |
#parser.add_argument('infile', nargs='?', type=argparse.FileType('r'), | |
# default=sys.stdin) | |
#parser.add_argument('outfile', nargs='?', type=argparse.FileType('w'), | |
# default=sys.stdout) | |
#parser.parse_args(['input.txt', 'output.txt']) | |
#Namespace(infile=<_io.TextIOWrapper name='input.txt' encoding='UTF-8'>, | |
# outfile=<_io.TextIOWrapper name='output.txt' encoding='UTF-8'>) | |
#parser.parse_args([]) | |
#Namespace(infile=<_io.TextIOWrapper name='<stdin>' encoding='UTF-8'>, | |
# outfile=<_io.TextIOWrapper name='<stdout>' encoding='UTF-8'>) | |
# |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment