Skip to content

Instantly share code, notes, and snippets.

@urigoren
Last active March 16, 2021 20:13
Show Gist options
  • Save urigoren/f073f1c9e19323ada795d2963aa698a2 to your computer and use it in GitHub Desktop.
Save urigoren/f073f1c9e19323ada795d2963aa698a2 to your computer and use it in GitHub Desktop.
import numpy as np
from scipy import sparse
import collections
"""
#SPARK Co-Occurence Matrix
#Format: (vertex, vertex) : count
import json, operator, itertools
def cooccur_matrix(srcHdfs, product2vertex):
ret = sc.textFile(srcHdfs)\
.map(json.loads) \
.map(lambda j: (j['userId'], product2vertex.value.get(j['productId'], -1))) \
.filter(lambda t: t[1] > 0)\
.map(lambda t: (t[0], set([t[1]])))\
.reduceByKey(operator.ior)\
.flatMap(lambda t: list(itertools.product(t[1],t[1])))\
.countByValue()
return ret
"""
def dict2graph(cooccur_matrix, threshold=2):
'''Returns a sparse matrix representation of the adjacency matrix'''
filtered = [(k[0],k[1]) for k,v in cooccur_matrix.items() if v>=threshold]
vertices=sorted(list(set([k for k,v in filtered])))
filtered = [(vertices.index(k),vertices.index(v)) for k,v in filtered]
row,col = zip(*filtered)
print ('We are left with {e} edges on {v} vertices'.format(e=len(col),v=len(vertices)))
return vertices, sparse.coo_matrix((np.ones(len(col)),(row,col)))
def collect_components(components,vertices):
ret = collections.defaultdict(list)
for i, c in enumerate(components):
ret[c].append(vertices[i])
return sorted(ret.values(),key=lambda x: len(x), reverse=True)
def connected_components(cooccur_matrix, edge_threshold, component_threshold=2):
'''
co-occurence matrix format dict((label_source,label_dest))-->count
edge_threshold = minumum count to consider as an edge
component_threshold = minumum number of vertices in a connected component
returns a list of components, sorted by size
'''
vertices, edges = dict2graph(cooccur_matrix, edge_threshold)
n, components = sparse.csgraph.connected_components(edges, directed=False)
print ('Found {n} components'.format(n=n))
components = collect_components(components,vertices)
components = [c for c in components if len(c)>=component_threshold]
print ('removed {k} small components'.format(k=n-len(components)))
print ('component sizes: '+ repr([len(c) for c in components]))
return components
if __name__ == '__main__':
import pickle
with open('cooccur_matrix.pickle','r') as f:
components = connected_components(pickle.load(f), edge_threshold=30, component_threshold=2)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment