Last active
May 10, 2018 18:51
-
-
Save flaviovdf/c74f597cdfd9db787cd670d55bbecf26 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf8 -*-
from collections import defaultdict
import gzip
import sys

import scipy.sparse as sp


def _iter_edges(path):
    """Yield ``(src, dst, raw_stamp)`` byte fields for each record in *path*.

    *path* is opened with ``gzip.open`` in binary mode. A record is split on
    commas when it contains one, otherwise on whitespace. The first two
    fields are the source and destination; the last field is the (still
    unparsed) timestamp. Blank lines are skipped so a trailing newline does
    not abort the whole load.
    """
    with gzip.open(path, 'r') as in_file:
        for line in in_file:
            if not line.strip():
                continue
            fields = line.split(b',') if b',' in line else line.split()
            src, dst = fields[:2]
            yield src, dst, fields[-1]


def get_graph_stamps(path, top=None):
    """Load an edge list with timestamps from a gzip-compressed text file.

    The file is read twice. The first pass counts how often each node
    appears as a destination and collects every source. The second pass
    builds the outputs from the edges whose endpoints are both selected.

    Parameters
    ----------
    path : str
        Path of the gzip file. Each record is ``src dst ... stamp``,
        separated by commas or by whitespace.
    top : int or None
        When ``None``, every node that appears as a source is selected.
        Otherwise, select at most ``top`` nodes: the most frequent
        destinations that also appear as sources. Ties on the count are
        broken by the larger node id first. A non-positive ``top`` selects
        nothing.

    Returns
    -------
    timestamps : list of list of float
        ``timestamps[ids[node]]`` holds the stamps of the kept edges whose
        destination is ``node``, in file order.
    graph : dict
        ``graph[src][dst]`` is the number of kept edges from ``src`` to
        ``dst``. Its keys are exactly the keys of ``ids``; nodes without
        outgoing edges map to an empty dict.
    ids : dict
        Maps each node (as ``bytes``) to its index in ``timestamps``,
        assigned in order of first appearance as a destination.

    Raises
    ------
    ValueError
        If a non-blank record has fewer than two fields or its last field
        cannot be parsed as a float.
    """
    count = defaultdict(int)
    srcs = set()
    for src, dst, _ in _iter_edges(path):
        count[dst] += 1
        srcs.add(src)

    if top is None:
        valid = srcs
    else:
        valid = set()
        # Rank by (count, node) descending, the same order the tuples
        # (count, node) would sort in with reverse=True.
        ranked = sorted(count.items(),
                        key=lambda item: (item[1], item[0]),
                        reverse=True)
        for node, _ in ranked:
            # Check the limit before adding so that top=0 selects nothing.
            if len(valid) >= top:
                break
            if node in srcs:
                valid.add(node)

    graph = {}
    ids = {}
    timestamps = []
    for src, dst, raw_stamp in _iter_edges(path):
        # Parse before filtering so malformed stamps are reported even on
        # edges that would be dropped.
        stamp = float(raw_stamp)
        if src not in valid or dst not in valid:
            continue

        neighbours = graph.setdefault(src, {})
        neighbours[dst] = neighbours.get(dst, 0) + 1

        if dst in ids:
            timestamps[ids[dst]].append(stamp)
        else:
            ids[dst] = len(timestamps)
            timestamps.append([stamp])

    # Keep only sources that received at least one event (i.e. have an id).
    for node in list(graph):
        if node not in ids:
            del graph[node]

    # Every node with an id gets an entry, even with no outgoing edges.
    for node in ids:
        if node not in graph:
            graph[node] = {}

    return timestamps, graph, ids
if __name__ == '__main__':
    # Usage: script.py <edge-list.gz>
    # Prints summary statistics of the loaded graph, one value per line:
    # number of timestamp lists, number of graph nodes, number of ids,
    # total number of stamps, adjacency density, and the fraction of nodes
    # whose adjacency row sums to zero.
    path = sys.argv[1]
    timestamps, graph, ids = get_graph_stamps(path)

    vals = []
    rows = []
    cols = []
    for src, neighbours in graph.items():
        for dst, weight in neighbours.items():
            # Defensive: only edges whose endpoints both have an index.
            if src not in ids or dst not in ids:
                continue
            rows.append(ids[src])
            cols.append(ids[dst])
            vals.append(weight)

    # Give the shape explicitly. Inferring it from the largest index present
    # drops nodes without edges, can yield a non-square matrix, and fails
    # outright when there are no edges at all.
    n_nodes = len(ids)
    GT = sp.csr_matrix((vals, (rows, cols)),
                       shape=(n_nodes, n_nodes), dtype='d')

    n = sum(len(t) for t in timestamps)
    print(len(timestamps))
    print(len(graph))
    print(len(ids))
    print(n)

    # Guard the ratios so an empty graph reports 0.0 instead of dividing
    # by zero.
    n_cells = GT.shape[0] * GT.shape[1]
    print(GT.nnz / n_cells if n_cells else 0.0)
    print((GT.sum(axis=1) == 0).sum() / GT.shape[1] if n_nodes else 0.0)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment