Created
November 4, 2011 11:31
-
-
Save conradlee/1339145 to your computer and use it in GitHub Desktop.
Edgelist parser using Python and NumPy's memmap
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import subprocess

import numpy
# Record layout used both when writing and when re-opening the memmap file:
# two unsigned 32-bit node ids followed by a 64-bit float edge weight.
weighted_edge_dtype = [
    ("n1", numpy.uint32),
    ("n2", numpy.uint32),
    ("weight", numpy.float64),
]
def convert_edgelist_to_mmap(in_filename):
    """Convert a whitespace-separated text edgelist into a binary memmap file.

    Each line of ``in_filename`` must hold three fields, ``n1 n2 weight``.
    They are stored as one ``weighted_edge_dtype`` record per line in a new
    file written to the same directory as the input.

    The number of edges is embedded in the output filename
    (``<stem>_<num_edges>.mmap_edgelist``) because the record count is
    needed again to re-open the memmap.

    Returns the path of the file that was written.

    Raises ``subprocess.CalledProcessError`` if ``wc`` exits with a
    non-zero status and ``ValueError`` if a line does not parse as
    ``int int float``.
    """
    # First determine the number of edges, because the memmap object has to
    # be pre-allocated and that requires a size.  Unix's wc is used because
    # it is quicker than Python at scanning text files.  The command is
    # passed as an argument list (no shell) so that filenames containing
    # spaces or shell metacharacters are handed to wc verbatim.
    wc_output = subprocess.check_output(["wc", "-l", "--", in_filename])
    num_edges = int(wc_output.split()[0])  # wc_output looks like "numlines fname\n"

    # wc -l counts newline characters, so a final line that lacks a
    # trailing newline is not included in its total; account for it here.
    with open(in_filename, "rb") as raw:
        raw.seek(0, os.SEEK_END)
        if raw.tell() > 0:
            raw.seek(-1, os.SEEK_END)
            if raw.read(1) != b"\n":
                num_edges += 1

    # Save num_edges in the filename so it is not lost.  Only the basename
    # is cut at its first dot; dots in the directory part (e.g. "./" or
    # "user.name/") must not truncate the path.
    directory, base = os.path.split(in_filename)
    stem = base.split(".")[0]
    out_filename = os.path.join(
        directory, stem + "_" + str(num_edges) + ".mmap_edgelist"
    )

    # Note: because node ids are stored as unsigned 32-bit integers, they
    # must be non-negative and not exceed 4,294,967,295.
    fp = numpy.memmap(
        out_filename, dtype=weighted_edge_dtype, mode="w+", shape=(num_edges,)
    )
    with open(in_filename) as infile:
        for i, line in enumerate(infile):
            # split() with no argument already discards the trailing newline.
            n1, n2, weight = line.split()
            fp[i] = (int(n1), int(n2), float(weight))

    # Write pending changes to disk before handing the filename back.
    fp.flush()
    return out_filename
def get_mmap_edges(mmap_filename):
    """Open an edgelist memmap file read-only.

    The record count is read from the filename: it is the text after the
    last underscore in the second-to-last dot-separated component, e.g.
    ``edges_42.mmap_edgelist`` is opened with 42 records.

    Returns a ``numpy.memmap`` of ``weighted_edge_dtype`` records.
    """
    stem = mmap_filename.rsplit(".", 2)[-2]
    count_text = stem.rpartition("_")[2]
    edge_total = int(count_text)
    return numpy.memmap(
        mmap_filename,
        dtype=weighted_edge_dtype,
        mode="r",
        shape=(edge_total,),
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment