stestagg · March 28, 2018 12:29
diff --git a/pytubes1.py b/pytubes1.py
 import glob
 from os import path

 import tubes

 # Input files: every path matching the googlebooks* pattern under the
 # user's home directory (``~`` is expanded before globbing).
 FILES = glob.glob(path.expanduser("~/src/data/ngrams/1gram/googlebooks*"))
 WORD = "Python"

 # Set up the data load pipeline.  Each TSV row is reduced to three values:
 # whether column 0 equals WORD (compared as UTF-8 encoded bytes), and
 # columns 1 and 2 converted to int.
 # NOTE(review): columns are presumably (ngram, year, match_count) as in the
 # Google Books 1-gram files -- confirm against the data.
 one_grams_tube = (tubes.Each(FILES)
    .read_files()
    .split()
    .tsv(headers=False)
    .multi(lambda row: (
        row.get(0).equals(WORD.encode('utf-8')),
        row.get(1).to(int),
        row.get(2).to(int)
    ))
 )

 # Load the data into a numpy array.  By setting a roughly-accurate
 # estimated_rows count, pytubes optimizes the allocation pattern.
 # fields=True here is redundant, but ensures that the returned ndarray
 # uses fields, rather than a single multidimensional array
 one_grams = one_grams_tube.ndarray(estimated_rows=500_000_000, fields=True)
	import glob
	from os import path

	import tubes

	# Input files: every path matching the googlebooks* pattern under the
	# user's home directory (``~`` is expanded before globbing).
	FILES = glob.glob(path.expanduser("~/src/data/ngrams/1gram/googlebooks*"))
	WORD = "Python"

	# Set up the data load pipeline.  Each TSV row is reduced to three values:
	# whether column 0 equals WORD (compared as UTF-8 encoded bytes), and
	# columns 1 and 2 converted to int.
	# NOTE(review): columns are presumably (ngram, year, match_count) as in the
	# Google Books 1-gram files -- confirm against the data.
	one_grams_tube = (tubes.Each(FILES)
	    .read_files()
	    .split()
	    .tsv(headers=False)
	    .multi(lambda row: (
	        row.get(0).equals(WORD.encode('utf-8')),
	        row.get(1).to(int),
	        row.get(2).to(int)
	    ))
	)

	# Load the data into a numpy array. By setting a roughly-accurate
	# estimated_rows count, pytubes optimizes the allocation pattern.
	# fields=True here is redundant, but ensures that the returned ndarray
	# uses fields, rather than a single multidimensional array
	one_grams = one_grams_tube.ndarray(estimated_rows=500_000_000, fields=True)