Last active
March 28, 2018 12:29
-
-
Save stestagg/e7f80e715fb55b5ad6b15050baa93f8e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import glob
from os import path

import tubes

# Paths of the Google Books 1-gram files matched under the user's home
# directory. glob returns them in arbitrary order.
FILES = glob.glob(path.expanduser("~/src/data/ngrams/1gram/googlebooks*"))

# Word that the first output column tests each row against.
WORD = "Python"

# Set up the data load pipeline: read every file, split the contents into
# records (pytubes' default separator -- presumably newlines, TODO confirm),
# parse each record as header-less TSV, then emit three values per row.
one_grams_tube = (
    tubes.Each(FILES)
    .read_files()
    .split()
    .tsv(headers=False)
    .multi(lambda row: (
        # Column 0 compared with the UTF-8 bytes of WORD.
        row.get(0).equals(WORD.encode('utf-8')),
        # Columns 1 and 2 converted to int (presumably year and match
        # count in the 1-gram file layout -- verify against the data).
        row.get(1).to(int),
        row.get(2).to(int),
    ))
)

# Load the data into a numpy array. By setting a roughly-accurate
# estimated_rows count, pytubes optimizes the allocation pattern.
# fields=True is redundant here, but it makes explicit that the returned
# ndarray uses fields rather than being a single multidimensional array.
one_grams = one_grams_tube.ndarray(estimated_rows=500_000_000, fields=True)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment