Created
May 14, 2019 06:42
-
-
Save hamelsmu/df8c5c2f43e8affa5756ed291f8026ce to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pathlib import Path

import pandas as pd
from fastai.text import TextLMDataBunch as lmdb, load_data
from fastai.text.transform import Tokenizer
def pass_through(x):
    """Return ``x`` unchanged.

    Supplied further down as the only entry in the tokenizer's
    ``pre_rules`` list.

    NOTE(review): presumably written as a named module-level function
    (rather than a lambda) so the saved data can be pickled and loaded
    again -- confirm.
    """
    return x
# Read one pre-processed CSV shard directly from its public URL.
valid_df = pd.read_csv('https://storage.googleapis.com/issue_label_bot/pre_processed_data/processed_part0000.csv')
# The very same DataFrame object is used as the training set too.
train_df = valid_df

# The tokenizer's pre-rules consist solely of pass_through.
# NOTE(review): n_cpus=30 is hard-coded -- confirm it suits the machine
# this runs on.
tokenizer = Tokenizer(pre_rules=[pass_through], n_cpus=30)

path = Path('lang_model_test/')

# Build the language-model data from the 'text' column of both frames.
tst_data_lm = lmdb.from_df(
    path=path,
    train_df=train_df,
    valid_df=valid_df,
    text_cols='text',
    tokenizer=tokenizer,
    chunksize=3000000,
)

# Make sure the target directory exists before saving into it.
path.mkdir(exist_ok=True)
tst_data_lm.save()
############################################
### PART2: NOW, RESTART KERNEL and TRY THIS
############################################
# Everything is re-imported here because this part is meant to run in a
# fresh kernel, as the banner above says.
from fastai.text import TextLMDataBunch as lmdb, load_data
from pathlib import Path

path = Path('lang_model_test/')

# NOTE(review): presumably this picks up the file written by
# tst_data_lm.save() in the first part -- confirm the default file name
# used by save() and load_data() matches.
tst_data_lm = load_data(path)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment