Skip to content

Instantly share code, notes, and snippets.

@hamelsmu
Created May 14, 2019 06:42
Show Gist options
  • Save hamelsmu/df8c5c2f43e8affa5756ed291f8026ce to your computer and use it in GitHub Desktop.
Save hamelsmu/df8c5c2f43e8affa5756ed291f8026ce to your computer and use it in GitHub Desktop.
import pandas as pd
from pathlib import Path
from fastai.text import TextLMDataBunch as lmdb, load_data
from fastai.text.transform import Tokenizer
def pass_through(x):
    """Return *x* unchanged.

    Identity function; this script passes it to ``Tokenizer`` as its only
    pre-rule.
    """
    return x
# Load one pre-processed CSV shard; the same frame is reused for both the
# training and the validation split.
valid_df = pd.read_csv(
    'https://storage.googleapis.com/issue_label_bot/pre_processed_data/processed_part0000.csv'
)
train_df = valid_df

# Tokenizer whose only pre-rule is the module-level identity function.
tokenizer = Tokenizer(pre_rules=[pass_through], n_cpus=30)

path = Path('lang_model_test/')
tst_data_lm = lmdb.from_df(
    path=path,
    train_df=train_df,
    valid_df=valid_df,
    text_cols='text',
    tokenizer=tokenizer,
    chunksize=3000000,
)

# Create the target directory (no error if it already exists) before saving.
# NOTE(review): save() is presumably writing under `path` -- confirm.
path.mkdir(exist_ok=True)
tst_data_lm.save()
############################################
### PART2: NOW, RESTART KERNEL and TRY THIS
############################################
from fastai.text import TextLMDataBunch as lmdb, load_data
from pathlib import Path
# Same directory that the first part of the script saved into.
path = Path('lang_model_test/')
# Reload the saved data bunch in the fresh kernel. `pass_through` is not
# defined in this part of the script.
# NOTE(review): presumably this is the step that fails, because the saved
# tokenizer refers to `pass_through` -- confirm against the fastai issue.
tst_data_lm = load_data(path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment