Skip to content

Instantly share code, notes, and snippets.

@poppingtonic
Created October 11, 2018 18:03
Show Gist options
  • Save poppingtonic/71ca2ea7420780c929498c0b53922fe3 to your computer and use it in GitHub Desktop.
Save poppingtonic/71ca2ea7420780c929498c0b53922fe3 to your computer and use it in GitHub Desktop.
import pandas as pd
from fastai.text import *
import html
# Language code; interpolated into the wiki data directory path in the
# __main__ block at the bottom of this file.
LANG='fr'
def preprocess_wiki_lm(path, lang='fr'):
    """Rewrite the wiki CSVs under *path* for language modelling and load them.

    Reads ``train.csv`` and ``valid.csv`` (both without a header row) from
    *path*, and writes ``lm_train.csv`` / ``lm_valid.csv`` next to them. Each
    output file has a constant ``0`` label as its first column and the
    input's first column as its second. The rewritten files are then loaded
    with ``text_data_from_csv``.

    Args:
        path: Directory containing ``train.csv`` and ``valid.csv``.
        lang: Language code handed to ``Tokenizer``. Defaults to ``'fr'``,
            the value that used to be hard-coded here.

    Returns:
        The object returned by ``text_data_from_csv`` for the rewritten CSVs.
    """
    data_dir = Path(path)
    splits = ('train', 'valid')

    # Read both inputs up front so a missing file fails before anything
    # is written.
    frames = {
        split: pd.read_csv(data_dir/f'{split}.csv', header=None)
        for split in splits
    }

    for split, frame in frames.items():
        # Constant dummy label; presumably text_data_from_csv expects the
        # label column first -- confirm against the fastai version in use.
        frame['labels'] = 0
        frame[['labels', 0]].to_csv(
            data_dir/f'lm_{split}.csv', index=False, header=None)

    # Load from the directory the files were just written to. The previous
    # code passed ``PATH`` here, a name that is not defined in this file.
    data_lm = text_data_from_csv(
        data_dir,
        tokenizer=Tokenizer(lang=lang),
        train='lm_train',
        valid='lm_valid',
        data_func=lm_data,
    )
    return data_lm
def train(data_lm, drop_mult=0.5, cycles=3):
    """Fit a language model on *data_lm* and return the learner.

    Args:
        data_lm: Data object accepted by ``RNNLearner.language_model``
            (in this file, the return value of ``preprocess_wiki_lm``).
        drop_mult: Forwarded to ``RNNLearner.language_model``; defaults to
            the previously hard-coded ``0.5``.
        cycles: Argument passed to ``fit_one_cycle``; defaults to the
            previously hard-coded ``3``.

    Returns:
        The fitted learner. It used to be discarded, which left the caller
        with no handle on the trained model.
    """
    learn = RNNLearner.language_model(data_lm, drop_mult=drop_mult)
    learn.fit_one_cycle(cycles)
    return learn
if __name__ == '__main__':
    # Script entry point: preprocess the wiki dump for LANG, then train on it.
    wiki_dir = f'/path/to/data/wiki/{LANG}'
    lm_data_bunch = preprocess_wiki_lm(wiki_dir)
    train(lm_data_bunch)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment