from gensim.models.fasttext import FastText

ft_model = FastText(
    sg=1,         # use skip-gram: usually gives better results
    size=100,     # embedding dimension (default)
    window=10,    # window size: 10 tokens before and 10 tokens after to get wider context
    min_count=5,  # only consider tokens with at least n occurrences in the corpus
    negative=15,  # negative sampling: bigger than default to sample negative examples more often
    min_n=2,      # min character n-gram length
    max_n=5       # max character n-gram length
)

ft_model.build_vocab(tok_text)  # tok_text is our tokenized input text - a list of lists of tokens, one inner list per document

ft_model.train(
    tok_text,
    epochs=6,
    total_examples=ft_model.corpus_count,
    total_words=ft_model.corpus_total_words)

ft_model.save('_fasttext.model')             # save
ft_model = FastText.load('_fasttext.model')  # load
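Once trained, the model can be queried through its wv attribute. A minimal sketch of how that might look; 'river' and 'riverbank' are hypothetical tokens standing in for words from your own corpus:

# Look up the nearest neighbours of an in-vocabulary word
print(ft_model.wv.most_similar('river', topn=5))

# Because FastText composes vectors from character n-grams, it can also
# return a vector for a word it never saw during training
oov_vector = ft_model.wv['riverbank']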
It looks like in newer versions of gensim (4.0+), this may need to be updated to read vector_size=100 instead of size=100.
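For reference, a minimal sketch of the same instantiation under gensim 4.x, where size was renamed to vector_size (the remaining arguments keep their names):

from gensim.models.fasttext import FastText

# Assuming gensim >= 4.0: `size` became `vector_size`; other parameters are unchanged
ft_model = FastText(
    sg=1,
    vector_size=100,
    window=10,
    min_count=5,
    negative=15,
    min_n=2,
    max_n=5
)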