Skip to content

Instantly share code, notes, and snippets.

@howard-haowen
Last active January 12, 2021 09:53
Show Gist options
  • Save howard-haowen/b7f0b1faf89fa16869e0e72099efa570 to your computer and use it in GitHub Desktop.
Save howard-haowen/b7f0b1faf89fa16869e0e72099efa570 to your computer and use it in GitHub Desktop.
Some utility functions for the fastText library
import io
import pandas as pd
from sklearn.model_selection import train_test_split
# fasttext.train_unsupervised for word embeddings
def CreateTxt(df, text_col="text", output_path="output.txt"):
    """Write one document per line to a text file for ``fasttext.train_unsupervised``.

    Each cell in ``df[text_col]`` is expected to hold a single tokenized
    document, with tokens separated by spaces.

    Args:
        df: DataFrame holding the corpus.
        text_col: Name of the column that contains the documents.
        output_path: Destination file; defaults to ``output.txt`` in the
            current working directory.

    The lines are written directly instead of going through ``Series.to_csv``:
    the CSV writer emits the column name as a header row (which would be
    trained on as if it were a document) and quotes/escapes any document that
    contains a double quote. Missing values are skipped.
    """
    with io.open(output_path, "w", encoding="utf-8") as out_file:
        for document in df[text_col].dropna():
            out_file.write(str(document) + "\n")
def CreateVecAndMeta(ft_model=None, vec_path='fasttext_vecs.tsv',
                     meta_path='fasttext_meta.tsv'):
    """Export a model's vocabulary and word vectors as two TSV files.

    The two files can then be used for visualizing embeddings at
    https://projector.tensorflow.org

    Vector file: one word per line, vector components separated by tabs::

        0.1\\t0.2\\t0.5\\t0.9
        0.2\\t0.1\\t5.0\\t0.2
        0.4\\t0.1\\t7.0\\t0.8

    Metadata file: one token per line, in the same order as the vectors::

        token1
        token2
        token3

    Args:
        ft_model: Object exposing ``words`` and ``get_word_vector(word)``.
            When omitted, the module-level ``model`` is used, which is what
            this function always did before the parameter existed.
        vec_path: Destination of the vector file.
        meta_path: Destination of the metadata file.
    """
    if ft_model is None:
        # Backward compatibility: fall back to the global ``model``.
        ft_model = model

    # ``with`` guarantees both files are closed even if a lookup fails midway.
    with io.open(vec_path, 'w', encoding='utf-8') as out_vec, \
            io.open(meta_path, 'w', encoding='utf-8') as out_meta:
        for word in ft_model.words:
            vector = ft_model.get_word_vector(word)
            out_meta.write(word + "\n")
            out_vec.write('\t'.join(str(component) for component in vector) + "\n")
def ShowSimilarWords(ft_model, word_list):
    """Print, and return, the nearest neighbours of each query word.

    Args:
        ft_model: Object exposing ``get_nearest_neighbors(word)``.
        word_list: Iterable of query words. A single string is treated as one
            word rather than being iterated character by character.

    Returns:
        dict mapping each query word to whatever
        ``ft_model.get_nearest_neighbors`` returned for it.
    """
    if isinstance(word_list, str):
        word_list = [word_list]

    neighbors_by_word = {}
    for word in word_list:
        neighbors = ft_model.get_nearest_neighbors(word)
        print(neighbors)
        neighbors_by_word[word] = neighbors
    return neighbors_by_word
###=====###
# fasttext.train_supervised for text classification
def InsertLabelTag(df, label_col="label", text_col="text"):
    """Prefix every value in ``label_col`` with ``__label__``.

    The column is replaced on ``df`` itself and the same DataFrame is
    returned.

    Args:
        df: DataFrame holding the labelled examples; modified in place.
        label_col: Name of the column that contains the class labels.
        text_col: Unused; kept so existing callers that pass it keep working.

    Returns:
        The same ``df`` object, with ``label_col`` rewritten.

    Labels are converted with ``str`` first so numeric labels no longer raise
    ``TypeError``, and labels that already carry the prefix are left alone so
    calling the function twice does not double-tag them.
    """
    prefix = "__label__"

    def _tag(value):
        label = str(value)
        return label if label.startswith(prefix) else prefix + label

    # Plain column assignment replaces the column outright, so a dtype change
    # (e.g. int labels becoming strings) is not a problem.
    df[label_col] = df[label_col].apply(_tag)
    return df
def _write_space_separated(frame, path):
    """Write each row of ``frame`` as one line, with cells joined by single spaces."""
    with io.open(path, 'w', encoding='utf-8') as out_file:
        for row in frame.itertuples(index=False, name=None):
            out_file.write(' '.join(str(cell) for cell in row) + '\n')


def SplitData(df, test_size=0.2, random_state=None,
              train_path='train_data.txt', test_path='test_data.txt'):
    """Split ``df`` into train/test parts and dump both as plain-text files.

    Every row becomes one line made of its cells, in column order, joined by
    spaces and without header or index.

    Args:
        df: DataFrame to split. NOTE(review): for ``fasttext.train_supervised``
            the label column presumably has to come first - confirm column
            order at the call site.
        test_size: Forwarded to ``train_test_split``.
        random_state: Forwarded to ``train_test_split``; pass an int for a
            reproducible split. ``None`` keeps the previous behaviour.
        train_path: Destination of the training file.
        test_path: Destination of the test file.

    Returns:
        Tuple ``(train_df, test_df)``.

    The rows are written by hand instead of via ``DataFrame.to_csv(sep=' ')``:
    the CSV writer wraps every cell that contains the separator in double
    quotes, so any multi-word text ended up quoted in the output files.
    """
    train_df, test_df = train_test_split(
        df, test_size=test_size, random_state=random_state
    )
    _write_space_separated(train_df, train_path)
    _write_space_separated(test_df, test_path)
    return train_df, test_df
def GetConfusionMatrix(test_df=None, text_col="text", fastText_model=None,
                       label_col="label"):
    """Build a confusion matrix of actual vs. predicted labels.

    Args:
        test_df: DataFrame with the evaluation examples. A ``"prediction"``
            column is added to it as a side effect. When omitted, the
            module-level ``test_df`` is looked up at call time.
        text_col: Name of the column whose values are passed to ``predict``.
        fastText_model: Object exposing ``predict(text)``; the first element
            of the first item of its return value is taken as the predicted
            label. When omitted, the module-level ``model`` is looked up at
            call time.
        label_col: Name of the column holding the true labels.

    Returns:
        DataFrame from ``pd.crosstab`` with rows named ``Actual`` and columns
        named ``Predicted``.

    Raises:
        ValueError: If no data frame or model was passed and no module-level
            fallback exists.

    The defaults used to be ``test_df=test_df`` and ``fastText_model=model``,
    which are evaluated when the ``def`` statement runs and therefore raised
    ``NameError`` on import unless both globals already existed.
    """
    if test_df is None:
        test_df = globals().get("test_df")
    if fastText_model is None:
        fastText_model = globals().get("model")
    if test_df is None or fastText_model is None:
        raise ValueError(
            "GetConfusionMatrix needs both a test DataFrame and a fastText model"
        )

    test_df["prediction"] = test_df[text_col].apply(
        lambda text: fastText_model.predict(text)[0][0]
    )
    confusion_matrix = pd.crosstab(
        test_df[label_col],
        test_df["prediction"],
        rownames=['Actual'],
        colnames=['Predicted'],
    )
    return confusion_matrix
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment