Last active
January 12, 2021 09:53
-
-
Save howard-haowen/b7f0b1faf89fa16869e0e72099efa570 to your computer and use it in GitHub Desktop.
Some utility functions for the fastText library
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import io | |
import pandas as pd | |
from sklearn.model_selection import train_test_split | |
# fasttext.train_unsupervised for word embeddings | |
def CreateTxt(df, text_col="text", output_path="output.txt"):
    """Write the documents in ``df[text_col]`` to a plain-text corpus file.

    The resulting file holds one document per line and is meant to be used
    as input for ``fasttext.train_unsupervised``.

    Args:
        df: DataFrame holding the corpus.
        text_col: Name of the column whose cells each contain a single
            tokenized document, with tokens separated by spaces.
        output_path: Destination file; defaults to ``output.txt`` in the
            current working directory.

    Missing values in ``text_col`` are skipped. Documents are assumed not
    to contain embedded newlines.
    """
    documents = df[text_col].dropna()
    # Write the lines directly instead of going through ``to_csv``: the CSV
    # writer would emit the column name as a first "document" and wrap any
    # text containing a double quote in extra quotes, polluting the corpus.
    with open(output_path, "w", encoding="utf-8") as corpus:
        corpus.writelines(str(doc) + "\n" for doc in documents)
def CreateVecAndMeta(ft_model=None, vec_path='fasttext_vecs.tsv',
                     meta_path='fasttext_meta.tsv'):
    """Export a model's vocabulary and word vectors as two TSV files.

    The two files can then be used for visualizing embeddings at
    https://projector.tensorflow.org

    Vector file: one line per word, vector components separated by tabs::

        0.1<TAB>0.2<TAB>0.5<TAB>0.9
        0.2<TAB>0.1<TAB>5.0<TAB>0.2

    Metadata file: one word per line, in the same order as the vectors::

        token1
        token2

    Args:
        ft_model: Object exposing a ``words`` sequence and a
            ``get_word_vector(word)`` method. When omitted, the
            module-level variable ``model`` is used, which keeps the
            original zero-argument call working.
        vec_path: Destination of the vector file.
        meta_path: Destination of the metadata file.
    """
    if ft_model is None:
        # Backward compatibility: the original relied on a global `model`.
        ft_model = model

    # `with` guarantees both files are closed even if a lookup fails midway.
    with open(vec_path, 'w', encoding='utf-8') as out_vec, \
            open(meta_path, 'w', encoding='utf-8') as out_meta:
        for word in ft_model.words:
            vec = ft_model.get_word_vector(word)
            out_meta.write(word + "\n")
            out_vec.write('\t'.join(str(x) for x in vec) + "\n")
def ShowSimilarWords(ft_model, word_list):
    """Print the nearest neighbors reported by ``ft_model`` for each word.

    Args:
        ft_model: Object exposing ``get_nearest_neighbors(word)``.
        word_list: Iterable of query words.
    """
    # `map` is lazy, so each lookup still happens right before its print.
    for neighbors in map(ft_model.get_nearest_neighbors, word_list):
        print(neighbors)
###=====### | |
# fasttext.train_supervised for Text classification | |
def InsertLabelTag(df, label_col="label", text_col="text"):
    """Prefix every value in ``df[label_col]`` with fastText's ``__label__`` tag.

    The column is replaced on ``df`` itself and the same DataFrame is
    returned, so both ``InsertLabelTag(df)`` and ``df = InsertLabelTag(df)``
    work.

    Args:
        df: DataFrame holding the labelled data.
        label_col: Name of the column containing the class labels.
        text_col: Unused; kept so existing calls remain valid.

    Returns:
        The input DataFrame with the tagged label column.
    """
    prefix = "__label__"
    # Cast first so numeric labels (e.g. 0/1) do not raise a TypeError on
    # string concatenation.
    labels = df[label_col].astype(str)
    # Leave already-tagged labels alone so calling this twice is harmless.
    df[label_col] = labels.where(labels.str.startswith(prefix), prefix + labels)
    return df
def _write_fasttext_lines(frame, path):
    """Write each row of ``frame`` to ``path`` as one space-joined line."""
    with open(path, "w", encoding="utf-8") as handle:
        for row in frame.itertuples(index=False, name=None):
            handle.write(" ".join(str(value) for value in row) + "\n")


def SplitData(df, test_size=0.2, random_state=None,
              train_path='train_data.txt', test_path='test_data.txt'):
    """Split ``df`` into train/test sets and write both as fastText input files.

    Each row is written as a single line with its column values joined by
    a space, without header or index. For ``fasttext.train_supervised`` the
    label column is presumably expected to come first; column order is
    taken from ``df`` as-is.

    Args:
        df: DataFrame to split.
        test_size: Passed to ``train_test_split``.
        random_state: Passed to ``train_test_split``; set it to get a
            reproducible split.
        train_path: Destination of the training file.
        test_path: Destination of the test file.

    Returns:
        Tuple ``(train_df, test_df)``.
    """
    train_df, test_df = train_test_split(
        df, test_size=test_size, random_state=random_state
    )
    # Rows are written by hand rather than with ``to_csv(sep=' ')``: the CSV
    # writer wraps every field containing a space (i.e. nearly every text)
    # in double quotes, and those quotes would end up inside the tokens.
    _write_fasttext_lines(train_df, train_path)
    _write_fasttext_lines(test_df, test_path)
    return train_df, test_df
def GetConfusionMatrix(test_df=None, text_col="text", fastText_model=None,
                       label_col="label"):
    """Build a confusion matrix of actual vs. predicted labels.

    A ``"prediction"`` column holding the top predicted label for each text
    is added to ``test_df`` as a side effect.

    Args:
        test_df: DataFrame with the texts and their true labels. When
            omitted, the module-level variable ``test_df`` is used.
        text_col: Name of the column containing the texts to classify.
        fastText_model: Object whose ``predict(text)`` result is indexed
            as ``[0][0]`` to obtain the top label. When omitted, the
            module-level variable ``model`` is used.
        label_col: Name of the column containing the true labels.

    Returns:
        DataFrame from ``pd.crosstab`` with rows named "Actual" and
        columns named "Predicted".

    Raises:
        ValueError: If an argument is omitted and the corresponding
            module-level variable does not exist.
    """
    # The defaults are resolved at call time. Binding them in the signature
    # (``test_df=test_df``) is evaluated when the function is defined and
    # raises NameError unless the globals already exist at that point.
    if test_df is None:
        test_df = globals().get("test_df")
    if fastText_model is None:
        fastText_model = globals().get("model")
    if test_df is None or fastText_model is None:
        raise ValueError(
            "Pass test_df and fastText_model explicitly, or define the "
            "module-level variables 'test_df' and 'model' first."
        )

    test_df["prediction"] = test_df[text_col].apply(
        lambda text: fastText_model.predict(text)[0][0]
    )
    confusion_matrix = pd.crosstab(
        test_df[label_col],
        test_df["prediction"],
        rownames=['Actual'],
        colnames=['Predicted'],
    )
    return confusion_matrix
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment