howard-haowen · January 12, 2021 09:53
diff --git a/PrepareDataForFasttext.py b/PrepareDataForFasttext.py
 import io
 import pandas as pd
 from sklearn.model_selection import train_test_split

 # fasttext.train_unsupervised for word embeddings
 def CreateTxt(df, text_col="text"):
     """Write the documents in ``df[text_col]`` to ``output.txt``, one per line.

     The file is intended as the plain-text corpus for
     ``fasttext.train_unsupervised``.

     Args:
         df: DataFrame holding the corpus.
         text_col: Name of the column whose cells each contain one tokenized
             document, with tokens separated by spaces.
     """
     # Write the lines directly rather than through ``to_csv``: the CSV
     # writer emits the column name as a first "document" and wraps any text
     # containing a double quote in extra quotes, both of which would end up
     # in the training corpus.
     with open('output.txt', 'w', encoding='utf-8') as corpus:
         for doc in df[text_col]:
             corpus.write(str(doc) + "\n")

 def CreateVecAndMeta(ft_model=None):
     """Export a model's vocabulary and word vectors as two TSV files.

     ``fasttext_meta.tsv`` receives one token per line and
     ``fasttext_vecs.tsv`` receives the matching vector on the same line
     number. The two files can then be used for visualizing embeddings at
     https://projector.tensorflow.org

     Args:
         ft_model: Object exposing ``words`` and ``get_word_vector(word)``.
             When omitted, the module-level ``model`` is used, which is what
             this function always did before the parameter existed.
     """
     if ft_model is None:
         # Backward compatible fallback to the global the original relied on.
         ft_model = model
     # Vector file: `\t` separates the vector components and `\n` separates
     # the words, e.g.
     #   0.1\t0.2\t0.5\t0.9
     #   0.2\t0.1\t5.0\t0.2
     # Metadata file: one word per line, e.g.
     #   token1
     #   token2
     # ``with`` guarantees both files are closed even if a write fails.
     with open('fasttext_vecs.tsv', 'w', encoding='utf-8') as out_vec, \
             open('fasttext_meta.tsv', 'w', encoding='utf-8') as out_meta:
         for word in ft_model.words:
             vec = ft_model.get_word_vector(word)
             out_meta.write(word + "\n")
             out_vec.write('\t'.join([str(x) for x in vec]) + "\n")

 def ShowSimilarWords(ft_model, word_list):
     """Print the nearest-neighbour lookup result for each word.

     For every entry of ``word_list``, the value returned by
     ``ft_model.get_nearest_neighbors(entry)`` is printed on its own line.
     Nothing is returned.
     """
     for query in word_list:
         print(ft_model.get_nearest_neighbors(query))

 ###=====###

 # fasttext.train_supervised for Text classification
 def InsertLabelTag(df, label_col="label", text_col="text"):
     """Prefix every value in ``df[label_col]`` with ``__label__``.

     Args:
         df: DataFrame to tag; its label column is rewritten in place.
         label_col: Name of the column holding the class labels.
         text_col: Unused; kept so existing callers keep working.

     Returns:
         The same ``df`` object, with the label column rewritten.
     """
     # Cast to ``str`` first so numeric labels no longer raise TypeError on
     # concatenation. Plain column assignment replaces the column outright,
     # so the new strings are not forced into the old column's dtype.
     df[label_col] = "__label__" + df[label_col].astype(str)
     return df

 def SplitData(df, test_size=0.2, random_state=None):
     """Split ``df`` into train/test parts and write each part to a text file.

     The parts are written to ``train_data.txt`` and ``test_data.txt`` with
     one row per line, cells joined by a single space, and no header or
     index.

     Args:
         df: DataFrame to split.
         test_size: Forwarded to ``train_test_split``.
         random_state: Forwarded to ``train_test_split``; pass a fixed value
             to make the split reproducible. Defaults to ``None``, which
             keeps the previous behaviour.

     Returns:
         The ``(train_df, test_df)`` pair produced by ``train_test_split``.
     """
     train_df, test_df = train_test_split(
         df, test_size=test_size, random_state=random_state)
     # Rows are written by hand instead of ``to_csv(sep=' ')``: the CSV
     # writer wraps every cell that contains a space (i.e. the tokenized
     # text) in double quotes, and those quotes would become part of the
     # first and last tokens. Missing values are written via ``str()``.
     for part, path in ((train_df, 'train_data.txt'), (test_df, 'test_data.txt')):
         with open(path, 'w', encoding='utf-8') as out:
             for row in part.itertuples(index=False, name=None):
                 out.write(' '.join(str(value) for value in row) + '\n')
     return train_df, test_df

 def GetConfusionMatrix(test_df=None, text_col="text", fastText_model=None,
                        label_col="label"):
     """Predict a label for every test document and cross-tabulate the result.

     A ``prediction`` column is added to ``test_df`` (in place), holding
     ``fastText_model.predict(text)[0][0]`` for each cell of ``text_col``.

     Args:
         test_df: DataFrame with the text and true-label columns. When
             omitted, the module-level ``test_df`` is looked up at call time.
         text_col: Name of the column holding the documents.
         fastText_model: Object exposing ``predict(text)``. When omitted, the
             module-level ``model`` is looked up at call time.
         label_col: Name of the column holding the true labels.

     Returns:
         ``pd.crosstab`` of true labels (rows, named ``Actual``) against
         predictions (columns, named ``Predicted``).

     Raises:
         ValueError: If no DataFrame or model was passed and none is defined
             at module level.
     """
     # The defaults used to be ``test_df=test_df`` / ``fastText_model=model``,
     # which are evaluated when the ``def`` runs and raise NameError unless
     # those globals already exist. Resolve them lazily instead.
     if test_df is None:
         test_df = globals().get("test_df")
     if fastText_model is None:
         fastText_model = globals().get("model")
     if test_df is None or fastText_model is None:
         raise ValueError(
             "GetConfusionMatrix needs a test DataFrame and a fastText model")
     test_df["prediction"] = test_df[text_col].apply(
         lambda text: fastText_model.predict(text)[0][0])
     confusion_matrix = pd.crosstab(
         test_df[label_col], test_df['prediction'],
         rownames=['Actual'], colnames=['Predicted'])
     return confusion_matrix
	import io
	import pandas as pd
	from sklearn.model_selection import train_test_split

	# fasttext.train_unsupervised for word embeddings
	def CreateTxt(df, text_col="text"):
	    """Write the documents in ``df[text_col]`` to ``output.txt``, one per line.

	    The file is intended as the plain-text corpus for
	    ``fasttext.train_unsupervised``.

	    Args:
	        df: DataFrame holding the corpus.
	        text_col: Name of the column whose cells each contain one tokenized
	            document, with tokens separated by spaces.
	    """
	    # Write the lines directly rather than through ``to_csv``: the CSV
	    # writer emits the column name as a first "document" and wraps any text
	    # containing a double quote in extra quotes, both of which would end up
	    # in the training corpus.
	    with open('output.txt', 'w', encoding='utf-8') as corpus:
	        for doc in df[text_col]:
	            corpus.write(str(doc) + "\n")

	def CreateVecAndMeta(ft_model=None):
	    """Export a model's vocabulary and word vectors as two TSV files.

	    ``fasttext_meta.tsv`` receives one token per line and
	    ``fasttext_vecs.tsv`` receives the matching vector on the same line
	    number. The two files can then be used for visualizing embeddings at
	    https://projector.tensorflow.org

	    Args:
	        ft_model: Object exposing ``words`` and ``get_word_vector(word)``.
	            When omitted, the module-level ``model`` is used, which is what
	            this function always did before the parameter existed.
	    """
	    if ft_model is None:
	        # Backward compatible fallback to the global the original relied on.
	        ft_model = model
	    # Vector file: `\t` separates the vector components and `\n` separates
	    # the words, e.g.
	    #   0.1\t0.2\t0.5\t0.9
	    #   0.2\t0.1\t5.0\t0.2
	    # Metadata file: one word per line, e.g.
	    #   token1
	    #   token2
	    # ``with`` guarantees both files are closed even if a write fails.
	    with open('fasttext_vecs.tsv', 'w', encoding='utf-8') as out_vec, \
	            open('fasttext_meta.tsv', 'w', encoding='utf-8') as out_meta:
	        for word in ft_model.words:
	            vec = ft_model.get_word_vector(word)
	            out_meta.write(word + "\n")
	            out_vec.write('\t'.join([str(x) for x in vec]) + "\n")

	def ShowSimilarWords(ft_model, word_list):
	    """Print the nearest-neighbour lookup result for each word.

	    For every entry of ``word_list``, the value returned by
	    ``ft_model.get_nearest_neighbors(entry)`` is printed on its own line.
	    Nothing is returned.
	    """
	    # Loop body indentation restored; without it the block does not parse.
	    for w in word_list:
	        res = ft_model.get_nearest_neighbors(w)
	        print(res)

	###=====###

	# fasttext.train_supervised for Text classification
	def InsertLabelTag(df, label_col="label", text_col="text"):
	    """Prefix every value in ``df[label_col]`` with ``__label__``.

	    Args:
	        df: DataFrame to tag; its label column is rewritten in place.
	        label_col: Name of the column holding the class labels.
	        text_col: Unused; kept so existing callers keep working.

	    Returns:
	        The same ``df`` object, with the label column rewritten.
	    """
	    # Cast to ``str`` first so numeric labels no longer raise TypeError on
	    # concatenation. Plain column assignment replaces the column outright,
	    # so the new strings are not forced into the old column's dtype.
	    df[label_col] = "__label__" + df[label_col].astype(str)
	    return df

	def SplitData(df, test_size=0.2, random_state=None):
	    """Split ``df`` into train/test parts and write each part to a text file.

	    The parts are written to ``train_data.txt`` and ``test_data.txt`` with
	    one row per line, cells joined by a single space, and no header or
	    index.

	    Args:
	        df: DataFrame to split.
	        test_size: Forwarded to ``train_test_split``.
	        random_state: Forwarded to ``train_test_split``; pass a fixed value
	            to make the split reproducible. Defaults to ``None``, which
	            keeps the previous behaviour.

	    Returns:
	        The ``(train_df, test_df)`` pair produced by ``train_test_split``.
	    """
	    train_df, test_df = train_test_split(
	        df, test_size=test_size, random_state=random_state)
	    # Rows are written by hand instead of ``to_csv(sep=' ')``: the CSV
	    # writer wraps every cell that contains a space (i.e. the tokenized
	    # text) in double quotes, and those quotes would become part of the
	    # first and last tokens. Missing values are written via ``str()``.
	    for part, path in ((train_df, 'train_data.txt'), (test_df, 'test_data.txt')):
	        with open(path, 'w', encoding='utf-8') as out:
	            for row in part.itertuples(index=False, name=None):
	                out.write(' '.join(str(value) for value in row) + '\n')
	    return train_df, test_df

	def GetConfusionMatrix(test_df=None, text_col="text", fastText_model=None,
	                       label_col="label"):
	    """Predict a label for every test document and cross-tabulate the result.

	    A ``prediction`` column is added to ``test_df`` (in place), holding
	    ``fastText_model.predict(text)[0][0]`` for each cell of ``text_col``.

	    Args:
	        test_df: DataFrame with the text and true-label columns. When
	            omitted, the module-level ``test_df`` is looked up at call time.
	        text_col: Name of the column holding the documents.
	        fastText_model: Object exposing ``predict(text)``. When omitted, the
	            module-level ``model`` is looked up at call time.
	        label_col: Name of the column holding the true labels.

	    Returns:
	        ``pd.crosstab`` of true labels (rows, named ``Actual``) against
	        predictions (columns, named ``Predicted``).

	    Raises:
	        ValueError: If no DataFrame or model was passed and none is defined
	            at module level.
	    """
	    # The defaults used to be ``test_df=test_df`` / ``fastText_model=model``,
	    # which are evaluated when the ``def`` runs and raise NameError unless
	    # those globals already exist. Resolve them lazily instead.
	    if test_df is None:
	        test_df = globals().get("test_df")
	    if fastText_model is None:
	        fastText_model = globals().get("model")
	    if test_df is None or fastText_model is None:
	        raise ValueError(
	            "GetConfusionMatrix needs a test DataFrame and a fastText model")
	    test_df["prediction"] = test_df[text_col].apply(
	        lambda text: fastText_model.predict(text)[0][0])
	    confusion_matrix = pd.crosstab(
	        test_df[label_col], test_df['prediction'],
	        rownames=['Actual'], colnames=['Predicted'])
	    return confusion_matrix