# Skip to content
#
# Instantly share code, notes, and snippets.
#
# @cobanov
# Created April 4, 2022 19:15
# Show Gist options
#   • Save cobanov/907abc867a4a7eba545fbef3f7754936 to your computer and use it in GitHub Desktop.
import pandas as pd
import umap
import sys
import numpy as np
import os
def load_data(file_path, dimension=3):
    """Load a dataset from a ``.npy`` or ``.csv`` file.

    The loader is chosen from the file extension: ``.npy`` files are read
    with ``numpy.load`` and ``.csv`` files with ``pandas.read_csv``.

    Args:
        file_path: Path of the file to read (a string).
        dimension: Unused by this function; kept so existing callers that
            pass it keep working.

    Returns:
        The loaded NumPy array for ``.npy`` input, or a pandas DataFrame
        with every row containing a missing value removed for ``.csv``
        input.

    Raises:
        ValueError: If ``file_path`` ends with neither ``.npy`` nor
            ``.csv``.
    """
    if file_path.endswith('.npy'):
        data = np.load(file_path)
    elif file_path.endswith('.csv'):
        # dropna() is a DataFrame method, so it is applied to CSV input
        # only; a NumPy array has no such method.
        data = pd.read_csv(file_path).dropna()
    else:
        # Fail with a clear message instead of an UnboundLocalError on
        # the return statement below.
        raise ValueError(
            "Unsupported file extension for {!r}; expected '.npy' or "
            "'.csv'".format(file_path)
        )
    print('Data has been read!')
    return data
def dimension_reduction(
    data,
    *,
    n_neighbors=10,
    n_components=6,
    min_dist=0.1,
    metric='sokalsneath',
    spread=18,
):
    """Project ``data`` into a lower-dimensional space with UMAP.

    The keyword-only arguments are passed unchanged to ``umap.UMAP``;
    their defaults are the values this script previously hard-coded, so
    calling ``dimension_reduction(data)`` behaves as before.

    Args:
        data: Input accepted by ``umap.UMAP.fit_transform``.
        n_neighbors: Forwarded to ``umap.UMAP``.
        n_components: Forwarded to ``umap.UMAP``.
        min_dist: Forwarded to ``umap.UMAP``.
        metric: Forwarded to ``umap.UMAP``.
            NOTE(review): 'sokalsneath' appears to be one of UMAP's
            binary-data metrics -- confirm the input is binary.
        spread: Forwarded to ``umap.UMAP``.

    Returns:
        The result of ``fit_transform`` on ``data``.
    """
    reducer = umap.UMAP(
        n_neighbors=n_neighbors,
        n_components=n_components,
        min_dist=min_dist,
        metric=metric,
        spread=spread,
    )
    embedding = reducer.fit_transform(data)
    print('Embeddings are created!')
    return embedding
def write_csv(data, embedding, output_path):
    """Write an embedding to ``output_path`` as a CSV file.

    Columns are named ``v1``, ``v2``, ... up to the embedding's actual
    width, so the function is not tied to six components. No index column
    is written.

    Args:
        data: Unused; kept so existing callers keep working. (Attaching a
            label column from ``data['words']`` was left disabled in the
            original code.)
        embedding: Two-dimensional array-like of embedding coordinates,
            one row per sample.
        output_path: Destination path for the CSV file.
    """
    embedding_df = pd.DataFrame(embedding)
    # Name the columns from the real width rather than assuming six.
    embedding_df.columns = [
        'v{}'.format(i) for i in range(1, embedding_df.shape[1] + 1)
    ]
    print('Dataframe is saving...')
    embedding_df.to_csv(output_path, index=False)
# data = pd.read_csv(FILE_PATH)
#fit = umap.UMAP(n_neighbors=10, min_dist=0.1, n_components=6, metric='cosine', spread=10)
# u = fit.fit_transform(data.iloc[:, :-1])
# dim_red = pd.DataFrame(u, columns=['v1', 'v2', 'v3', 'v4', 'v5', 'v6'])
# if __name__ == '__main__':
# for file in os.listdir('./rumi-text-umaps'):
# if file.endswith('.csv'):
# data = load_data('./rumi-text-umaps/' + file)
# embedding = dimension_reduction(data)
# write_csv(data, embedding, file.replace('v2', 'v3'))
if __name__ == '__main__':
    # Command-line entry point:
    #   python <script> <input_file> <output_file>
    # The input file is read by load_data, reduced by dimension_reduction,
    # and the resulting embedding is written by write_csv.
    if len(sys.argv) < 3:
        # Exit with a usage message instead of an IndexError traceback.
        sys.exit(
            'Usage: {} <input_file> <output_file>'.format(sys.argv[0])
        )
    input_file = sys.argv[1]
    output_file = sys.argv[2]
    data = load_data(input_file)
    embedding = dimension_reduction(data)
    write_csv(data, embedding, output_file)
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment