Skip to content

Instantly share code, notes, and snippets.

@cobanov
Created January 18, 2023 10:20
Show Gist options
  • Save cobanov/535997f902584eaf8a454c4132f52670 to your computer and use it in GitHub Desktop.
Save cobanov/535997f902584eaf8a454c4132f52670 to your computer and use it in GitHub Desktop.
from cuml.decomposition import PCA
import pandas as pd
import numpy as np
import cupy
import os
# Optional GPU selection: uncomment the two lines below to make cupy use the
# device with index GPU_ID.
# GPU_ID = 1
# cupy.cuda.Device(GPU_ID).use()
# Path of the embeddings file to reduce; its extension (".npy" or ".npz")
# selects the loading branch in read_embeddings.
INPUT_PATH = "/mnt/datauniverse/../.."
# Number of principal components, passed to PCA as n_components.
PCA_DIMSIZE = 256
def read_embeddings(INPUT_PATH, extension):
    """Load an embeddings matrix from a ``.npy`` or ``.npz`` file.

    For ``.npy`` input the stored 2-D array is loaded and its last column is
    dropped; the remaining columns are the embeddings (the dropped column is
    presumably a non-numeric label such as a file name -- confirm against the
    producer of the file).

    For ``.npz`` input the ``"embeddings"`` entry is used, and the
    ``"filelist"`` entry is written to ``filelist.csv`` in the current
    working directory.

    Args:
        INPUT_PATH: Path of the file to load.
        extension: File extension including the leading dot
            (``".npy"`` or ``".npz"``).

    Returns:
        The embeddings array.

    Raises:
        ValueError: If ``extension`` is neither ``".npy"`` nor ``".npz"``.
    """
    # NOTE(security): allow_pickle=True executes pickled payloads on load.
    # Only use this with files from a trusted source.
    if extension == ".npy":
        data = np.load(INPUT_PATH, allow_pickle=True)
        embeddings = data[:, :-1]
    elif extension == ".npz":
        data = np.load(INPUT_PATH, allow_pickle=True)
        embeddings = data["embeddings"]
        # Persist the accompanying file list next to the run.
        pd.DataFrame(data["filelist"]).to_csv(
            "filelist.csv", header=False, index=False
        )
        print("Filelist extracted and saved!")
    else:
        raise ValueError(
            f"Unsupported embeddings extension {extension!r}; "
            "expected '.npy' or '.npz'"
        )

    print("data loaded", embeddings.shape)
    # The original fell off the end here and implicitly returned None.
    return embeddings
def calculate_pca(embeddings, pca_dimsize):
    """Project ``embeddings`` onto ``pca_dimsize`` principal components.

    A fresh ``PCA`` model is fitted on ``embeddings`` and the transformed
    data is returned.

    Args:
        embeddings: Input data handed to ``PCA.fit_transform``.
        pca_dimsize: Value passed to ``PCA`` as ``n_components``.

    Returns:
        The result of ``PCA.fit_transform`` on ``embeddings``.
    """
    reducer = PCA(n_components=pca_dimsize)
    reduced = reducer.fit_transform(embeddings)
    return reduced
if __name__ == "__main__":
    # Script entry point: load embeddings from INPUT_PATH, reduce them with
    # PCA and save the result under ./pca/.
    #
    # Take the stem from the basename only. splitext() on the full path keeps
    # the directory part, which produced output names like "pca/pca_/mnt/...".
    file_name, extension = os.path.splitext(os.path.basename(INPUT_PATH))

    embeddings = read_embeddings(INPUT_PATH, extension)
    pca_out = calculate_pca(embeddings, PCA_DIMSIZE)

    # np.save does not create missing directories.
    output_dir = "pca"
    os.makedirs(output_dir, exist_ok=True)

    # np.save appends ".npy" to the name when it is missing.
    output_path = os.path.join(output_dir, f"pca_{file_name}_{PCA_DIMSIZE}dim")
    np.save(output_path, pca_out)
    print(f"PCA Calculated and saved: {output_path}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment