Skip to content

Instantly share code, notes, and snippets.

@kudkudak
Created June 2, 2015 20:11
Show Gist options
  • Save kudkudak/9a74ebef2368331847cb to your computer and use it in GitHub Desktop.
Save kudkudak/9a74ebef2368331847cb to your computer and use it in GitHub Desktop.
def _pad_columns(matrix, n_cols):
    """Return `matrix` right-padded with zero columns up to `n_cols` columns."""
    missing = n_cols - matrix.shape[1]
    if missing == 0:
        return matrix
    # Pad with the matrix's own dtype so the result is not silently upcast.
    padding = np.zeros(shape=(matrix.shape[0], missing), dtype=matrix.dtype)
    return np.hstack([matrix, padding])


def get_sabina_clusters(protein, fingerprint):
    """Load the active clusters and inactives and merge them into one dataset.

    The file names come from `get_protein_fingerprint_cluster_files` and are
    resolved relative to `c["DATA_DIR"]`. Every file is read as a header-less
    CSV and cast to float32. Matrices narrower than the widest one are
    right-padded with zero columns.

    Active rows are de-duplicated across clusters: a row whose L1 distance to
    an already collected row is exactly 0 is not added again; the cluster
    refers to the index of the existing row instead.

    Args:
        protein: forwarded to `get_protein_fingerprint_cluster_files`.
        fingerprint: forwarded to `get_protein_fingerprint_cluster_files`.

    Returns:
        A tuple `(X, Y, clusters_active_ids, inactive_ids)` where
        * `X` stacks the unique active rows followed by all inactive rows,
        * `Y` has shape `(X.shape[0], 1)` with 1 for active rows and -1 for
          inactive rows,
        * `clusters_active_ids` is a list of 1-D index arrays into `X`, one
          per active cluster. NOTE: the biggest cluster comes first, then the
          remaining clusters in their original order,
        * `inactive_ids` is a 1-D array with the row indices of the inactives.

    Raises:
        ValueError: if there are no active clusters, or a (non-biggest)
            cluster contains NaN/inf values.
    """
    # Read data
    actives, inactives = get_protein_fingerprint_cluster_files(protein, fingerprint)
    # `.values` replaces `.as_matrix()`, which was removed in pandas 1.0.
    clusters_active = [
        pd.read_csv(os.path.join(c["DATA_DIR"], a), header=None).values.astype("float32")
        for a, _ in actives
    ]
    inactive = pd.read_csv(
        os.path.join(c["DATA_DIR"], inactives), header=None
    ).values.astype("float32")

    if not clusters_active:
        raise ValueError("No active clusters found for %r / %r." % (protein, fingerprint))

    # Standardize sizes
    max_cols = max(inactive.shape[1], max(cl.shape[1] for cl in clusters_active))
    clusters_active = [_pad_columns(cl, max_cols) for cl in clusters_active]
    inactive = _pad_columns(inactive, max_cols)

    # Start with biggest
    biggest_id = np.argsort([-cluster.shape[0] for cluster in clusters_active])[0]
    X = clusters_active[biggest_id]
    clusters_active_ids = [np.arange(X.shape[0])]

    for cluster_id, cluster in enumerate(clusters_active):
        if cluster_id == biggest_id:
            continue
        if not np.isfinite(cluster).all():
            raise ValueError("F*CK, nan in cluster file.")

        distances = pairwise_distances(cluster, X, metric='l1')
        nearest = distances.argmin(axis=1)
        is_new = distances.min(axis=1) != 0

        # Rows already present in X are referenced by the index of their
        # match in X (not by their position inside `cluster`).
        existing_ids = nearest[~is_new]
        start_id = X.shape[0]
        new_ids = np.arange(start_id, start_id + int(is_new.sum()))

        X = np.vstack([X, cluster[is_new]])
        clusters_active_ids.append(np.concatenate([existing_ids, new_ids]))
        assert len(clusters_active_ids[-1]) == cluster.shape[0]

    # Everything collected so far is active; inactives are appended after it.
    n_active = X.shape[0]

    # Add inactives
    X = np.vstack([X, inactive])
    inactive_ids = np.arange(n_active, n_active + inactive.shape[0])

    # Labels
    Y = np.ones(shape=(X.shape[0], 1))
    Y[n_active:] = -1

    return X, Y, [np.array(cl).reshape(-1) for cl in clusters_active_ids], inactive_ids
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment