kudkudak · June 2, 2015 20:11
diff --git a/gistfile1.py b/gistfile1.py
 def get_sabina_clusters(protein, fingerprint):
     """Merge the active cluster files and the inactive file into one data set.

     The file names come from ``get_protein_fingerprint_cluster_files(protein,
     fingerprint)`` and are resolved relative to ``c["DATA_DIR"]``; each file
     is read as a header-less CSV and converted to ``float32``.
     NOTE(review): the meaning of ``protein`` / ``fingerprint`` is decided by
     that helper, which is not visible here.

     Matrices with fewer columns than the widest one are right-padded with
     zeros. Active rows that are identical (L1 distance 0) to a row already
     collected are not stored twice; the cluster refers to the stored row.

     Returns:
         A tuple ``(X, Y, clusters_active_ids, inactive_ids)``:

         * ``X`` -- unique active rows followed by all inactive rows.
         * ``Y`` -- array of shape ``(X.shape[0], 1)``: ``1`` for active rows,
           ``-1`` for inactive rows.
         * ``clusters_active_ids`` -- list of 1-D index arrays into ``X``, one
           per active cluster. The biggest cluster comes first, the remaining
           clusters follow in file order. Within a cluster the indices of
           already-stored (duplicate) rows precede the newly added ones.
         * ``inactive_ids`` -- 1-D index array of the inactive rows in ``X``.

     Raises:
         ValueError: if there are no active cluster files, or a cluster
             contains NaN / infinite values.
     """

     def _load(file_name):
         # ``.values`` replaces ``as_matrix()``, which pandas 1.0 removed.
         path = os.path.join(c["DATA_DIR"], file_name)
         return pd.read_csv(path, header=None).values.astype("float32")

     def _pad(matrix, n_cols):
         # Pad with zeros of the same dtype so the data is not upcast.
         missing = n_cols - matrix.shape[1]
         if missing == 0:
             return matrix
         filler = np.zeros(shape=(matrix.shape[0], missing), dtype=matrix.dtype)
         return np.hstack([matrix, filler])

     # Read data
     actives, inactives = get_protein_fingerprint_cluster_files(protein, fingerprint)
     clusters_active = [_load(a) for a, _ in actives]
     if not clusters_active:
         raise ValueError("No active cluster files to merge.")
     inactive = _load(inactives)

     # Standardize sizes
     max_cols = max(inactive.shape[1], max(cl.shape[1] for cl in clusters_active))
     clusters_active = [_pad(cl, max_cols) for cl in clusters_active]
     inactive = _pad(inactive, max_cols)

     # Start with the biggest cluster (first one wins on ties)
     biggest_id = int(np.argmax([cl.shape[0] for cl in clusters_active]))
     X = clusters_active[biggest_id]

     clusters_active_ids = [list(range(X.shape[0]))]
     for cluster_id, cluster in enumerate(clusters_active):
         if cluster_id == biggest_id:
             continue
         if not np.isfinite(cluster).all():
             raise ValueError("F*CK, nan in cluster file.")

         distances = pairwise_distances(cluster, X, metric='l1')
         nearest = np.argmin(distances, axis=1)
         is_duplicate = np.min(distances, axis=1) == 0

         # Duplicates must point at the matching row of X, not at their
         # position inside this cluster.
         existing = nearest[is_duplicate].tolist()

         start_id = X.shape[0]
         X = np.vstack([X, cluster[~is_duplicate]])
         added = list(range(start_id, X.shape[0]))

         # Both parts are lists, so the concatenation also works on Python 3.
         clusters_active_ids.append(existing + added)

     # Add inactives
     n_active = X.shape[0]
     X = np.vstack([X, inactive])
     inactive_ids = np.arange(n_active, X.shape[0], dtype=np.intp)

     # Labels
     Y = np.ones(shape=(X.shape[0], 1))
     Y[n_active:] = -1

     cluster_index_arrays = [
         np.asarray(ids, dtype=np.intp).reshape(-1) for ids in clusters_active_ids
     ]
     return X, Y, cluster_index_arrays, inactive_ids
	def get_sabina_clusters(protein, fingerprint):
	    """Merge the active cluster files and the inactive file into one data set.

	    The file names come from ``get_protein_fingerprint_cluster_files(protein,
	    fingerprint)`` and are resolved relative to ``c["DATA_DIR"]``; each file
	    is read as a header-less CSV and converted to ``float32``.
	    NOTE(review): the meaning of ``protein`` / ``fingerprint`` is decided by
	    that helper, which is not visible here.

	    Matrices with fewer columns than the widest one are right-padded with
	    zeros. Active rows that are identical (L1 distance 0) to a row already
	    collected are not stored twice; the cluster refers to the stored row.

	    Returns:
	        A tuple ``(X, Y, clusters_active_ids, inactive_ids)``:

	        * ``X`` -- unique active rows followed by all inactive rows.
	        * ``Y`` -- array of shape ``(X.shape[0], 1)``: ``1`` for active rows,
	          ``-1`` for inactive rows.
	        * ``clusters_active_ids`` -- list of 1-D index arrays into ``X``, one
	          per active cluster. The biggest cluster comes first, the remaining
	          clusters follow in file order. Within a cluster the indices of
	          already-stored (duplicate) rows precede the newly added ones.
	        * ``inactive_ids`` -- 1-D index array of the inactive rows in ``X``.

	    Raises:
	        ValueError: if there are no active cluster files, or a cluster
	            contains NaN / infinite values.
	    """

	    def _load(file_name):
	        # ``.values`` replaces ``as_matrix()``, which pandas 1.0 removed.
	        path = os.path.join(c["DATA_DIR"], file_name)
	        return pd.read_csv(path, header=None).values.astype("float32")

	    def _pad(matrix, n_cols):
	        # Pad with zeros of the same dtype so the data is not upcast.
	        missing = n_cols - matrix.shape[1]
	        if missing == 0:
	            return matrix
	        filler = np.zeros(shape=(matrix.shape[0], missing), dtype=matrix.dtype)
	        return np.hstack([matrix, filler])

	    # Read data
	    actives, inactives = get_protein_fingerprint_cluster_files(protein, fingerprint)
	    clusters_active = [_load(a) for a, _ in actives]
	    if not clusters_active:
	        raise ValueError("No active cluster files to merge.")
	    inactive = _load(inactives)

	    # Standardize sizes
	    max_cols = max(inactive.shape[1], max(cl.shape[1] for cl in clusters_active))
	    clusters_active = [_pad(cl, max_cols) for cl in clusters_active]
	    inactive = _pad(inactive, max_cols)

	    # Start with the biggest cluster (first one wins on ties)
	    biggest_id = int(np.argmax([cl.shape[0] for cl in clusters_active]))
	    X = clusters_active[biggest_id]

	    clusters_active_ids = [list(range(X.shape[0]))]
	    for cluster_id, cluster in enumerate(clusters_active):
	        if cluster_id == biggest_id:
	            continue
	        if not np.isfinite(cluster).all():
	            raise ValueError("F*CK, nan in cluster file.")

	        distances = pairwise_distances(cluster, X, metric='l1')
	        nearest = np.argmin(distances, axis=1)
	        is_duplicate = np.min(distances, axis=1) == 0

	        # Duplicates must point at the matching row of X, not at their
	        # position inside this cluster.
	        existing = nearest[is_duplicate].tolist()

	        start_id = X.shape[0]
	        X = np.vstack([X, cluster[~is_duplicate]])
	        added = list(range(start_id, X.shape[0]))

	        # Both parts are lists, so the concatenation also works on Python 3.
	        clusters_active_ids.append(existing + added)

	    # Add inactives
	    n_active = X.shape[0]
	    X = np.vstack([X, inactive])
	    inactive_ids = np.arange(n_active, X.shape[0], dtype=np.intp)

	    # Labels
	    Y = np.ones(shape=(X.shape[0], 1))
	    Y[n_active:] = -1

	    cluster_index_arrays = [
	        np.asarray(ids, dtype=np.intp).reshape(-1) for ids in clusters_active_ids
	    ]
	    return X, Y, cluster_index_arrays, inactive_ids