Last active
August 31, 2015 23:22
-
-
Save lomereiter/74b7d4a962d1e1ccb07e to your computer and use it in GitHub Desktop.
Metabolomics pipeline
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
# path to pyIMS parent dir | |
sys.path.append("/home/lomereiter/github") | |
from pyIMS.image_measures.level_sets_measure import measure_of_chaos | |
from pyIMS.image_measures.isotope_image_correlation import isotope_image_correlation | |
from pyIMS.image_measures.isotope_pattern_match import isotope_pattern_match | |
import numpy as np | |
import cPickle | |
def txt_to_spectrum(s):
    """Parse one spectrum line of the form "pixel|mz mz ...|int int ...".

    Returns a tuple (pixel_index, m/z array, cumulative intensity array).
    A zero is prepended to the intensities before the cumulative sum so a
    range total can later be taken as cumsum[hi] - cumsum[lo] without
    special-casing lo == 0 (see numpy_multiple_queries).
    """
    arr = s.strip().split("|")
    # f4 is used to store data in HDF5; for cumsum we need precision, thus f8.
    # np.fromstring(text, sep=...) is deprecated in numpy; building the arrays
    # from the split tokens yields identical values at the same f4 precision.
    intensities = np.array(["0"] + arr[2].split(), dtype=np.dtype('f4'))
    mzs = np.array(arr[1].split(), dtype=np.dtype('f4'))
    return (int(arr[0]), mzs, np.cumsum(intensities, dtype=np.dtype('f8')))
def numpy_multiple_queries(lower, upper, lperm, uperm, mzs, cumsum_int,
                           result, pixel, tmp1, tmp2):
    """Accumulate the total intensity inside every [lower, upper] m/z window
    of one spectrum into column `pixel` of `result`.

    `lower`/`upper` are the window bounds in sorted order; `lperm`/`uperm`
    are the argsort permutations that scatter the sorted answers back into
    original query order.  `cumsum_int` carries a leading zero, so the mass
    inside a window is cumsum_int[right] - cumsum_int[left].  `tmp1`/`tmp2`
    are preallocated index buffers reused across spectra.
    """
    left_idx = np.searchsorted(mzs, lower, side='left')
    right_idx = np.searchsorted(mzs, upper, side='right')
    # Undo the bound sorting: write each answer at its query's position.
    tmp1[lperm] = left_idx
    tmp2[uperm] = right_idx
    result[:, pixel] += cumsum_int[tmp2] - cumsum_int[tmp1]
def process_spectra_multiple_queries(mol_mz_intervals, spectra, m):
    """Sum, over all spectra, the intensity inside every m/z window.

    mol_mz_intervals: tuple of (sorted lower bounds, sorted upper bounds,
        lower argsort permutation, upper argsort permutation), as produced
        by get_ion_images.
    spectra: iterable of (pixel, mzs, cumulative intensities) tuples.
    m: number of pixels, i.e. the number of columns in the result.

    Returns an (n_windows, m) array of per-pixel window intensity sums.
    """
    lower, upper, lperm, uperm = mol_mz_intervals
    n = len(lower)
    result = np.zeros((n, m))
    # Scratch index buffers allocated once and reused for every spectrum.
    # (The previously unused query_ids/intensities arrays were removed, and
    # np.int -- gone from recent numpy -- replaced by the builtin int, which
    # denotes the identical default integer dtype.)
    tmp1 = np.zeros(n, dtype=int)
    tmp2 = np.zeros(n, dtype=int)
    for pixel, mzs, cumsum_int in spectra:
        numpy_multiple_queries(lower, upper, lperm, uperm, mzs, cumsum_int,
                               result, pixel, tmp1, tmp2)
    return result
def get_ion_images(spectra, q, nrows, ncols, ppm):
    """Build ion images (one row per isotope peak) for every query in q.

    Each entry of q["mzpeaks"] lists the peak m/z values of one molecule;
    every peak gets a symmetric +/- ppm tolerance window.  Returns a list
    holding one (n_peaks, nrows * ncols) array per query.
    """
    peaks = q["mzpeaks"]
    query_lens = np.array(map(len, peaks))
    # Flatten the peak m/z values of all queries into a single array.
    mzs = np.array([mz for peak_list in peaks for mz in peak_list])
    half_width = mzs * ppm / 1e6
    lower = mzs - half_width
    upper = mzs + half_width
    # Sort both bound arrays so searchsorted runs on monotone input, and
    # keep the permutations needed to scatter results back to query order.
    lower_order = np.argsort(lower)
    upper_order = np.argsort(upper)
    bounds = (lower[lower_order], upper[upper_order], lower_order, upper_order)
    flat = process_spectra_multiple_queries(bounds, spectra, nrows * ncols)
    # Cut the flat (total_peaks, m) result back into per-query chunks.
    return np.split(flat, np.cumsum(query_lens)[:-1])
def compute_scores(qres, q, nrows, ncols, nlvl):
    """Score every query's ion image stack.

    qres: list of (n_peaks, nrows * ncols) arrays, one per query.
    q["intensities"]: theoretical isotope peak intensities per query.
    nlvl: number of levels handed to measure_of_chaos.

    Returns [iso_ratio_score, iso_correlation_score, measure_value_score].
    """
    m = len(qres)
    measure_value_score = np.zeros(m)
    iso_ratio_score = np.zeros(m)
    # Correlation defaults to 1 for single-peak queries (nothing to correlate).
    iso_correlation_score = np.ones(m)
    for i in xrange(m):
        theor_int = np.array(q["intensities"][i])
        images = qres[i]
        principal = images[0].reshape((nrows, ncols))
        # measure_of_chaos modifies the image in-place, so hand it a copy.
        chaos = measure_of_chaos(principal.copy(), nlvl, interp=False)[0]
        score = 1 - chaos
        # A score of exactly 1 marks a degenerate measurement; zero it out.
        measure_value_score[i] = 0 if score == 1 else score
        if len(images) > 1:
            iso_correlation_score[i] = isotope_image_correlation(
                images, weights=theor_int[1:])
        iso_ratio_score[i] = isotope_pattern_match(images, theor_int)
    return [iso_ratio_score, iso_correlation_score, measure_value_score]
def run_pipeline(spectra_fn, queries_fn, out_fn, nrows, ncols, nlvl, ppm): | |
print "reading queries" | |
with open(queries_fn) as f: | |
q = cPickle.load(f) | |
chunk_size = 1000 | |
n_chunks = len(q["mzpeaks"]) / chunk_size + 1 | |
scores = [[], [], []] | |
chunk_scores = [] | |
print "reading spectra" | |
spectra_str = open(spectra_fn).readlines() | |
spectra = map(txt_to_spectrum, spectra_str) | |
for offset in xrange(0, len(q["mzpeaks"]), chunk_size): | |
print("processing chunk #%d/%d" % (offset / chunk_size + 1, n_chunks)) | |
q_chunk = {} | |
for key in q.keys(): | |
q_chunk[key] = q[key][offset:offset+chunk_size] | |
print "\tgetting ion images" | |
qres = get_ion_images(spectra, q_chunk, nrows, ncols, ppm) | |
print "\tcomputing scores" | |
chunk_scores.append(compute_scores(qres, q_chunk, nrows, ncols, nlvl)) | |
qres = None | |
import gc | |
gc.collect() | |
for i in xrange(3): | |
scores[i] = np.concatenate([x[i] for x in chunk_scores]) | |
print "dumping scores to disk" | |
with open(out_fn, 'w') as f: | |
cPickle.dump(scores, f) | |
# Script entry point: hard-coded paths/parameters for one dataset run.
run_pipeline("/home/lomereiter/metabolomics/bladder.txt", # dataset in Sergey's format
             "/home/lomereiter/metabolomics/queries.pkl", # pickled queries dict
             "/home/lomereiter/metabolomics/scores_bladder.pkl", # output filename
             134, 260, 30, # 134x260 image, 30 levels to use in measure_of_chaos
             100.0 # m/z tolerance in parts per million
            )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment