Last active
August 29, 2015 13:56
-
-
Save gregcaporaso/9234421 to your computer and use it in GitHub Desktop.
first pass script for comparing median diversity versus median variability for a given category
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# File created on 26 Feb 2014 | |
from __future__ import division | |
__author__ = "Greg Caporaso" | |
__copyright__ = "Copyright 2014, The QIIME Project" | |
__credits__ = ["Greg Caporaso"] | |
__license__ = "GPL" | |
__version__ = "1.8.0-dev" | |
__maintainer__ = "Greg Caporaso" | |
__email__ = "[email protected]" | |
from numpy import median | |
import matplotlib.pyplot as plt | |
from cogent.maths.stats.test import correlation_test | |
from qiime.parse import parse_distmat | |
from qiime.parse import parse_mapping_file | |
from qiime.parse import parse_rarefaction | |
from qiime.group import get_grouped_distances | |
from qiime.compare_alpha_diversity import ( | |
collapse_sample_diversities_by_category_value, | |
get_per_sample_average_diversities, get_category_value_to_sample_ids) | |
from qiime.util import parse_command_line_parameters, make_option | |
script_info = {} | |
script_info['brief_description'] = "" | |
script_info['script_description'] = "" | |
# Members of the tuple in script_usage are (title, description, example call) | |
script_info['script_usage'] = [("","","%prog -m smp-gut.tsv -d " | |
"unweighted_unifrac_dm.gut_ts_only.txt -a PD_whole_tree.txt -c PersonalID " | |
"-o out.txt -p out.pdf")] | |
script_info['output_description']= "" | |
script_info['required_options'] = [ | |
make_option('-m', '--mapping_fp', type='existing_filepath', | |
help='the input mapping file'), | |
make_option('-d', '--distance_matrix_fp', type='existing_filepath', | |
help='the input distance matrix file'), | |
make_option('-a', '--collated_alpha_fp', type='existing_filepath', | |
help='the input alpha diversity table'), | |
make_option('-c', '--category', | |
help='grouping category (e.g., "PersonalID")'), | |
make_option('-o', '--output_fp', type="new_filepath", | |
help='the output filepath') | |
] | |
script_info['optional_options'] = [ | |
make_option('-p', '--plot_output_fp', type="new_filepath", | |
help='the output filepath for the figure [default: no plot created]') | |
] | |
script_info['version'] = __version__ | |
def main(): | |
option_parser, opts, args = parse_command_line_parameters(**script_info) | |
distance_matrix_fp = opts.distance_matrix_fp | |
mapping_fp = opts.mapping_fp | |
collated_alpha_fp = opts.collated_alpha_fp | |
category = opts.category | |
output_fp = opts.output_fp | |
plot_output_fp = opts.plot_output_fp | |
dmh, dm = parse_distmat(open(distance_matrix_fp)) | |
m, mh, _ = parse_mapping_file(open(mapping_fp)) | |
within_indiv = get_grouped_distances(dmh, dm, mh, m, category) | |
per_indiv_median_variability = \ | |
dict([(e[0], median(e[2])) for e in within_indiv]) | |
rarefaction_data = parse_rarefaction(open(collated_alpha_fp)) | |
category_value_to_sample_ids = \ | |
get_category_value_to_sample_ids(open(mapping_fp), category) | |
per_sample_average_diversities = \ | |
get_per_sample_average_diversities(rarefaction_data, depth=None) | |
per_category_value_average_diversities = \ | |
collapse_sample_diversities_by_category_value( | |
category_value_to_sample_ids, | |
per_sample_average_diversities) | |
per_indiv_median_diversity = \ | |
dict([(k, median(v)) for k,v in | |
per_category_value_average_diversities.items()]) | |
common_ids = \ | |
set(per_indiv_median_diversity) & \ | |
set(per_indiv_median_variability) | |
data = [] | |
for id in common_ids: | |
data.append((id, | |
per_indiv_median_diversity[id], | |
per_indiv_median_variability[id])) | |
diversities = [d[1] for d in data] | |
variabilities = [d[2] for d in data] | |
correlation_result = correlation_test(diversities, variabilities) | |
of = open(output_fp,'w') | |
of.write("#%s\tMedian Diversity\tMedian Variability\n" % category) | |
of.write("#Spearman rho: %f, Spearman Monte Carlo p: %f\n" % | |
(correlation_result[0], correlation_result[3])) | |
for d in data: | |
of.write('%s\t%f\t%f\n' % d) | |
of.close() | |
if plot_output_fp: | |
fig, ax = plt.subplots() | |
ax.scatter([d[1] for d in data], | |
[d[2] for d in data]) | |
ax.set_xlim(0) | |
ax.set_ylim(0) | |
ax.set_xlabel('Median Diversity') | |
ax.set_ylabel('Median Variability') | |
plt.savefig(plot_output_fp) | |
if __name__ == "__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment