Last active
December 27, 2015 01:09
-
-
Save gregcaporaso/7242529 to your computer and use it in GitHub Desktop.
Code for comparing groups (e.g., treatment and control) of pre/post UniFrac distances to determine if one group's microbiomes are more stable than the other.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# Authors: Greg Caporaso, John Chase | |
# Questions: Contact [email protected] | |
# Step 1: Generate lists of pre/post sample ids on a per-individual basis | |
# qiime.group.extract_per_individual_states_from_sample_metadata | |
# will let you generate a dict of individual id to (pre sample-id, post sample-id) | |
# Step 2: Extract distances for pre/post sample ids | |
# qiime.parse.parse_distmat_to_dict | |
# will let you generate a 2d dict where you can look up the pre | |
# sample-id and post sample-id to get the distance between them. | |
# Step 3: Compare distributions of pre/post distances | |
# cogent.maths.stats.test.mc_t_two_sample | |
# will let you provide two distributions and will perform a monte carlo test. | |
# The goal here will be to generate the distribution of per-individual pre/post distances for the UDCA and placebo groups, and compare those distributions with a one-tailed, two-sample t-test. If you pass placebo distribution as "x_items" and the UDCA distribution as "y_items" to cogent.maths.stats.test.mc_t_two_sample, you'd pass tails="high" to get the one sample test. | |
from collections import defaultdict | |
from cogent.maths.stats.test import mc_t_two_sample | |
from qiime.group import extract_per_individual_states_from_sample_metadata | |
from qiime.parse import parse_mapping_file_to_dict | |
from qiime.parse import parse_distmat_to_dict | |
def compare_pre_post_distances(sample_metadata, | |
state_category, | |
state_values, | |
individual_identifier_category, | |
group_category, | |
dm): | |
pre_post_sample_ids = extract_per_individual_states_from_sample_metadata( | |
sample_metadata, | |
state_category, | |
state_values, | |
individual_identifier_category) | |
group_to_individual_identifiers = defaultdict(set) | |
for sm in sample_metadata.values(): | |
individual_id = sm[individual_identifier_category] | |
group = sm[group_category] | |
group_to_individual_identifiers[group].add(individual_id) | |
distances = {} | |
for individual_id, sample_ids in pre_post_sample_ids.items(): | |
distances[individual_id] = dm[sample_ids[0]][sample_ids[1]] | |
distance_group_distributions = {} | |
for group, individual_ids in group_to_individual_identifiers.items(): | |
dist = [] | |
for individual_id in individual_ids: | |
try: | |
dist.append(distances[individual_id]) | |
except KeyError: | |
pass | |
distance_group_distributions[group] = dist | |
groups = group_to_individual_identifiers.keys() | |
for group1 in groups: | |
for group2 in groups: | |
if group1 != group2: | |
t_test_result = mc_t_two_sample(distance_group_distributions[group1], | |
distance_group_distributions[group2]) | |
yield group1, group2, t_test_result[0], t_test_result[-1] | |
if __name__ == "__main__": | |
mapping_fp = "NACP_adjusted_Metadata.txt" | |
dm_fp = "unweighted_unifrac_dm.txt" | |
state_category = "visit" | |
state_values = ['pre','post'] | |
individual_identifier_category = "participant" | |
group_category = "treatmentgroup" | |
# parse the mapping file | |
sample_metadata, _ = parse_mapping_file_to_dict(mapping_fp) | |
# parse the distance matrix | |
dm = parse_distmat_to_dict(open(dm_fp,'U')) | |
# call compare_pre_post_distances | |
for test_result in compare_pre_post_distances(sample_metadata, | |
state_category, | |
state_values, | |
individual_identifier_category, | |
group_category, | |
dm): | |
print '\t'.join(map(str,test_result)) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment