Skip to content

Instantly share code, notes, and snippets.

@gregcaporaso
Last active December 27, 2015 01:09
Show Gist options
  • Save gregcaporaso/7242529 to your computer and use it in GitHub Desktop.
Save gregcaporaso/7242529 to your computer and use it in GitHub Desktop.
Code for comparing groups (e.g., treatment and control) of pre/post UniFrac distances to determine if one group's microbiomes are more stable than the other.
#!/usr/bin/env python
# Authors: Greg Caporaso, John Chase
# Questions: Contact [email protected]
# Step 1: Generate lists of pre/post sample ids on a per-individual basis
# qiime.group.extract_per_individual_states_from_sample_metadata
# will let you generate a dict of individual id to (pre sample-id, post sample-id)
# Step 2: Extract distances for pre/post sample ids
# qiime.parse.parse_distmat_to_dict
# will let you generate a 2d dict where you can look up the pre
# sample-id and post sample-id to get the distance between them.
# Step 3: Compare distributions of pre/post distances
# cogent.maths.stats.test.mc_t_two_sample
# will let you provide two distributions and will perform a monte carlo test.
# The goal here will be to generate the distribution of per-individual pre/post distances for the UDCA and placebo groups, and compare those distributions with a one-tailed, two-sample t-test. If you pass placebo distribution as "x_items" and the UDCA distribution as "y_items" to cogent.maths.stats.test.mc_t_two_sample, you'd pass tails="high" to get the one sample test.
from collections import defaultdict
from cogent.maths.stats.test import mc_t_two_sample
from qiime.group import extract_per_individual_states_from_sample_metadata
from qiime.parse import parse_mapping_file_to_dict
from qiime.parse import parse_distmat_to_dict
def compare_pre_post_distances(sample_metadata,
state_category,
state_values,
individual_identifier_category,
group_category,
dm):
pre_post_sample_ids = extract_per_individual_states_from_sample_metadata(
sample_metadata,
state_category,
state_values,
individual_identifier_category)
group_to_individual_identifiers = defaultdict(set)
for sm in sample_metadata.values():
individual_id = sm[individual_identifier_category]
group = sm[group_category]
group_to_individual_identifiers[group].add(individual_id)
distances = {}
for individual_id, sample_ids in pre_post_sample_ids.items():
distances[individual_id] = dm[sample_ids[0]][sample_ids[1]]
distance_group_distributions = {}
for group, individual_ids in group_to_individual_identifiers.items():
dist = []
for individual_id in individual_ids:
try:
dist.append(distances[individual_id])
except KeyError:
pass
distance_group_distributions[group] = dist
groups = group_to_individual_identifiers.keys()
for group1 in groups:
for group2 in groups:
if group1 != group2:
t_test_result = mc_t_two_sample(distance_group_distributions[group1],
distance_group_distributions[group2])
yield group1, group2, t_test_result[0], t_test_result[-1]
if __name__ == "__main__":
mapping_fp = "NACP_adjusted_Metadata.txt"
dm_fp = "unweighted_unifrac_dm.txt"
state_category = "visit"
state_values = ['pre','post']
individual_identifier_category = "participant"
group_category = "treatmentgroup"
# parse the mapping file
sample_metadata, _ = parse_mapping_file_to_dict(mapping_fp)
# parse the distance matrix
dm = parse_distmat_to_dict(open(dm_fp,'U'))
# call compare_pre_post_distances
for test_result in compare_pre_post_distances(sample_metadata,
state_category,
state_values,
individual_identifier_category,
group_category,
dm):
print '\t'.join(map(str,test_result))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment