gregcaporaso · December 27, 2015 01:09
diff --git a/compare_pre_post_distances.py b/compare_pre_post_distances.py
 #!/usr/bin/env python
 # Authors: Greg Caporaso, John Chase
 # Questions: Contact [email protected]

 # Step 1: Generate lists of pre/post sample ids on a per-individual basis
 # qiime.group.extract_per_individual_states_from_sample_metadata 
 # will let you generate a dict of individual id to (pre sample-id, post sample-id)

 # Step 2: Extract distances for pre/post sample ids
 # qiime.parse.parse_distmat_to_dict 
 # will let you generate a 2d dict where you can look up the pre
 # sample-id and post sample-id to get the distance between them.

 # Step 3: Compare distributions of pre/post distances
 # cogent.maths.stats.test.mc_t_two_sample 
 # will let you provide two distributions and will perform a monte carlo test.

 # The goal here will be to generate the distribution of per-individual pre/post distances for the UDCA and placebo groups, and compare those distributions with a one-tailed, two-sample t-test. If you pass placebo distribution as "x_items" and the UDCA distribution as "y_items" to cogent.maths.stats.test.mc_t_two_sample, you'd pass tails="high" to get the one sample test. 

 from collections import defaultdict
 from cogent.maths.stats.test import mc_t_two_sample 
 from qiime.group import extract_per_individual_states_from_sample_metadata
 from qiime.parse import parse_mapping_file_to_dict
 from qiime.parse import parse_distmat_to_dict 

 def compare_pre_post_distances(sample_metadata,
                               state_category,
                               state_values,
                               individual_identifier_category,
                               group_category,
                               dm):
    pre_post_sample_ids = extract_per_individual_states_from_sample_metadata(
                            sample_metadata,
                            state_category,
                            state_values,
                            individual_identifier_category)
    
    group_to_individual_identifiers = defaultdict(set)
    for sm in sample_metadata.values():
        individual_id = sm[individual_identifier_category]
        group = sm[group_category]
        group_to_individual_identifiers[group].add(individual_id)
    
    distances = {}
    for individual_id, sample_ids in pre_post_sample_ids.items():
        distances[individual_id] = dm[sample_ids[0]][sample_ids[1]]
    
    distance_group_distributions = {}
    for group, individual_ids in group_to_individual_identifiers.items():
        dist = []
        for individual_id in individual_ids:
            try:
                dist.append(distances[individual_id])
            except KeyError:
                pass
                
        distance_group_distributions[group] = dist
    
    groups = group_to_individual_identifiers.keys()
    for group1 in groups:
        for group2 in groups:
            if group1 != group2:
                t_test_result = mc_t_two_sample(distance_group_distributions[group1],
                                                distance_group_distributions[group2])
                yield group1, group2, t_test_result[0], t_test_result[-1]
    

    
    

 if __name__ == "__main__":
    mapping_fp = "NACP_adjusted_Metadata.txt"
    dm_fp = "unweighted_unifrac_dm.txt"
    state_category = "visit"
    state_values = ['pre','post']
    individual_identifier_category = "participant"
    group_category = "treatmentgroup"
    # parse the mapping file
    sample_metadata, _ = parse_mapping_file_to_dict(mapping_fp)
    # parse the distance matrix
    dm = parse_distmat_to_dict(open(dm_fp,'U'))
    # call compare_pre_post_distances
    
    for test_result in compare_pre_post_distances(sample_metadata,
                               state_category,
                               state_values,
                               individual_identifier_category,
                               group_category,
                               dm):
        print '\t'.join(map(str,test_result))
	#!/usr/bin/env python
	# Authors: Greg Caporaso, John Chase
	# Questions: Contact [email protected]

	# Step 1: Generate lists of pre/post sample ids on a per-individual basis
	# qiime.group.extract_per_individual_states_from_sample_metadata
	# will let you generate a dict of individual id to (pre sample-id, post sample-id)

	# Step 2: Extract distances for pre/post sample ids
	# qiime.parse.parse_distmat_to_dict
	# will let you generate a 2d dict where you can look up the pre
	# sample-id and post sample-id to get the distance between them.

	# Step 3: Compare distributions of pre/post distances
	# cogent.maths.stats.test.mc_t_two_sample
	# will let you provide two distributions and will perform a monte carlo test.

	# The goal here will be to generate the distribution of per-individual pre/post distances for the UDCA and placebo groups, and compare those distributions with a one-tailed, two-sample t-test. If you pass placebo distribution as "x_items" and the UDCA distribution as "y_items" to cogent.maths.stats.test.mc_t_two_sample, you'd pass tails="high" to get the one sample test.

	from collections import defaultdict
	from cogent.maths.stats.test import mc_t_two_sample
	from qiime.group import extract_per_individual_states_from_sample_metadata
	from qiime.parse import parse_mapping_file_to_dict
	from qiime.parse import parse_distmat_to_dict

	def compare_pre_post_distances(sample_metadata,
	state_category,
	state_values,
	individual_identifier_category,
	group_category,
	dm):
	pre_post_sample_ids = extract_per_individual_states_from_sample_metadata(
	sample_metadata,
	state_category,
	state_values,
	individual_identifier_category)

	group_to_individual_identifiers = defaultdict(set)
	for sm in sample_metadata.values():
	individual_id = sm[individual_identifier_category]
	group = sm[group_category]
	group_to_individual_identifiers[group].add(individual_id)

	distances = {}
	for individual_id, sample_ids in pre_post_sample_ids.items():
	distances[individual_id] = dm[sample_ids[0]][sample_ids[1]]

	distance_group_distributions = {}
	for group, individual_ids in group_to_individual_identifiers.items():
	dist = []
	for individual_id in individual_ids:
	try:
	dist.append(distances[individual_id])
	except KeyError:
	pass

	distance_group_distributions[group] = dist

	groups = group_to_individual_identifiers.keys()
	for group1 in groups:
	for group2 in groups:
	if group1 != group2:
	t_test_result = mc_t_two_sample(distance_group_distributions[group1],
	distance_group_distributions[group2])
	yield group1, group2, t_test_result[0], t_test_result[-1]





	if __name__ == "__main__":
	mapping_fp = "NACP_adjusted_Metadata.txt"
	dm_fp = "unweighted_unifrac_dm.txt"
	state_category = "visit"
	state_values = ['pre','post']
	individual_identifier_category = "participant"
	group_category = "treatmentgroup"
	# parse the mapping file
	sample_metadata, _ = parse_mapping_file_to_dict(mapping_fp)
	# parse the distance matrix
	dm = parse_distmat_to_dict(open(dm_fp,'U'))
	# call compare_pre_post_distances

	for test_result in compare_pre_post_distances(sample_metadata,
	state_category,
	state_values,
	individual_identifier_category,
	group_category,
	dm):
	print '\t'.join(map(str,test_result))