Generate rank/frequency (and log-transformed) data for OTU counts, following the approach described in the article referenced in the script's docstring.
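Usage sketch (the script filename here is hypothetical; the input is any BIOM-format OTU table, ideally rarefied): python otu_rank_freq.py otu_table.biom. The script writes one tab-separated rank/frequency file per sample (<sample_id>.txt) plus combined_data.txt and, with the outlier samples excluded, combined_data_no4.txt.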
#!/usr/bin/env python
from sys import argv
from operator import itemgetter

from numpy import log
from biom import load_table
""" From the figure 1 approach in http://jem.rupress.org/content/209/2/365 | |
Rank = # of times sequence appears | |
Frequency of rank = how many of a given rank # of present, e.g., should be many rank 1 | |
implication is that those with high rank will be active pool of B-cells. | |
""" | |
otu_table = load_table(argv[1])
# Public biom Table API for sample and observation (OTU) IDs
ids = otu_table.ids(axis='sample')
obs = otu_table.ids(axis='observation')

exceptions = ["WT.sp4", "WT.LI4"]  # outlier samples excluded from the filtered output

# One output file per sample, plus combined outputs with and without the outliers
per_id_outfs = [x + ".txt" for x in ids]
id_outf = []
for f in per_id_outfs:
    id_outf.append(open(f, "w"))

combined_outf = open("combined_data.txt", "w")
combined_filtered_outf = open("combined_data_no4.txt", "w")

combined_counts = {}
combined_counts_filtered = {}
for curr_otu in obs:
    count = 0
    count_filtered = 0
    for id in ids:
        curr_val = otu_table.get_value_by_ids(curr_otu, id)
        count += curr_val
        if id not in exceptions:
            count_filtered += curr_val
    # Skip zero-count OTUs, e.g. zeroed out after rarefaction
    if count > 0:
        try:
            combined_counts[count] += 1
        except KeyError:
            combined_counts[count] = 1
    if count_filtered > 0:
        try:
            combined_counts_filtered[count_filtered] += 1
        except KeyError:
            combined_counts_filtered[count_filtered] = 1
# Per-sample rank -> frequency-of-rank dictionaries
id_dicts = {}
for id in ids:
    id_dicts[id] = {}

for id in ids:
    for curr_otu in obs:
        curr_val = otu_table.get_value_by_ids(curr_otu, id)
        if curr_val > 0:
            try:
                id_dicts[id][curr_val] += 1
            except KeyError:
                id_dicts[id][curr_val] = 1
# Sort the rank -> frequency pairs by rank (ascending)
sorted_per_sample = []
for id in ids:
    curr_l = sorted(id_dicts[id].items(), key=itemgetter(0), reverse=False)
    sorted_per_sample.append(curr_l)

combined_counts_sorted = sorted(combined_counts.items(), key=itemgetter(0), reverse=False)
combined_counts_filtered_sorted = sorted(combined_counts_filtered.items(), key=itemgetter(0), reverse=False)
combined_outf.write("#Rank\tFreq of Rank\tLogRank\tLogFreq\n") | |
for x in combined_counts_sorted: | |
combined_outf.write("%d\t%d\t%f\t%f\n" % (x[0], x[1], log(x[0]), log(x[1]))) | |
combined_filtered_outf.write("#Rank\tFreq of Rank\tLogRank\tLogFreq\n") | |
for x in combined_counts_filtered_sorted: | |
combined_filtered_outf.write("%d\t%d\t%f\t%f\n" % (x[0], x[1], log(x[0]), log(x[1]))) | |
for f in id_outf: | |
f.write("#Rank\tFreq of Rank\tLogRank\tLogFreq\n") | |
for f in range(len(id_outf)): | |
for x in sorted_per_sample[f]: | |
id_outf[f].write("%d\t%d\t%f\t%f\n" % (x[0], x[1], log(x[0]), log(x[1]))) | |
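A minimal, self-contained sketch of the same rank/frequency counting on a hand-made list of per-OTU counts (toy numbers, not data from the script or the article), for readers who want to check the transform without a BIOM table:

from collections import Counter
from numpy import log

toy_counts = [1, 1, 1, 5, 5, 12]        # hypothetical summed count per OTU
freq_of_rank = Counter(toy_counts)       # rank -> number of OTUs with that rank
for rank, freq in sorted(freq_of_rank.items()):
    print("%d\t%d\t%f\t%f" % (rank, freq, log(rank), log(freq)))
# Rank 1 occurs 3 times, rank 5 twice, rank 12 once.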