Skip to content

Instantly share code, notes, and snippets.

@walterst
Created January 31, 2018 13:10
Show Gist options
  • Save walterst/ecc2232c07d2011d7fdd2c06fb0071a8 to your computer and use it in GitHub Desktop.
Save walterst/ecc2232c07d2011d7fdd2c06fb0071a8 to your computer and use it in GitHub Desktop.
Generate rank/frequency (and log-transformed) data for OTU counts to match approach described in article listed in script text.
#!/usr/bin/env python
from sys import argv
from operator import itemgetter
from scipy.stats import rankdata
from numpy import log
from biom import load_table
""" From the figure 1 approach in http://jem.rupress.org/content/209/2/365
Rank = # of times sequence appears
Frequency of rank = how many of a given rank # of present, e.g., should be many rank 1
implication is that those with high rank will be active pool of B-cells.
"""
otu_table = load_table(argv[1])
ids = otu_table._sample_ids
obs = otu_table._observation_ids
exceptions = ["WT.sp4", "WT.LI4"] # outlier
per_id_outfs = [x + ".txt" for x in ids]
id_outf = []
for f in per_id_outfs:
id_outf.append(open(f, "w"))
combined_outf = open("combined_data.txt", "w")
combined_filtered_outf = open("combined_data_no4.txt", "w")
combined_counts = {}
combined_counts_filtered = {}
for curr_otu in obs:
count = 0
count_filtered = 0
for id in ids:
curr_val = otu_table.get_value_by_ids(curr_otu, id)
count += curr_val
if id not in exceptions:
count_filtered += curr_val
# Skip if zero count OTU, e.g. zero after rarefaction
if count > 0:
try:
combined_counts[count] += 1
except KeyError:
combined_counts[count] = 1
if count_filtered > 0:
try:
combined_counts_filtered[count_filtered] += 1
except KeyError:
combined_counts_filtered[count_filtered] = 1
id_dicts = {}
for id in ids:
id_dicts[id] = {}
for id in ids:
for curr_otu in obs:
curr_val = otu_table.get_value_by_ids(curr_otu, id)
if curr_val > 0:
try:
id_dicts[id][curr_val] += 1
except KeyError:
id_dicts[id][curr_val] = 1
sorted_per_sample = []
for id in ids:
curr_l = sorted(id_dicts[id].iteritems(), key=itemgetter(0), reverse=False)
sorted_per_sample.append(curr_l)
combined_counts_sorted = sorted(combined_counts.iteritems(), key=itemgetter(0), reverse=False)
combined_counts_filtered_sorted = sorted(combined_counts_filtered.iteritems(), key=itemgetter(0), reverse=False)
combined_outf.write("#Rank\tFreq of Rank\tLogRank\tLogFreq\n")
for x in combined_counts_sorted:
combined_outf.write("%d\t%d\t%f\t%f\n" % (x[0], x[1], log(x[0]), log(x[1])))
combined_filtered_outf.write("#Rank\tFreq of Rank\tLogRank\tLogFreq\n")
for x in combined_counts_filtered_sorted:
combined_filtered_outf.write("%d\t%d\t%f\t%f\n" % (x[0], x[1], log(x[0]), log(x[1])))
for f in id_outf:
f.write("#Rank\tFreq of Rank\tLogRank\tLogFreq\n")
for f in range(len(id_outf)):
for x in sorted_per_sample[f]:
id_outf[f].write("%d\t%d\t%f\t%f\n" % (x[0], x[1], log(x[0]), log(x[1])))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment