Skip to content

Instantly share code, notes, and snippets.

@wasade
Last active August 9, 2021 14:58
Show Gist options
  • Save wasade/74cfd90840e22a462fbd5c807529c154 to your computer and use it in GitHub Desktop.
Save wasade/74cfd90840e22a462fbd5c807529c154 to your computer and use it in GitHub Desktop.
summarize q2-demux stats
import qiime2
import pandas as pd
import glob
from collections import defaultdict
import sys
import time, os
details = [l.strip() for l in open(sys.argv[1])]
f = details[int(sys.argv[2]) - 1]
def error_summary(frame):
error_counts = frame['barcode-errors'].value_counts()
error_relative = error_counts / error_counts.sum()
observed = {'absolute-0-bit': 0,
'absolute-1-bit': 0,
'absolute-2-bit': 0,
'absolute-3-bit': 0,
'absolute-4-bit': 0,
'absolute-any-error': 0,
'relative-0-bit': 0,
'relative-1-bit': 0,
'relative-2-bit': 0,
'relative-3-bit': 0,
'relative-4-bit': 0,
'relative-any-error': 0}
for name, series in [('absolute', error_counts),
('relative', error_relative)]:
for bit, number_of_records in series.iteritems():
observed[f'{name}-{bit}-bit'] = number_of_records
observed[f'{name}-any-error'] = series.sum() - observed[f'{name}-0-bit']
return observed
per_run_results = []
per_barcode_results = []
per_sample_results = []
basename = os.path.splitext(os.path.basename(f))[0]
mb = round(os.stat(f).st_size / 2**20, 2)
start = time.time()
print(f"starting {f} MB: {mb}...")
df = qiime2.Artifact.load(f).view(pd.DataFrame)
df['barcode-errors'] = df['barcode-errors'].astype(int)
if not set(df['barcode-errors'].unique()).issubset({0, 1, 2, 3, 4}):
print(f"{f} has unusual barcode errors!")
# (1) what % of index reads per run that have any error?
# (2) what % of index reads have 1 bit, 2 bit, 3 bit, or > 3 bit error per run?
counts_and_fractions_of_error_by_bit = error_summary(df)
counts_and_fractions_of_error_by_bit['run'] = f
per_run_results.append(counts_and_fractions_of_error_by_bit)
# (3) do any barcodes appear more prone to error than others?
only_corrected = df[df['barcode-corrected'] != 'None']
for barcode, barcode_grp in only_corrected.groupby('barcode-corrected'):
barcode_counts_and_fractions_of_error_by_bit = error_summary(barcode_grp)
barcode_counts_and_fractions_of_error_by_bit['run'] = f
barcode_counts_and_fractions_of_error_by_bit['barcode'] = barcode
per_barcode_results.append(barcode_counts_and_fractions_of_error_by_bit)
# We could look at proportion of barcodes with errors in a sample, and ask whether that explains beta diversity?
with_sample = df[df['sample'] != 'None']
for sample, sample_grp in with_sample.groupby('sample'):
sample_counts_and_fractions_of_error_by_bit = error_summary(sample_grp)
sample_counts_and_fractions_of_error_by_bit['run'] = f
sample_counts_and_fractions_of_error_by_bit['sample-id'] = sample
per_sample_results.append(sample_counts_and_fractions_of_error_by_bit)
print(f"ending {f}, time: {time.time() - start}")
per_run_results = pd.DataFrame(per_run_results)
per_barcode_results = pd.DataFrame(per_barcode_results)
per_sample_results = pd.DataFrame(per_sample_results)
per_run_results.to_csv(f'per_run_{basename}.tsv', sep='\t', index=False, header=True)
per_barcode_results.to_csv(f'per_barcode_{basename}.tsv', sep='\t', index=False, header=True)
per_sample_results.to_csv(f'per_sample_{basename}.tsv', sep='\t', index=False, header=True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment