Skip to content

Instantly share code, notes, and snippets.

@endrebak
Created January 31, 2020 07:57
Show Gist options
  • Save endrebak/5d4215672a05268c7042a89f6525113d to your computer and use it in GitHub Desktop.
Save endrebak/5d4215672a05268c7042a89f6525113d to your computer and use it in GitHub Desktop.
rule gene_biotypes:
input:
regions = gene_biotype_infiles,
annotation = "{prefix}/data/{genome}/annotation.tsv"
output:
"{prefix}/data/{genome}/{hmm_or_anatomy}_regions/{cutoff}/gene_biotype_counts.tsv"
run:
df = pd.read_table(input.regions, header=0)
df2 = pd.read_csv(input.annotation, header=0, sep=",")
total = df2.gene_type.value_counts()
to_include = total.head(10).index.get_level_values(0).tolist()
total = total[total.index.get_level_values(0).isin(to_include)]
# print(to_include)
gr = pr.PyRanges(df).drop_duplicate_positions()
gr2 = pr.PyRanges(df2)
tss = gr2.features.tss()
gr2 = tss.slack(3000)
j = gr.nearest(gr2)
# print(j)
j = j[j.gene_type.isin(to_include)]
# print(j)
vc = j.gene_type.value_counts()
tp = vc
fp = vc.sum() - tp
tn = total.sum() - total - fp
tn[tn.isna()] = total.sum() - total
fn = total - tp
fn[fn.isna()] = total
fe = pd.concat([tp, fp, fn, tn], axis=1).fillna(0).astype(int)
fe.columns = "TP FP FN TN".split()
df = pr.stats.fisher_exact(fe.TP, fe.FP, fe.FN, fe.TN, pseudocount=0.01)
df.index = fe.index
df = df.reset_index()
df.insert(df.shape[1], "FDR", pr.stats.fdr(df.P))
fdr_bin = pd.cut(df.FDR, bins=[0, 0.001, 0.005, 0.01, 0.05, 0.1, 1])
df.insert(df.shape[1], "FDRBin", pd.cut(df.FDR, bins=[0, 0.001, 0.005, 0.01, 0.05, 0.1, 1]))
df.columns = df.columns.str.replace("index", "Type")
print(df.head())
df.to_csv(output[0], sep="\t", index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment