endrebak · January 31, 2020 07:57
diff --git a/fisher_exact_gene_biotypes.py b/fisher_exact_gene_biotypes.py
 rule gene_biotypes:
    input:
        regions = gene_biotype_infiles,
        annotation = "{prefix}/data/{genome}/annotation.tsv"
    output:
        "{prefix}/data/{genome}/{hmm_or_anatomy}_regions/{cutoff}/gene_biotype_counts.tsv"
    run:
        df = pd.read_table(input.regions, header=0)
        df2 = pd.read_csv(input.annotation, header=0, sep=",")

        total = df2.gene_type.value_counts()

        to_include = total.head(10).index.get_level_values(0).tolist()
        total = total[total.index.get_level_values(0).isin(to_include)]
        # print(to_include)

        gr = pr.PyRanges(df).drop_duplicate_positions()
        gr2 = pr.PyRanges(df2)

        tss = gr2.features.tss()

        gr2 = tss.slack(3000)

        j = gr.nearest(gr2)

        # print(j)
        j = j[j.gene_type.isin(to_include)]
        # print(j)

        vc = j.gene_type.value_counts()

        tp = vc
        fp = vc.sum() - tp
        tn = total.sum() - total - fp
        tn[tn.isna()] = total.sum() - total
        fn = total - tp
        fn[fn.isna()] = total

        fe = pd.concat([tp, fp, fn, tn], axis=1).fillna(0).astype(int)
        fe.columns = "TP FP FN TN".split()

        df = pr.stats.fisher_exact(fe.TP, fe.FP, fe.FN, fe.TN, pseudocount=0.01)

        df.index = fe.index
        df = df.reset_index()
        df.insert(df.shape[1], "FDR", pr.stats.fdr(df.P))

        fdr_bin = pd.cut(df.FDR, bins=[0, 0.001, 0.005, 0.01, 0.05, 0.1, 1])
        df.insert(df.shape[1], "FDRBin", pd.cut(df.FDR, bins=[0, 0.001, 0.005, 0.01, 0.05, 0.1, 1]))

        df.columns = df.columns.str.replace("index", "Type")
        print(df.head())

        df.to_csv(output[0], sep="\t", index=False)
	rule gene_biotypes:
	input:
	regions = gene_biotype_infiles,
	annotation = "{prefix}/data/{genome}/annotation.tsv"
	output:
	"{prefix}/data/{genome}/{hmm_or_anatomy}_regions/{cutoff}/gene_biotype_counts.tsv"
	run:
	df = pd.read_table(input.regions, header=0)
	df2 = pd.read_csv(input.annotation, header=0, sep=",")

	total = df2.gene_type.value_counts()

	to_include = total.head(10).index.get_level_values(0).tolist()
	total = total[total.index.get_level_values(0).isin(to_include)]
	# print(to_include)

	gr = pr.PyRanges(df).drop_duplicate_positions()
	gr2 = pr.PyRanges(df2)

	tss = gr2.features.tss()

	gr2 = tss.slack(3000)

	j = gr.nearest(gr2)

	# print(j)
	j = j[j.gene_type.isin(to_include)]
	# print(j)

	vc = j.gene_type.value_counts()

	tp = vc
	fp = vc.sum() - tp
	tn = total.sum() - total - fp
	tn[tn.isna()] = total.sum() - total
	fn = total - tp
	fn[fn.isna()] = total

	fe = pd.concat([tp, fp, fn, tn], axis=1).fillna(0).astype(int)
	fe.columns = "TP FP FN TN".split()

	df = pr.stats.fisher_exact(fe.TP, fe.FP, fe.FN, fe.TN, pseudocount=0.01)

	df.index = fe.index
	df = df.reset_index()
	df.insert(df.shape[1], "FDR", pr.stats.fdr(df.P))

	fdr_bin = pd.cut(df.FDR, bins=[0, 0.001, 0.005, 0.01, 0.05, 0.1, 1])
	df.insert(df.shape[1], "FDRBin", pd.cut(df.FDR, bins=[0, 0.001, 0.005, 0.01, 0.05, 0.1, 1]))

	df.columns = df.columns.str.replace("index", "Type")
	print(df.head())

	df.to_csv(output[0], sep="\t", index=False)