geocarvalho · March 4, 2021 04:14
diff --git a/bed_to_list.py b/bed_to_list.py
 import pandas as pd

 # Collapse the rows of a BED file into one "chrom:start-end" region per gene
 # and write those regions, ordered by chromosome/start/end, one per line.
 file = "input.bed"
 # Column names are supplied explicitly, so every line of the file is data.
 df = pd.read_csv(file, sep="\t", names=["chr", "start", "end", "interval", "score", "strand"])
 # The gene key is the text before the first "_" of the interval name.
 # `n` must be passed by keyword: pandas 2.0 made every str.split argument
 # after `pat` keyword-only, so the positional form raises TypeError there.
 df[["gene", "extra"]] = df["interval"].str.split("_", n=1, expand=True)
 df.drop(["interval", "score", "strand", "extra"], axis=1, inplace=True)
 # One row per gene: the distinct chromosomes, smallest start, largest end.
 # Reductions are named by string because passing the builtins min/max
 # emits a FutureWarning on recent pandas versions.
 new_df = df.groupby("gene").agg({"chr": "unique", "start": "min", "end": "max"})
 new_df.reset_index(inplace=True)
 # Keep only the first chromosome seen for each gene.
 new_df["chr"] = new_df["chr"].apply(lambda chroms: chroms[0])
 # Coordinates become strings so they can be concatenated into the region.
 new_df["start"] = new_df["start"].astype("str")
 new_df["end"] = new_df["end"].astype("str")
 new_df["list"] = new_df["chr"] + ":" + new_df["start"] + "-" + new_df["end"]
 # regex=False pins literal matching regardless of the pandas default.
 new_df["list"] = new_df["list"].str.replace("chr", "", regex=False)
 # Build an integer chromosome key used only for sorting: X -> 23, Y -> 24.
 # NOTE: any other non-numeric name (e.g. "M") makes the int cast below
 # raise ValueError.
 new_df["chr"] = new_df["chr"].str.replace("chr", "", regex=False)
 new_df["chr"] = new_df["chr"].str.replace("X", "23", regex=False)
 new_df["chr"] = new_df["chr"].str.replace("Y", "24", regex=False)
 new_df[["chr", "start", "end"]] = new_df[["chr", "start", "end"]].astype(int)
 new_df.sort_values(by=["chr", "start", "end"], inplace=True)
 # Only the region strings are written: no index column and no header row.
 new_df["list"].to_csv("input.list", index=False, header=False)
	import pandas as pd

	# Collapse the rows of a BED file into one "chrom:start-end" region per gene
	# and write those regions, ordered by chromosome/start/end, one per line.
	file = "input.bed"
	# Column names are supplied explicitly, so every line of the file is data.
	df = pd.read_csv(file, sep="\t", names=["chr", "start", "end", "interval", "score", "strand"])
	# The gene key is the text before the first "_" of the interval name.
	# `n` must be passed by keyword: pandas 2.0 made every str.split argument
	# after `pat` keyword-only, so the positional form raises TypeError there.
	df[["gene", "extra"]] = df["interval"].str.split("_", n=1, expand=True)
	df.drop(["interval", "score", "strand", "extra"], axis=1, inplace=True)
	# One row per gene: the distinct chromosomes, smallest start, largest end.
	# Reductions are named by string because passing the builtins min/max
	# emits a FutureWarning on recent pandas versions.
	new_df = df.groupby("gene").agg({"chr": "unique", "start": "min", "end": "max"})
	new_df.reset_index(inplace=True)
	# Keep only the first chromosome seen for each gene.
	new_df["chr"] = new_df["chr"].apply(lambda chroms: chroms[0])
	# Coordinates become strings so they can be concatenated into the region.
	new_df["start"] = new_df["start"].astype("str")
	new_df["end"] = new_df["end"].astype("str")
	new_df["list"] = new_df["chr"] + ":" + new_df["start"] + "-" + new_df["end"]
	# regex=False pins literal matching regardless of the pandas default.
	new_df["list"] = new_df["list"].str.replace("chr", "", regex=False)
	# Build an integer chromosome key used only for sorting: X -> 23, Y -> 24.
	# NOTE: any other non-numeric name (e.g. "M") makes the int cast below
	# raise ValueError.
	new_df["chr"] = new_df["chr"].str.replace("chr", "", regex=False)
	new_df["chr"] = new_df["chr"].str.replace("X", "23", regex=False)
	new_df["chr"] = new_df["chr"].str.replace("Y", "24", regex=False)
	new_df[["chr", "start", "end"]] = new_df[["chr", "start", "end"]].astype(int)
	new_df.sort_values(by=["chr", "start", "end"], inplace=True)
	# Only the region strings are written: no index column and no header row.
	new_df["list"].to_csv("input.list", index=False, header=False)
No results found