Given a NCBI feature table that lists coding sequences, just wanted to calculate length of non-coding regions with pandas. This is obviously easy when you iterate a dataframe, but that can be painfully slow. Instead, here's how to do it in a "pandas-like" way.
def calc_nc_len(df):
# Before and after here merely means coordinates on the genome disregarding the coding strand of each gene
df["before"] = df["end"].diff()
df["before"] -= df.apply(lambda x: x["end"] - x["start"],axis=1)
df["after"] = -df["start"].diff(periods=-1)
df["after"] -= df.apply(lambda x: x["end"] - x["start"],axis=1)
# Now up and down take the coding strand into account
df["up_utr_len"] = df.apply(lambda x: x["before"] if x["strand"] == "+" else x["after"], axis=1)
df["down_utr_len"] = df.apply(lambda x: x["after"] if x["strand"] == "+" else x["before"], axis=1)
df = df.drop(["before", "after"], axis=1)
return df
# data prep
df = pd.read_csv("../table.txt", sep="\t")
df = df[df["# feature"]=="CDS"]
df = df.reset_index()
nc_len = calc_nc_len(df)