Last active
July 26, 2023 17:06
-
-
Save geocarvalho/9f92f49d5051b68514a3bc167736375e to your computer and use it in GitHub Desktop.
(up) need to add other columns
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3 | |
from pyliftover import LiftOver | |
import pandas as pd | |
import argparse | |
import mapply | |
import sys | |
import os | |
mapply.init( | |
n_workers=8, | |
chunk_size=100, | |
max_chunks_per_worker=8, | |
progressbar=False | |
) | |
def get_coordinate(row, lo): | |
""" Translate chr, start and end postions from hg19 to hg38 """ | |
chr = row[0] | |
start = row[1] | |
end = row[2] | |
try: | |
# We make start and end separated | |
coord_start = lo.convert_coordinate(chr, start)[0] | |
coord_end = lo.convert_coordinate(chr, end)[0] | |
return [coord_start[0], coord_start[1], coord_start[2], coord_end[1]] | |
except: | |
return ["not available", 0, "-", 0] | |
def main(args): | |
""" Args function to run liftover from hg19 to hg38 """ | |
bed = os.path.abspath(args.bed) | |
genome = args.genome | |
output_dir = os.path.abspath(args.output_dir) | |
if genome == "hg38": | |
lo = LiftOver('hg19', 'hg38') | |
else: | |
lo = LiftOver('hg38', 'hg19') | |
df = pd.read_csv(bed, header=None, sep="\t") | |
# Create new column with the region translated | |
df["region_notation"] = df.mapply( | |
lambda row: get_coordinate(row, lo), axis=1 | |
) | |
# Split the "region_notation" content into its individual columns | |
df[f"{genome}_chr"] = df.apply( | |
lambda row: row["region_notation"][0], axis=1 | |
) | |
df[f"{genome}_start"] = df.apply( | |
lambda row: row["region_notation"][1], axis=1 | |
) | |
df[f"{genome}_end"] = df.apply( | |
lambda row: row["region_notation"][3], axis=1 | |
) | |
# filter the rows that wasn't translated and that start is greater than end | |
coord_df = df[~df[f"{genome}_chr"].str.contains("not")] | |
filtered_df = coord_df[coord_df[f"{genome}_start"]<coord_df[f"{genome}end"]] | |
filtered_df = filtered_df.sort_values([f"{genome}_chr", f"{genome}_start", f"{genome}_end"]) | |
del df, coord_df | |
# Create new bed file with converted genomic coordinates | |
filtered_df.to_csv( | |
bed.replace(".bed", f"_{genome}.bed"), | |
sep="\t", | |
index=False, | |
header=False, | |
columns=[f"{genome}_chr", f"{genome}_start", f"{genome}_end"] | |
) | |
if __name__ == "__main__": | |
""" Arguments for the main function """ | |
parser = argparse.ArgumentParser( | |
description="Script to liftover BED", usage='''Usage: | |
python liftover_bed.py -b file.bed | |
''') | |
parser.add_argument("-b", "--bed", help="BED file", type=str, required=True) | |
parser.add_argument("-o", "--output_dir", help="Path to output directory", type=str) | |
parser.add_argument("-g", "--genome", help="Genome you want to translate to (default: hg38 [hg19 > hg38])", type=str, required=False, default="hg38") | |
parser.set_defaults(func=main) | |
if len(sys.argv) == 1: | |
parser.print_help() | |
sys.exit(1) | |
args = parser.parse_args() | |
args.func(args) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment