Last active
August 14, 2020 18:07
-
-
Save wbgalvao/924b6aad83af0e59fa7d690644555a42 to your computer and use it in GitHub Desktop.
Transform .bed files genomic coordinates from hg38 to hg19
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
import pandas as pd | |
from pyliftover import LiftOver | |
target = sys.argv[1] | |
lo = LiftOver("hg38", "hg19") | |
unconverted_regions = open( | |
".".join( | |
[ | |
os.path.basename(os.path.splitext(target)[0]), | |
"unconverted_regions", | |
"bed", | |
] | |
), | |
"w", | |
) | |
def get_hg19_coordinate(row): | |
chr = row["chromosome"] | |
start = row["start"] | |
end = row["end"] | |
try: | |
return lo.convert_coordinate(chr, start, end)[0] | |
except: | |
unconverted_regions.write( | |
"{}\t{}\t{}\n".format(chr, str(start), str(end)) | |
) | |
return None | |
def get_region_notation(row): | |
notation = row["notation"] | |
region_notation = "|".join(notation.split("|")[:4]) | |
return region_notation | |
def add_region_count(row): | |
notation = row["notation"] | |
count = row["region_count"] | |
return "{}|{}".format(count, notation) | |
hg38 = pd.read_csv( | |
target, | |
sep="\t", | |
header=None, | |
names=["chromosome", "start", "end", "notation"], | |
skiprows=2, | |
) | |
# Create new column in dataframe for identifying coordinates in the same gene | |
hg38["region_notation"] = hg38.apply( | |
lambda row: get_region_notation(row), axis=1 | |
) | |
# Create new column in dataframe for enumerating coordinates in the same gene | |
hg38["region_count"] = hg38.groupby(["region_notation"]).cumcount() | |
# Add numeration to the "notation" column | |
hg38["notation"] = hg38.apply(lambda row: add_region_count(row), axis=1) | |
# Use LiftOver to convert current genomic coordinates - store results | |
# in the "converted_coordinates" column | |
hg38["converted_coordinates"] = hg38.apply( | |
lambda row: get_hg19_coordinate(row), axis=1 | |
) | |
# Drop rows where the conversion could not be made | |
hg38 = hg38[hg38.converted_coordinates.notnull()] | |
# Split the "converted_coordinates" content into its individual columns | |
hg38["hg19_chromosome"] = hg38.apply( | |
lambda row: row["converted_coordinates"][0], axis=1 | |
) | |
hg38["hg19_start"] = hg38.apply( | |
lambda row: row["converted_coordinates"][1], axis=1 | |
) | |
hg38["hg19_end"] = hg38.apply( | |
lambda row: row["converted_coordinates"][3], axis=1 | |
) | |
# Create new bed file with converted genomic coordinates | |
hg38.to_csv( | |
".".join([os.path.basename(os.path.splitext(target)[0]), "hg19", "bed"]), | |
sep="\t", | |
index=False, | |
header=False, | |
columns=["hg19_chromosome", "hg19_start", "hg19_end", "notation"], | |
) | |
unconverted_regions.close() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment