Last active
February 6, 2020 08:44
-
-
Save endrebak/9d0469021b779f62d3d26724b6d3136f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pyranges_db as db | |
import pyranges as pr | |
gr = db.gencode.genes("human") # takes a while to download from ftp | |
# Wall time: 2min 1s | |
gr.to_gtf("gencode_human.gtf.gz") # takes a while to gzip and write to disk | |
# Wall time: 4min 3s | |
# subset for faster operations | |
gr = gr[["Feature", "gene_type", "gene_id", "gene_name"]] | |
gr = gr[gr.Feature == "transcript"].drop("Feature") | |
lncrna = gr[(gr.gene_type == "lncRNA")].drop("gene_type") | |
protein = gr[gr.gene_type == "protein_coding"].drop("gene_type") | |
_overlap = lncrna.join(protein, strandedness="same", suffix="_protein").drop( | |
like="Start_protein|End_protein|Strand_protein") | |
_overlap.Distance = 0 | |
no_overlap_protein = lncrna.overlap(protein, invert=True, strandedness="same") | |
_upstream = no_overlap_protein.nearest(protein, how="upstream", overlap=False, strandedness="same", suffix="_protein").drop( | |
like="Start_protein|End_protein|Strand_protein") | |
_downstream = no_overlap_protein.nearest(protein, how="downstream", overlap=False, strandedness="same", suffix="_protein").drop( | |
like="Start_protein|End_protein|Strand_protein|gene_type") | |
_downstream.Distance = -_downstream.Distance | |
outgr = pr.concat([_overlap, _downstream, _upstream]) | |
outgr | |
# +--------------+-----------+-----------+--------------+-------------------+--------------+--------------------+---------------------+------------+ | |
# | Chromosome | Start | End | Strand | gene_id | gene_name | gene_id_protein | gene_name_protein | Distance | | |
# | (category) | (int32) | (int32) | (category) | (category) | (category) | (category) | (category) | (int64) | | |
# |--------------+-----------+-----------+--------------+-------------------+--------------+--------------------+---------------------+------------| | |
# | chr1 | 1055032 | 1056116 | + | ENSG00000242590.1 | AL645608.5 | ENSG00000188157.15 | AGRN | 0 | | |
# | chr1 | 1055032 | 1056116 | + | ENSG00000242590.1 | AL645608.5 | ENSG00000188157.15 | AGRN | 0 | | |
# | chr1 | 1055032 | 1056116 | + | ENSG00000242590.1 | AL645608.5 | ENSG00000188157.15 | AGRN | 0 | | |
# | chr1 | 1055032 | 1056116 | + | ENSG00000242590.1 | AL645608.5 | ENSG00000188157.15 | AGRN | 0 | | |
# | ... | ... | ... | ... | ... | ... | ... | ... | ... | | |
# | chrY | 22296797 | 22298876 | - | ENSG00000215560.2 | TTTY5 | ENSG00000188120.14 | DAZ1 | 830479 | | |
# | chrY | 22439592 | 22441458 | - | ENSG00000131538.7 | TTTY6 | ENSG00000188120.14 | DAZ1 | 687897 | | |
# | chrY | 23936726 | 23941622 | - | ENSG00000280961.1 | TTTY3B | ENSG00000172352.5 | CDY1B | 103607 | | |
# | chrY | 24183433 | 24187231 | - | ENSG00000244231.1 | CSPG4P2Y | ENSG00000187191.14 | DAZ3 | 575838 | | |
# +--------------+-----------+-----------+--------------+-------------------+--------------+--------------------+---------------------+------------+ | |
# Stranded PyRanges object has 100,466 rows and 9 columns from 24 chromosomes. | |
# For printing, the PyRanges was sorted on Chromosome and Strand. | |
outgr.to_csv("lncrna_prot_join.csv", sep="\t", header=True) | |
# !head lncrna_prot_join.csv | |
# Chromosome Start End Strand gene_id gene_name gene_id_protein gene_name_protein Distance | |
# chr1 1055032 1056116 + ENSG00000242590.1 AL645608.5 ENSG00000188157.15 AGRN 0 | |
# chr1 1055032 1056116 + ENSG00000242590.1 AL645608.5 ENSG00000188157.15 AGRN 0 | |
# chr1 1055032 1056116 + ENSG00000242590.1 AL645608.5 ENSG00000188157.15 AGRN 0 | |
# chr1 1055032 1056116 + ENSG00000242590.1 AL645608.5 ENSG00000188157.15 AGRN 0 | |
# chr1 1055032 1056116 + ENSG00000242590.1 AL645608.5 ENSG00000188157.15 AGRN 0 | |
# chr1 7368941 7370270 + ENSG00000237402.1 CAMTA1-IT1 ENSG00000171735.19 CAMTA1 0 | |
# chr1 7776382 7776775 + ENSG00000269925.1 Z98884.2 ENSG00000049245.13 VAMP3 0 | |
# chr1 7776382 7776775 + ENSG00000269925.1 Z98884.2 ENSG00000049245.13 VAMP3 0 | |
# chr1 9942922 9949974 + ENSG00000228150.1 AL357140.2 ENSG00000173614.14 NMNAT1 0 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment