Last active
May 16, 2018 12:18
-
-
Save Tabea-K/59df74d6611cf56250798ef65d8db6be to your computer and use it in GitHub Desktop.
This script converts a UCSC "gene" table file into a BED12 file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| """ | |
| A script that converts a UCSC gene format into BED12 format | |
| Author: Tabea Kischka | |
| Example for UCSC gene format: | |
| 594 NM_058167 chr1 - 1253911 1273854 1255202 1267992 7 1253911,1256044,1256991,1257207,1263345,1267861,1273665, 1255487,1256125,1257130,1257310,1263386,1267992,1273854, 0 UBE2J2 cmpl cmpl 0,0,2,1,2,0,-1, | |
| """ | |
| import sys | |
| import csv | |
| import operator | |
def longblob2list(longblob):
    """
    Convert a comma-separated longblob string into a list of integers.

    "90930917,90931703,90932054," > [90930917, 90931703, 90932054]

    Empty and whitespace-only tokens (such as the one produced by the
    trailing comma) are skipped. An empty or None input yields [].
    Raises ValueError if a non-blank token is not a valid integer.
    """
    if not longblob:
        return []
    # split(',') never yields a bare "," token, so a single blank-token
    # test covers the trailing comma and any stray padding.
    return [int(token) for token in longblob.split(',') if token.strip()]
# Column names of the UCSC gene table, in the order they appear in each
# tab-separated input line.
ucsc_table_fieldnames = ["bin", "name", "chrom", "strand",
                         "txStart", "txEnd", "cdsStart", "cdsEnd",
                         "exonCount", "exonStarts",
                         "exonEnds", "score", "name2",
                         "cdsStartStat", "cdsEndStat", "exonFrames"]

# Fail with a readable message instead of an IndexError when no input
# file is given on the command line.
if len(sys.argv) < 2:
    sys.exit("usage: %s <ucsc_gene_table>" % sys.argv[0])

# The context manager closes the input even if a malformed row raises.
with open(sys.argv[1], 'r') as csvfile:
    reader = csv.DictReader(csvfile, fieldnames=ucsc_table_fieldnames, dialect='excel-tab')
    for row in reader:
        # Skip '#'-prefixed header/comment lines (such as a column-name
        # header); they would otherwise crash the integer conversions.
        if row['bin'].startswith('#'):
            continue

        exon_starts = longblob2list(row['exonStarts'])
        exon_ends = longblob2list(row['exonEnds'])
        tx_start = int(row['txStart'])

        # Block sizes are exon lengths; block starts are exon start
        # positions expressed relative to the transcript start.
        block_sizes = [end - start for start, end in zip(exon_starts, exon_ends)]
        block_starts = [start - tx_start for start in exon_starts]

        # The 12 output columns, in order.
        bed_fields = [
            row['chrom'],
            tx_start,
            row['txEnd'],
            row['name'],
            0,                 # fixed score
            row['strand'],
            row['cdsStart'],
            row['cdsEnd'],
            "255,0,255",       # fixed RGB colour for every record
            row['exonCount'],
            ",".join(map(str, block_sizes)),
            ",".join(map(str, block_starts)),
        ]
        print("\t".join(map(str, bed_fields)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment