Skip to content

Instantly share code, notes, and snippets.

@Tabea-K
Last active May 16, 2018 12:18
Show Gist options
  • Select an option

  • Save Tabea-K/59df74d6611cf56250798ef65d8db6be to your computer and use it in GitHub Desktop.

Select an option

Save Tabea-K/59df74d6611cf56250798ef65d8db6be to your computer and use it in GitHub Desktop.
This script is used to convert a "gene" UCSC file into a BED12 file
#!/usr/bin/env python
"""
A script that converts a UCSC gene format into BED12 format
Author: Tabea Kischka
Example for UCSC gene format:
594 NM_058167 chr1 - 1253911 1273854 1255202 1267992 7 1253911,1256044,1256991,1257207,1263345,1267861,1273665, 1255487,1256125,1257130,1257310,1263386,1267992,1273854, 0 UBE2J2 cmpl cmpl 0,0,2,1,2,0,-1,
"""
import sys
import csv
import operator
def longblob2list(longblob):
    """
    Convert a longblob string of comma-separated integers into a list.

    The string may carry a trailing comma, as in the UCSC exon columns:
    "90930917,90931703,90932054," > [90930917, 90931703, 90932054]

    Args:
        longblob: comma-separated integers as one string; may be empty
            or None.

    Returns:
        A list of ints in input order; [] for an empty or None input.

    Raises:
        ValueError: if a non-blank item is not a valid integer.
    """
    if not longblob:
        return []
    # Blank items come from the trailing comma (or stray whitespace around
    # a comma); skip them so int() never sees an empty string.
    return [int(item) for item in longblob.split(',') if item.strip()]
# Column names assigned to the tab-separated input, in file order. They are
# passed to csv.DictReader explicitly, so the first line of the file is
# read as data rather than as a header.
ucsc_table_fieldnames = ["bin", "name", "chrom", "strand",
                         "txStart", "txEnd", "cdsStart", "cdsEnd",
                         "exonCount", "exonStarts",
                         "exonEnds", "score", "name2",
                         "cdsStartStat", "cdsEndStat", "exonFrames"]

# Fail with a readable message instead of an IndexError traceback when the
# input path is missing.
if len(sys.argv) < 2:
    sys.exit("Usage: %s <ucsc_gene_table>" % sys.argv[0])

# The with-block closes the file even if a malformed row raises midway.
with open(sys.argv[1], 'r') as csvfile:
    reader = csv.DictReader(csvfile, fieldnames=ucsc_table_fieldnames,
                            dialect='excel-tab')
    for row in reader:
        exon_starts = longblob2list(row['exonStarts'])
        exon_ends = longblob2list(row['exonEnds'])
        tx_start = int(row['txStart'])

        # Block sizes: length of every exon (end - start).
        block_sizes = ",".join(
            str(end - start) for start, end in zip(exon_starts, exon_ends))
        # Block starts: exon start positions relative to txStart.
        block_starts = ",".join(
            str(start - tx_start) for start in exon_starts)

        bed12_fields = [row['chrom'],
                        tx_start,
                        row['txEnd'],
                        row['name'],
                        0,              # score column is always written as 0
                        row['strand'],
                        row['cdsStart'],
                        row['cdsEnd'],
                        "255,0,255",    # fixed colour value for every record
                        row['exonCount'],
                        block_sizes,
                        block_starts]
        # One tab-separated output line per input row, written to stdout.
        print("\t".join(map(str, bed12_fields)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment