lukauskas · September 3, 2020 15:17
diff --git a/transcript_bed_to_tss_bed.py b/transcript_bed_to_tss_bed.py
 import pandas as pd

 def transcript_to_tss(transcript_bed, tss_bed):
    """
        Convert a transcript BED file into a BED file of 1-bp TSS intervals.

        The transcription start site (TSS) is strand aware: for '+' strand
        records it is the BED start, for '-' strand records it is the last
        base of the interval (end - 1, because BED ends are exclusive).

        Args:
            transcript_bed: Tab-separated BED input without a header row.
                Only the first six fields (chrom, start, end, name, score,
                strand) are read; any further fields are ignored.
            tss_bed: Destination for the six-column, tab-separated output,
                sorted by chrom, start and end.

        Raises:
            ValueError: If any record has a strand other than '+' or '-',
                since no TSS can be derived for it.
    """
    columns = ['chrom', 'start', 'end', 'name', 'score', 'strand']

    # Restrict parsing to the first six fields. Without usecols, a wider
    # file (e.g. BED12) would have its leading fields consumed as an index
    # and the six names attached to the wrong columns.
    df = pd.read_csv(transcript_bed,
                     sep='\t',
                     header=None,
                     names=columns,
                     usecols=list(range(len(columns))))

    on_plus = df['strand'] == '+'
    on_minus = df['strand'] == '-'

    # Fail early with a readable message instead of an opaque int(None)
    # TypeError further down.
    unstranded = ~(on_plus | on_minus)
    if unstranded.any():
        bad = sorted(set(df.loc[unstranded, 'strand'].astype(str)))
        raise ValueError(
            "Cannot determine TSS for strand value(s) {}; "
            "expected '+' or '-'".format(', '.join(repr(s) for s in bad)))

    # '+' transcripts start at 'start'; '-' transcripts start at the last
    # base of the interval, i.e. end - 1 as the BED end is not inclusive.
    tss = df['start'].where(on_plus, df['end'] - 1)

    # Regardless of direction the TSS interval has length 1.
    df['start'] = tss.astype(int)
    df['end'] = df['start'] + 1

    # Re-sort and save to bed file again.
    df = df[columns].sort_values(by=['chrom', 'start', 'end'])
    df.to_csv(tss_bed, sep='\t', header=False, index=False)


 if __name__ == '__main__':
    # Script entry point: read 'transcripts.bed' and write 'tss.bed',
    # both resolved relative to the current working directory.
    transcript_to_tss(transcript_bed='transcripts.bed', tss_bed='tss.bed')
diff --git a/transcripts.bed b/transcripts.bed
 chr5	126423409	126494364	PlusStrand	1	+
 chr5	126531200	126595219	MinusStrand	1	-
diff --git a/tss.bed b/tss.bed
 chr5	126423409	126423410	PlusStrand	1	+
 chr5	126595218	126595219	MinusStrand	1	-
	import pandas as pd

	def transcript_to_tss(transcript_bed, tss_bed):
	    """
	    Convert a transcript BED file into a BED file of 1-bp TSS intervals.

	    The transcription start site (TSS) is strand aware: for '+' strand
	    records it is the BED start, for '-' strand records it is the last
	    base of the interval (end - 1, because BED ends are exclusive).

	    Args:
	        transcript_bed: Tab-separated BED input without a header row.
	            Only the first six fields (chrom, start, end, name, score,
	            strand) are read; any further fields are ignored.
	        tss_bed: Destination for the six-column, tab-separated output,
	            sorted by chrom, start and end.

	    Raises:
	        ValueError: If any record has a strand other than '+' or '-',
	            since no TSS can be derived for it.
	    """
	    columns = ['chrom', 'start', 'end', 'name', 'score', 'strand']

	    # Restrict parsing to the first six fields. Without usecols, a wider
	    # file (e.g. BED12) would have its leading fields consumed as an index
	    # and the six names attached to the wrong columns.
	    df = pd.read_csv(transcript_bed,
	                     sep='\t',
	                     header=None,
	                     names=columns,
	                     usecols=list(range(len(columns))))

	    on_plus = df['strand'] == '+'
	    on_minus = df['strand'] == '-'

	    # Fail early with a readable message instead of an opaque int(None)
	    # TypeError further down.
	    unstranded = ~(on_plus | on_minus)
	    if unstranded.any():
	        bad = sorted(set(df.loc[unstranded, 'strand'].astype(str)))
	        raise ValueError(
	            "Cannot determine TSS for strand value(s) {}; "
	            "expected '+' or '-'".format(', '.join(repr(s) for s in bad)))

	    # '+' transcripts start at 'start'; '-' transcripts start at the last
	    # base of the interval, i.e. end - 1 as the BED end is not inclusive.
	    tss = df['start'].where(on_plus, df['end'] - 1)

	    # Regardless of direction the TSS interval has length 1.
	    df['start'] = tss.astype(int)
	    df['end'] = df['start'] + 1

	    # Re-sort and save to bed file again.
	    df = df[columns].sort_values(by=['chrom', 'start', 'end'])
	    df.to_csv(tss_bed, sep='\t', header=False, index=False)


	if __name__ == '__main__':
	    # Script entry point: read 'transcripts.bed' and write 'tss.bed',
	    # both resolved relative to the current working directory.
	    transcript_to_tss('transcripts.bed', 'tss.bed')
	chr5 126423409 126494364 PlusStrand 1 +
	chr5 126531200 126595219 MinusStrand 1 -
	chr5 126423409 126423410 PlusStrand 1 +
	chr5 126595218 126595219 MinusStrand 1 -