arq5x · March 10, 2011 01:03
diff --git a/transcripts-w-groupBy.sh b/transcripts-w-groupBy.sh
 # Step 1: Get transcripts from UCSC refGene (hg19) into a BED file.
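 # Notes:
 #      the awk statement reorders the "raw" refGene columns into a BED12-style layout
 #      bed12ToBed6 splits each BED12 record into discrete BED6 entries, one per exon
 #           - the -n option is new; it is available in the bedtools repository
 #      NOTE(review): the last two fields printed by awk ($10,$11) appear to be
 #           refGene's absolute exonStarts/exonEnds, whereas BED12 expects
 #           blockSizes and chromStart-relative blockStarts. This would explain
 #           the inflated coordinates in the sample output below (end is ~1.5x
 #           start, e.g. chr12 237002794) - confirm and convert before relying on it.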
 $ curl -s http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/refGene.txt.gz | \
      zcat | \
      awk '{OFS="\t"; print $3,$5,$6,$2,$9,$4,$7,$8,"0",$9,$10,$11}' | \
      bed12ToBed6 -n \
      > refGene.bed

 $ head refGene.bed
 chr19	50595745	50595866	NR_024227	2	-
 chr19	50601082	50601203	NR_024227	2	-
 chr16	5464988	8197482	NM_018992	1	+
 chr16	5478429	8224364	NM_018992	2	+
 chr16	5480400	8228306	NM_018992	3	+
 chr16	5482315	8232136	NM_018992	4	+
 chr16	5484847	8237200	NM_018992	5	+
 chr16	5489792	8247090	NM_018992	6	+
 chr12	237002794	355504191	NM_019086	10	-
 chr12	237007578	355513759	NM_019086	9	-


 # Step 2: Use fastaFromBed to extract the sequence for each exon
	# Step 1: Get transcripts from UCSC refGene (hg19) into a BED file.
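	# Notes:
	# the awk statement reorders the "raw" refGene columns into a BED12-style layout
	# bed12ToBed6 splits each BED12 record into discrete BED6 entries, one per exon
	# - the -n option is new; it is available in the bedtools repository
	# NOTE(review): the last two fields printed by awk ($10,$11) appear to be
	# refGene's absolute exonStarts/exonEnds, whereas BED12 expects blockSizes
	# and chromStart-relative blockStarts. This would explain the inflated
	# coordinates in the sample output below (end is ~1.5x start,
	# e.g. chr12 237002794) - confirm and convert before relying on it.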
	$ curl -s http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/refGene.txt.gz | \
	zcat | \
	awk '{OFS="\t"; print $3,$5,$6,$2,$9,$4,$7,$8,"0",$9,$10,$11}' | \
	bed12ToBed6 -n \
	> refGene.bed

	$ head refGene.bed
	chr19 50595745 50595866 NR_024227 2 -
	chr19 50601082 50601203 NR_024227 2 -
	chr16 5464988 8197482 NM_018992 1 +
	chr16 5478429 8224364 NM_018992 2 +
	chr16 5480400 8228306 NM_018992 3 +
	chr16 5482315 8232136 NM_018992 4 +
	chr16 5484847 8237200 NM_018992 5 +
	chr16 5489792 8247090 NM_018992 6 +
	chr12 237002794 355504191 NM_019086 10 -
	chr12 237007578 355513759 NM_019086 9 -


	# Step 2: Use fastaFromBed to extract the sequence for each exon