Last active
February 5, 2016 06:29
-
-
Save mbk0asis/6519e4cf098e7a47a1b5 to your computer and use it in GitHub Desktop.
creating mutant genomes
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# change chr order vcf using 'VCF-TOOLS' | |
# to check and change order of vcf | |
$ cat variants.vcf | cut -f 1 | uniq | |
$ vcf-sort -c variants.vcf > variants_sorted.vcf | |
$ cat variants.vcf | cut -f 1 | uniq | |
# download chr fasta files | |
# concatenate chr fasta files in the chr order | |
# make a list of fasta files in the chr order | |
$ nano chrOrder | |
chr1.fa | |
. | |
. | |
chr10.fa | |
. | |
. | |
chr20.fa | |
. | |
chrMT.fa | |
chrX.fa | |
chrY.fa | |
$ cat chrOrder | while read line ; do cat $line >> genome.fa ; done | |
# genome.fa in the chr order has been created | |
# create a dictionary file | |
$ picard-tools CreateSequenceDictionary R= genome.fa O= genome.dict | |
or | |
$ java -jar CreateSequenceDictionary.jar R= genome.fa O= genome.dict | |
# create genome idex | |
$ samtools faidx genome.fa | |
# GATK FastaAlternateReferenceMaker | |
$ java -jar GenomeAnalysisTK.jar | |
-T FastaAlternateReferenceMaker | |
-R reference.fasta | |
-o output.fasta | |
-L input.intervals | |
-V input.vcf | |
[--snpmask mask.vcf] | |
# extract mRNA's in the mutant genome | |
# get an appropriate gff(gtf) for the genome | |
# convert to bed12 using 'gtf2bed12.sh' | |
# GTF input | |
1 pseudogene gene 11869 14412 . + . gene_id "ENSG00000223972"; gene_name "DDX11L1"; gene_source "ensembl_havana"; gene_biotype "pseudogene"; | |
1 processed_transcript transcript 11869 14409 . + . gene_id "ENSG00000223972"; transcript_id "ENST00000456328"; gene_name "DDX11L1"; gene_source "ensembl_havana"; gene_biotype "pseudogene"; transcript_name "DDX11L1-002"; transcript_source "havana"; | |
1 processed_transcript exon 11869 12227 . + . gene_id "ENSG00000223972"; transcript_id "ENST00000456328"; exon_number "1"; gene_name "DDX11L1"; gene_source "ensembl_havana"; gene_biotype "pseudogene"; transcript_name "DDX11L1-002"; transcript_source "havana"; exon_id "ENSE00002234944"; | |
# to extract sequences of transcripts of interest (TOI) | |
$ cat TOI_ID_list | while read line; do grep $line GTF; done > TOI.gtf | |
$ gtf2bed12.sh | |
# or remove unnecessary data (i.e. ncRNA, pseudogenes, and etc.) | |
$ awk '$2 ~ "processed_transcript"' | less | |
1 processed_transcript transcript 11869 14409 . + . gene_id "ENSG00000223972"; transcript_id "ENST00000456328"; gene_name "DDX11L1"; gene_source "ensembl_havana"; gene_biotype "pseudogene"; transcript_name "DDX11L1-002"; transcript_source "havana"; | |
1 processed_transcript exon 11869 12227 . + . gene_id "ENSG00000223972"; transcript_id "ENST00000456328"; exon_number "1"; gene_name "DDX11L1"; gene_source "ensembl_havana"; gene_biotype "pseudogene"; transcript_name "DDX11L1-002"; transcript_source "havana"; exon_id "ENSE00002234944"; | |
1 processed_transcript exon 12613 12721 . + . gene_id "ENSG00000223972"; transcript_id "ENST00000456328"; exon_number "2"; gene_name "DDX11L1"; gene_source "ensembl_havana"; gene_biotype "pseudogene"; transcript_name "DDX11L1-002"; transcript_source "havana"; exon_id "ENSE00003582793"; | |
1 processed_transcript exon 13221 14409 . + . gene_id "ENSG00000223972"; transcript_id "ENST00000456328"; exon_number "3"; gene_name "DDX11L1"; gene_source "ensembl_havana"; gene_biotype "pseudogene"; transcript_name "DDX11L1-002"; transcript_source "havana"; exon_id "ENSE00002312635"; | |
1 processed_transcript gene 141474 173862 . - . gene_id "ENSG00000241860"; gene_name "RP11-34P13.13"; gene_source "havana"; gene_biotype "processed_transcript"; | |
### gtf2bed12.sh ################################### | |
# 'gtfToGenePred' was downloaded form : "http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/gtfToGenePred" | |
chmod +x gtfToGenePred | |
./gtfToGenePred -genePredExt -geneNameAsName2 genes.gtf genes.tmp | |
awk '{print $2"\t"$4"\t"$5"\t"$1"\t0\t"$3"\t"$6"\t"$7"\t0\t"$8"\t"$9"\t"$10}' genes.tmp > genes.bed12 | |
rm genes.tmp | |
#################################################### | |
# BED12 result | |
chr txStart txEnd id score strand cdsStart cdsEnd exonCounts exonStarts exonEnds | |
1 11868 14409 ENST00000456328 0 + 14409 14409 0 3 11868,12612,13220, 12227,12721,14409, | |
1 11871 14412 ENST00000515242 0 + 14412 14412 0 3 11871,12612,13224, 12227,12721,14412, | |
# 'bedtools getfasta' using the BED12 result file to extract mRNA sequences | |
$ bedtools getfasta -fi chr1.fa -bed genes.bed12 -split -name -fo stdout | fold -w 60 > hg_cancer_mRNA | |
$ hg_cancer_mRNA | |
> transcript1 | |
TGCTGGGCTCTGTGGGGGTGGGATCCACTGAGCAAGAGCACTTGGTTCCCTGGCTTCAGC | |
CCCCATTCCAGGGGAGTGAATGGTTCTGTCTCGCTAGGGTTCCAGGTGACACTGGGGTAT | |
GAAAAAAAACTCCTGCAGCTAGCTCAGCATCTGCCTAAACAGCAGCCCAATTTTGTGCTT | |
GAAACTCAGGGCCCTTGTGGTGTAGGTACCCGAGGGAATCTCCTGGTCTGTGGGTTGTGA | |
> transcript2 | |
AGACTGTGTGAAAAGCATAGTATCTGGGCCGAATAGCACTGTCCCTCACATCACGGTCCC | |
TCACGACTTCCCTTGGCTAGAGGAGGGAGTTCCTGACCCCTTGTGCTTTCCAGTGAGGTG | |
ATGCCCCACCCTGCTTCTGCTTGCCTTCTGTGGGCTGCACCTGCTGTCTAATCAGTCCCA | |
GTGAGATGAACTGGGTACCTCAGTTGGAAATGCAGAAATCATCCACCTTCTGCATTGGTC | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment