Last active
April 23, 2016 13:07
-
-
Save aseetharam/f3699e88e53781185dc9d5738a95c1e1 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ## find the spacers | |
| grep -one "GTGTTCCCCGCGCCAGCGGGGATAAACC.\{32\}" example.list | \ | |
| sed 's/:GTGTTCCCCGCGCCAGCGGGGATAAACC/\t/g' | \ | |
| awk '{print ">"$1"\n"$2}' > example_spacers.fa | |
| # find the crispr sequences, extract the 32 bases spacers adjacent to it and print them as fasta sequence | |
| # multiple spacers from the same sequence are printed with the same sequence id | |
| # to count: | |
| grep -c ">" example_spacers.fa | |
| # will give you total number of spacers | |
| grep ">" example_spacers.fa |sort |uniq |wc -l | |
| # will give you number of lines having spacer spequences (from the original input file) | |
| ## determine the position in the genome | |
| # this requires you to install gmap-gsnap program | |
| # you can download it from here | |
| # http://research-pub.gene.com/gmap/src/gmap-gsnap-2016-04-04.tar.gz | |
| # if you're on Linghtning3, you can just load it as module | |
| # index your reference sequence (I've' included plasmid 1, 2 and ecoli genome-GCF_000005845.2_ASM584v2_genomic.fna in reference.fna) | |
| gmap_build -d tmt1 -D $(pwd) refernece.fna | |
| # map the sequences on to the genome (we will request to print it as psl format) | |
| gmap -D $(pwd) -d tmt1 -t 1 -B 5 -f psl example_spacers.fa > example_blat.psl | |
| # psl formats descritions can be found here http://www.ensembl.org/info/website/upload/psl.html | |
| # you can filter the output and print only the parts you need | |
| awk '$1=="32"' example_blat.psl |cut -f 10,14 > example_blat_filtered.txt | |
| # here you are filtering all the matches that have alignment length of 32 bases long and - | |
| # asking it to print spacer id and reference id only | |
| # alternatively, you can also do something like this: | |
| awk '$1=="32"' example_blat.psl |cut -f 14 |sort |uniq -c | |
| # will give the stats for how many spacers matched to what reference sequences | |
| # for eg. above example file gives me | |
| 6185 NC_000913.3 | |
| 4740 plasmid1 | |
| 280 plasmid2 | |
| # you can make it strict alignment if you impose alignment length, query length, target length to be exact 32 - | |
| # and donn't allow any mis-matches |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment