aseetharam · April 23, 2016 13:07
diff --git a/spacers_2.sh b/spacers_2.sh
 ## find the spacers
 grep -one "GTGTTCCCCGCGCCAGCGGGGATAAACC.\{32\}" example.list | \
 sed 's/:GTGTTCCCCGCGCCAGCGGGGATAAACC/\t/g'  | \
 awk '{print ">"$1"\n"$2}' > example_spacers.fa
 # find the crispr sequences, extract the 32 bases spacers adjacent to it and print them as fasta sequence
 # multiple spacers from the same sequence are printed with the same sequence id
 # to count:
 grep -c ">" example_spacers.fa
 # will give you total number of spacers
 grep ">" example_spacers.fa |sort |uniq |wc -l
 # will give you number of lines having spacer spequences (from the original input file)

 ## determine the position in the genome
 # this requires you to install gmap-gsnap program 
 # you can download it from here
 # http://research-pub.gene.com/gmap/src/gmap-gsnap-2016-04-04.tar.gz
 # if you're on Linghtning3, you can just load it as module

 # index your reference sequence (I've' included plasmid 1, 2 and ecoli genome-GCF_000005845.2_ASM584v2_genomic.fna in reference.fna)
 gmap_build -d tmt1 -D $(pwd) refernece.fna
 # map the sequences on to the genome (we will request to print it as psl format)
 gmap -D $(pwd) -d tmt1 -t 1 -B 5 -f psl example_spacers.fa > example_blat.psl
 # psl formats descritions can be found here http://www.ensembl.org/info/website/upload/psl.html

 # you can filter the output and print only the parts you need
 awk '$1=="32"' example_blat.psl |cut -f 10,14 > example_blat_filtered.txt
 # here you are filtering all the matches that have alignment length of 32 bases long and -
 # asking it to print spacer id and reference id only
 # alternatively, you can also do something like this:
 awk '$1=="32"' example_blat.psl |cut -f 14 |sort |uniq -c
 # will give the stats for how many spacers matched to what reference sequences 
 # for eg. above example file gives me
   6185 NC_000913.3
   4740 plasmid1
    280 plasmid2
 # you can make it strict alignment if you impose alignment length, query length, target length to be exact 32 -
 # and donn't allow any mis-matches
	## find the spacers
	grep -one "GTGTTCCCCGCGCCAGCGGGGATAAACC.\{32\}" example.list \| \
	sed 's/:GTGTTCCCCGCGCCAGCGGGGATAAACC/\t/g' \| \
	awk '{print ">"$1"\n"$2}' > example_spacers.fa
	# find the crispr sequences, extract the 32 bases spacers adjacent to it and print them as fasta sequence
	# multiple spacers from the same sequence are printed with the same sequence id
	# to count:
	grep -c ">" example_spacers.fa
	# will give you total number of spacers
	grep ">" example_spacers.fa \|sort \|uniq \|wc -l
	# will give you number of lines having spacer spequences (from the original input file)

	## determine the position in the genome
	# this requires you to install gmap-gsnap program
	# you can download it from here
	# http://research-pub.gene.com/gmap/src/gmap-gsnap-2016-04-04.tar.gz
	# if you're on Linghtning3, you can just load it as module

	# index your reference sequence (I've' included plasmid 1, 2 and ecoli genome-GCF_000005845.2_ASM584v2_genomic.fna in reference.fna)
	gmap_build -d tmt1 -D $(pwd) refernece.fna
	# map the sequences on to the genome (we will request to print it as psl format)
	gmap -D $(pwd) -d tmt1 -t 1 -B 5 -f psl example_spacers.fa > example_blat.psl
	# psl formats descritions can be found here http://www.ensembl.org/info/website/upload/psl.html

	# you can filter the output and print only the parts you need
	awk '$1=="32"' example_blat.psl \|cut -f 10,14 > example_blat_filtered.txt
	# here you are filtering all the matches that have alignment length of 32 bases long and -
	# asking it to print spacer id and reference id only
	# alternatively, you can also do something like this:
	awk '$1=="32"' example_blat.psl \|cut -f 14 \|sort \|uniq -c
	# will give the stats for how many spacers matched to what reference sequences
	# for eg. above example file gives me
	6185 NC_000913.3
	4740 plasmid1
	280 plasmid2
	# you can make it strict alignment if you impose alignment length, query length, target length to be exact 32 -
	# and donn't allow any mis-matches