mbk0asis · March 4, 2018 08:37
diff --git a/CpG density calculator - bash b/CpG density calculator - bash
 # To caluculate CpG density and distribution in repeat elements. (ERVs, LINE, and etc.)

 ## extract information of repeats from repeatMasker database
 $ zcat hg38.repeat.masker.txt.gz | head
 #bin	swScore	milliDiv	milliDel	milliIns	genoName	genoStart	genoEnd	genoLeft	strand	repName	repClass	repFamily	repStart	repEndrepLeft	id
 0	1892	83	59	14	chr1	67108753	67109046	-181847376	+	L1P5	LINE	L1	5301	5607	-544	1
 1	2582	27	0	23	chr1	8388315	8388618	-240567804	-	AluY	SINE	Alu	-15	296	1	1
 1	4085	171	77	36	chr1	25165803	25166380	-223790042	+	L1MB5	LINE	L1	5567	6174	0	4
 1	2285	91	0	13	chr1	33554185	33554483	-215401939	-	AluSc	SINE	Alu	-6	303	10	6
 1	2451	64	3	26	chr1	41942894	41943205	-207013217	-	AluY	SINE	Alu	-7	304	1	8
 1	1587	272	100	49	chr1	50331336	50332274	-198624148	+	HAL1	LINE	L1	773	1763	-744	9
 1	1393	280	82	51	chr1	58719764	58720546	-190235876	+	L2a	LINE	L2	2582	3418	-8	1
 2	5372	165	14	27	chr1	75496057	75497775	-173458647	+	L1MA9	LINE	L1	5168	6868	-30	1
 2	536	349	146	56	chr1	92274205	92275925	-156680497	+	L2	LINE	L2	406	2306	-1113	1
 $ zcat hg38.repeat.masker.txt.gz | \
 awk 'OFS="\t" {if($13 ~ /^ERVL$/ && $8-$7 >= 5000) print $6,$7,$8,0,$11,$10,$12,$13 }' \
 > ERVL.5kb.bed
 $ head ERVL.5kb.bed
 chr1	175242966	175248539	0	HERVL-int	+	LTR	ERVL
 chr1	37365527	37371193	0	HERVL-int	+	LTR	ERVL
 chr1	38144849	38150489	0	HERVL-int	+	LTR	ERVL
 chr1	49573231	49578830	0	HERVL-int	+	LTR	ERVL
 chr1	64186957	64192253	0	HERVL18-int	+	LTR	ERVL


 # Get gDNA sequences of each repeat. 
 # '-s' to obtain stranded sequnces
 $ bedtools getfasta -s -fi genome.fa -bed ERVL.5kb.bed -fo ERVL.5kb.fasta 

 # split the fasta into single individual sequences
 $ mkdir tmp
 $ faSplit byname ERVL.fasta tmp/
 $ cd tmp

 $ for f in *.fa; do 
 fasta_formatter -i $f -o $f.fasta;
 done


 # calculate relative positions of 'CpG'
 $ for f in *.fasta ; 
  do name=$(grep ">" $f) ; 
  pos=$(grep -v ">" $f | grep -bo cg | cut -d: -f1) ; 
  len=$(grep -v ">" $f | wc -c) ;
  echo $name" "$len" "$pos ; 
 done | awk -F" " '{printf $1;for(i=3;i<=NF;i++){printf ","$i/$2};printf "\n"}' > ../result.csv
 $ cd ..
	# To caluculate CpG density and distribution in repeat elements. (ERVs, LINE, and etc.)

	## extract information of repeats from repeatMasker database
	$ zcat hg38.repeat.masker.txt.gz \| head
	#bin swScore milliDiv milliDel milliIns genoName genoStart genoEnd genoLeft strand repName repClass repFamily repStart repEndrepLeft id
	0 1892 83 59 14 chr1 67108753 67109046 -181847376 + L1P5 LINE L1 5301 5607 -544 1
	1 2582 27 0 23 chr1 8388315 8388618 -240567804 - AluY SINE Alu -15 296 1 1
	1 4085 171 77 36 chr1 25165803 25166380 -223790042 + L1MB5 LINE L1 5567 6174 0 4
	1 2285 91 0 13 chr1 33554185 33554483 -215401939 - AluSc SINE Alu -6 303 10 6
	1 2451 64 3 26 chr1 41942894 41943205 -207013217 - AluY SINE Alu -7 304 1 8
	1 1587 272 100 49 chr1 50331336 50332274 -198624148 + HAL1 LINE L1 773 1763 -744 9
	1 1393 280 82 51 chr1 58719764 58720546 -190235876 + L2a LINE L2 2582 3418 -8 1
	2 5372 165 14 27 chr1 75496057 75497775 -173458647 + L1MA9 LINE L1 5168 6868 -30 1
	2 536 349 146 56 chr1 92274205 92275925 -156680497 + L2 LINE L2 406 2306 -1113 1
	$ zcat hg38.repeat.masker.txt.gz \| \
	awk 'OFS="\t" {if($13 ~ /^ERVL$/ && $8-$7 >= 5000) print $6,$7,$8,0,$11,$10,$12,$13 }' \
	> ERVL.5kb.bed
	$ head ERVL.5kb.bed
	chr1 175242966 175248539 0 HERVL-int + LTR ERVL
	chr1 37365527 37371193 0 HERVL-int + LTR ERVL
	chr1 38144849 38150489 0 HERVL-int + LTR ERVL
	chr1 49573231 49578830 0 HERVL-int + LTR ERVL
	chr1 64186957 64192253 0 HERVL18-int + LTR ERVL


	# Get gDNA sequences of each repeat.
	# '-s' to obtain stranded sequnces
	$ bedtools getfasta -s -fi genome.fa -bed ERVL.5kb.bed -fo ERVL.5kb.fasta

	# split the fasta into single individual sequences
	$ mkdir tmp
	$ faSplit byname ERVL.fasta tmp/
	$ cd tmp

	$ for f in *.fa; do
	fasta_formatter -i $f -o $f.fasta;
	done


	# calculate relative positions of 'CpG'
	$ for f in *.fasta ;
	do name=$(grep ">" $f) ;
	pos=$(grep -v ">" $f \| grep -bo cg \| cut -d: -f1) ;
	len=$(grep -v ">" $f \| wc -c) ;
	echo $name" "$len" "$pos ;
	done \| awk -F" " '{printf $1;for(i=3;i<=NF;i++){printf ","$i/$2};printf "\n"}' > ../result.csv
	$ cd ..