Skip to content

Instantly share code, notes, and snippets.

@mbk0asis
Last active March 4, 2018 08:37
Show Gist options
  • Save mbk0asis/bdac469845aa5e3fe321a8f76b834711 to your computer and use it in GitHub Desktop.
Save mbk0asis/bdac469845aa5e3fe321a8f76b834711 to your computer and use it in GitHub Desktop.
# To caluculate CpG density and distribution in repeat elements. (ERVs, LINE, and etc.)
## extract information of repeats from repeatMasker database
$ zcat hg38.repeat.masker.txt.gz | head
#bin swScore milliDiv milliDel milliIns genoName genoStart genoEnd genoLeft strand repName repClass repFamily repStart repEndrepLeft id
0 1892 83 59 14 chr1 67108753 67109046 -181847376 + L1P5 LINE L1 5301 5607 -544 1
1 2582 27 0 23 chr1 8388315 8388618 -240567804 - AluY SINE Alu -15 296 1 1
1 4085 171 77 36 chr1 25165803 25166380 -223790042 + L1MB5 LINE L1 5567 6174 0 4
1 2285 91 0 13 chr1 33554185 33554483 -215401939 - AluSc SINE Alu -6 303 10 6
1 2451 64 3 26 chr1 41942894 41943205 -207013217 - AluY SINE Alu -7 304 1 8
1 1587 272 100 49 chr1 50331336 50332274 -198624148 + HAL1 LINE L1 773 1763 -744 9
1 1393 280 82 51 chr1 58719764 58720546 -190235876 + L2a LINE L2 2582 3418 -8 1
2 5372 165 14 27 chr1 75496057 75497775 -173458647 + L1MA9 LINE L1 5168 6868 -30 1
2 536 349 146 56 chr1 92274205 92275925 -156680497 + L2 LINE L2 406 2306 -1113 1
$ zcat hg38.repeat.masker.txt.gz | \
awk 'OFS="\t" {if($13 ~ /^ERVL$/ && $8-$7 >= 5000) print $6,$7,$8,0,$11,$10,$12,$13 }' \
> ERVL.5kb.bed
$ head ERVL.5kb.bed
chr1 175242966 175248539 0 HERVL-int + LTR ERVL
chr1 37365527 37371193 0 HERVL-int + LTR ERVL
chr1 38144849 38150489 0 HERVL-int + LTR ERVL
chr1 49573231 49578830 0 HERVL-int + LTR ERVL
chr1 64186957 64192253 0 HERVL18-int + LTR ERVL
# Get gDNA sequences of each repeat.
# '-s' to obtain stranded sequnces
$ bedtools getfasta -s -fi genome.fa -bed ERVL.5kb.bed -fo ERVL.5kb.fasta
# split the fasta into single individual sequences
$ mkdir tmp
$ faSplit byname ERVL.fasta tmp/
$ cd tmp
$ for f in *.fa; do
fasta_formatter -i $f -o $f.fasta;
done
# calculate relative positions of 'CpG'
$ for f in *.fasta ;
do name=$(grep ">" $f) ;
pos=$(grep -v ">" $f | grep -bo cg | cut -d: -f1) ;
len=$(grep -v ">" $f | wc -c) ;
echo $name" "$len" "$pos ;
done | awk -F" " '{printf $1;for(i=3;i<=NF;i++){printf ","$i/$2};printf "\n"}' > ../result.csv
$ cd ..
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment