Last active
March 4, 2018 08:37
-
-
Save mbk0asis/bdac469845aa5e3fe321a8f76b834711 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# To caluculate CpG density and distribution in repeat elements. (ERVs, LINE, and etc.) | |
## extract information of repeats from repeatMasker database | |
$ zcat hg38.repeat.masker.txt.gz | head | |
#bin swScore milliDiv milliDel milliIns genoName genoStart genoEnd genoLeft strand repName repClass repFamily repStart repEndrepLeft id | |
0 1892 83 59 14 chr1 67108753 67109046 -181847376 + L1P5 LINE L1 5301 5607 -544 1 | |
1 2582 27 0 23 chr1 8388315 8388618 -240567804 - AluY SINE Alu -15 296 1 1 | |
1 4085 171 77 36 chr1 25165803 25166380 -223790042 + L1MB5 LINE L1 5567 6174 0 4 | |
1 2285 91 0 13 chr1 33554185 33554483 -215401939 - AluSc SINE Alu -6 303 10 6 | |
1 2451 64 3 26 chr1 41942894 41943205 -207013217 - AluY SINE Alu -7 304 1 8 | |
1 1587 272 100 49 chr1 50331336 50332274 -198624148 + HAL1 LINE L1 773 1763 -744 9 | |
1 1393 280 82 51 chr1 58719764 58720546 -190235876 + L2a LINE L2 2582 3418 -8 1 | |
2 5372 165 14 27 chr1 75496057 75497775 -173458647 + L1MA9 LINE L1 5168 6868 -30 1 | |
2 536 349 146 56 chr1 92274205 92275925 -156680497 + L2 LINE L2 406 2306 -1113 1 | |
$ zcat hg38.repeat.masker.txt.gz | \ | |
awk 'OFS="\t" {if($13 ~ /^ERVL$/ && $8-$7 >= 5000) print $6,$7,$8,0,$11,$10,$12,$13 }' \ | |
> ERVL.5kb.bed | |
$ head ERVL.5kb.bed | |
chr1 175242966 175248539 0 HERVL-int + LTR ERVL | |
chr1 37365527 37371193 0 HERVL-int + LTR ERVL | |
chr1 38144849 38150489 0 HERVL-int + LTR ERVL | |
chr1 49573231 49578830 0 HERVL-int + LTR ERVL | |
chr1 64186957 64192253 0 HERVL18-int + LTR ERVL | |
# Get gDNA sequences of each repeat. | |
# '-s' to obtain stranded sequnces | |
$ bedtools getfasta -s -fi genome.fa -bed ERVL.5kb.bed -fo ERVL.5kb.fasta | |
# split the fasta into single individual sequences | |
$ mkdir tmp | |
$ faSplit byname ERVL.fasta tmp/ | |
$ cd tmp | |
$ for f in *.fa; do | |
fasta_formatter -i $f -o $f.fasta; | |
done | |
# calculate relative positions of 'CpG' | |
$ for f in *.fasta ; | |
do name=$(grep ">" $f) ; | |
pos=$(grep -v ">" $f | grep -bo cg | cut -d: -f1) ; | |
len=$(grep -v ">" $f | wc -c) ; | |
echo $name" "$len" "$pos ; | |
done | awk -F" " '{printf $1;for(i=3;i<=NF;i++){printf ","$i/$2};printf "\n"}' > ../result.csv | |
$ cd .. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment