Skip to content

Instantly share code, notes, and snippets.

View mbk0asis's full-sized avatar

Byungkuk Min mbk0asis

  • Korea Research Institute of Bioscience and Biotechnology (KRIBB)
  • Daejeon, S.Korea
  • 21:45 (UTC +09:00)
View GitHub Profile
@mbk0asis
mbk0asis / plotDE.py
Last active March 19, 2018 00:09
extract count data from stringtie output - copied from <http://www.ccb.jhu.edu/software/stringtie/dl/prepDE.py>
#!/usr/bin/env python2
import re, csv, sys, os, glob, warnings, itertools
from math import ceil
from optparse import OptionParser
from operator import itemgetter
MIN_PYTHON = (2, 7)
if sys.version_info < MIN_PYTHON:
sys.exit("Python %s.%s or later is required.\n" % MIN_PYTHON)
@mbk0asis
mbk0asis / HISAT2-STRINGTIE-BALLGOWN
Created March 19, 2018 00:07
HISAT2-STRINGTIE-BALLGOWN pipeline
# For RNA-seq reads, use "--dta/--downstream-transcriptome-assembly"
# the command is similar to 'bowtie2'
$ hisat2 -x genome -1 read1.fq.gz -2 read2.fq.gz -S Sample.sam -p 8 --dta 2>&1 | tee $l.stat
$ samtools view -bS Sample.sam | samtools sort -@ 8 - Sample.sorted
# The bam files can be fed into other pipelines e.g DESeq, edgeR, and etc
setwd("/home/bk/Desktop/hg38")
dta <- read.csv("L1.6kb.CpG.rel.pos.csv",header = F, sep = ",")
dim(dta)
dta.t <- t(dta[,-1])
dta.t <- dta.t[,colSums(is.na(dta.t))<nrow(dta.t)]
nc <- ncol(dta.t)
res <- lapply(1:nc,function(i) {
h<-hist(dta.t[,i], plot=F, breaks = 10)
# To caluculate CpG density and distribution in repeat elements. (ERVs, LINE, and etc.)
## extract information of repeats from repeatMasker database
$ zcat hg38.repeat.masker.txt.gz | head
#bin swScore milliDiv milliDel milliIns genoName genoStart genoEnd genoLeft strand repName repClass repFamily repStart repEndrepLeft id
0 1892 83 59 14 chr1 67108753 67109046 -181847376 + L1P5 LINE L1 5301 5607 -544 1
1 2582 27 0 23 chr1 8388315 8388618 -240567804 - AluY SINE Alu -15 296 1 1
1 4085 171 77 36 chr1 25165803 25166380 -223790042 + L1MB5 LINE L1 5567 6174 0 4
1 2285 91 0 13 chr1 33554185 33554483 -215401939 - AluSc SINE Alu -6 303 10 6
1 2451 64 3 26 chr1 41942894 41943205 -207013217 - AluY SINE Alu -7 304 1 8
library(ggplot2)
library(ggpubr)
setwd("/home/bio0/00-NGS/SETDB1_TCGA")
dta <- read.csv("exp.LUNG.FPKM.EpiStem.ggplot.2.csv", header = F)
colnames(dta) <- c("Gene","Symbol","Sample","FPKM","Group")
names(dta)
dta2 <- dta[grep("DNMT1", dta$Symbol), ]
# Computing matrix
computeMatrix scale-regions -S Muscle.2.bin500.bw Muscle.20.bin500.bw Muscle.28.bin500.bw \
--skipZeros -R mm10/$1.bed -o Muscle.$1.bin500.matrix --outFileNameMatrix Muscle.$1.bin500.matrix.tsv \
-bs 10 -a 5000 -b 5000 --regionBodyLength 10000 \
-–blackListFileName blackList.bed # you may mask specific regions with "black list"
# Plotting heatmap
plotHeatmap -m Muscle.$1.bin500.matrix -out Muscle.$1.bin500.Heatmap.png --colorList 'white,black' # --zMax 10
# Plotting profile
dta <- read.csv("file:///C:/Users/bk/Desktop/TEST_data.csv",header = T, row.names = 1)
head(dta)
cl <- kmeans(cntNorm,6)
cluster<-cl$cluster
table(cluster)
rNames <- rownames(cnrNorm)
df<-data.frame(cntNorm, rNames, cluster) # attach cluster info on the data frame
head(df)
# drawing a scatter plot and regression line
# draw a scatter plot
library(LSD)
heatscatter( dta$a, dta$b, cor = TRUE, method = "pearson" )
# add regression line
abline( lm( dta$b ~ dta$a ) ) # switch columns (a<-->b)
# data loading
dta <- read.csv("file:///C:/Users/bk/Desktop/test2.csv",header = T)
dta
# boxplot data
library(ggplot2)
ggplot(dta, aes(x=group,y=count)) +
geom_boxplot()
# linear regression model for data