mbk0asis · April 27, 2016 02:09
diff --git a/DESeq b/DESeq
 library(DESeq)
 library(LSD)
 library(ggplot2)
 library(gridExtra)
 library(RColorBrewer)

 # open a count file
 setwd('/home/bio1/00-NGS/RNAseq/bov/GSE44023_Chitwood_et_al/DESeq')
 setwd('/home/bio1/00-NGS/Cancers/cancer_met_Kyuheum')
 countsTable<- read.csv("final.final.csv",row.names=1)
 countsTable <- read.delim("/home/bio2/00-NGS/Hairpin/Bismark_results/Read1_only/All.genomecov.intersect.txt",row.names=1)
 head(countsTable)
 colnames(countsTable) = c("MEF","P0","P1","P2","P6","J1")
 head(countsTable)
 conds<- factor(c("Biliary","Biliary","Biliary","Biliary","Biliary","Biliary",
                 "Other","Other","Other","Other","Other","Other","Other","Other","Other","Other",
                 "Other","Other","Other","Other","Other","Other","Other","Other","Other","Other",
                 "Other","Other","Other","Other","Other","Other","Other","Other","Other","Other",
                 "Other","Other","Other","Other","Other","Other","Other","Other","Other","Other",
                 "Other","Other","Other","Other","Other","Other","Other","Other","Other","Other",
                 "Other","Other","Other","Other","Other","Other","Other","Other","Other","Other",
                 "Other","Other","Other","Other","Other","Other","Other","Other","Other","Other",
                 "Other","Other","Other","Other","Other","Other","Other","Other","Other","Other",
                 "Other","Other","Other","Other","Other","Other"))
 conds<- factor(c("IVF","IVF","IVF",
                 "cSCNT","cSCNT","cSCNT","cSCNT",
                 "fSCNT","fSCNT","fSCNT","fSCNT",
                 "TSA_IVF","TSA_SCNT","BM","BF"))

 heatboxplot(log2(countsTable[,1:2]+1),ylim=c(-0.3,20), notch=TRUE,boxonly=F,
            ylab = "log2 counts", main = "Count Distribution")


 # Define column names
 #colnames(countsTable) = c("geneSymbol","MI-1","MI-2","MI-3","MI-4","MI-5","MI-6",
 "MN-1","MN-2","MN-3","MN-4","MN-5","MN-6",
 "MS-1","MS-2","MS-3","MS-4","MS-5","MS-6",
 "FI-1","FI-2","FI-3","FI-4","FI-5","FI-6",
 "FN-1","FN-2","FN-3","FN-4","FN-5","FN-6",
 "FS-1","FS-2","FS-3","FS-4","FS-5"
 ,"BM","BF")
 #head(countsTable)

 # Define conditons of each column (sample)
 conds<- factor(c("MI","MI","MI","MI","MI","MI",
                "MN","MN","MN","MN","MN","MN",
                "MS","MS","MS","MS","MS","MS",
                "FI","FI","FI","FI","FI","FI",
                "FN","FN","FN","FN","FN","FN",
                "FS","FS","FS","FS","FS",
                "BM","BF"))
 conds<- factor(c("MI","MI","MI","MI","MI","MI",
                 "MN","MN","MN","MN","MN","MN",
                 "FI","FI","FI","FI","FI","FI",
                 "FN","FN","FN","FN","FN","FN",
                 "BM","BF"))

 # Set up cds (count data set)
 cds <- newCountDataSet(countsTable,conds)
 head(cds)

 # Estimate sample size of each sample
 cds <- estimateSizeFactors(cds)
 sizeFactors(cds)
 write.table(sizeFactors(cds),'/home/bio1/00-NGS//RNAseq//bov//niceM//DESeq/sizeFactor.txt')
 countsNormalized <- counts(cds,normalized=T)
 head(countsNormalized)
 write.csv(countsNormalized,"cntNorm_Old.TSA.BESF.csv")

 # Heatscatter plots
 heatboxplot(log2(countsNormalized+1),ylim=c(-0.3,20), notch=TRUE,boxonly=F,
            ylab = "log2 counts", main = "Count Normalized Distribution")

 # scatter plot - individuals
 heatscatter(log2(countsNormalized[,36]),log2(countsNormalized[,14]),
            xlab="BM (log2 expression)",ylab="MS_2 (log2 expression)",
            cor = TRUE, method = "spearman",
            xlim=c(-0.5,17),ylim=c(-0.5,17))
 abline(-1,1,col="black",lwd=2)
 abline(1,1,col="black",lwd=2)
 identify(log(countsNormalized[,1]),log(countsNormalized[,2]),labels=names(countsNormalized[,1]))

 ####################################################
 # Scatter plots (ggplot2)
 head(countsNormalized)
 p <- qplot(log2(countsNormalized[ ,5]+1),log2(countsNormalized[ ,3]+1),xlab="TSA(+) SCNT", ylab="TSA(-) SCNT")
 p + geom_smooth(method="lm", formula = y ~ x,se=F) + geom_abline(intercept = 0,linetype="dotted") + geom_point(size=2, alpha=0.1) + coord_flip() #+ stat_smooth_func(geom="text",method="lm",hjust=0,parse=TRUE) 

 # Stat Smooth Function
 library(proto)
 stat_smooth_func <- function (mapping = NULL, data = NULL, geom = "smooth", position = "identity",
                              method = "auto", formula = y ~ x, se = TRUE, n = 80, fullrange = FALSE,
                              level = 0.95, na.rm = FALSE, ...) {
  StatSmoothFunc$new(mapping = mapping, data = data, geom = geom, position = position,
                     method = method, formula = formula, se = se, n = n, fullrange = fullrange,
                     level = level, na.rm = na.rm, ...)
 }

 StatSmoothFunc <- proto(ggplot2:::Stat, {
  objname <- "smooth"
  
  calculate_groups <- function(., data, scales, method="auto", formula=y~x, ...) {
    rows <- daply(data, .(group), function(df) length(unique(df$x)))
    
    if (all(rows == 1) && length(rows) > 1) {
      message("geom_smooth: Only one unique x value each group.",
              "Maybe you want aes(group = 1)?")
      return(data.frame())
    }
    
    # Figure out what type of smoothing to do: loess for small datasets,
    # gam with a cubic regression basis for large data
    # This is based on the size of the _largest_ group.
    if (identical(method, "auto")) {
      groups <- count(data, "group")
      
      if (max(groups$freq) < 1000) {
        method <- "loess"
        message('geom_smooth: method="auto" and size of largest group is <1000,',
                ' so using loess.',
                ' Use \'method = x\' to change the smoothing method.')
      } else {
        method <- "gam"
        formula <- y ~ s(x, bs = "cs")
        message('geom_smooth: method="auto" and size of largest group is >=1000,',
                ' so using gam with formula: y ~ s(x, bs = "cs").',
                ' Use \'method = x\' to change the smoothing method.')
      }
    }
    if (identical(method, "gam")) try_require("mgcv")
    
    .super$calculate_groups(., data, scales, method = method, formula = formula, ...)
  }
  
  calculate <- function(., data, scales, method="auto", formula=y~x, se = TRUE, n=80, fullrange=FALSE, xseq = NULL, level=0.95, na.rm = FALSE, ...) {
    data <- remove_missing(data, na.rm, c("x", "y"), name="stat_smooth")
    if (length(unique(data$x)) < 2) {
      # Not enough data to perform fit
      return(data.frame())
    }
    
    if (is.null(data$weight)) data$weight <- 1
    
    if (is.null(xseq)) {
      if (is.integer(data$x)) {
        if (fullrange) {
          xseq <- scale_dimension(scales$x, c(0, 0))
        } else {
          xseq <- sort(unique(data$x))
        }
      } else {
        if (fullrange) {
          range <- scale_dimension(scales$x, c(0, 0))
        } else {
          range <- range(data$x, na.rm=TRUE)
        }
        xseq <- seq(range[1], range[2], length=n)
      }
    }
    if (is.character(method)) method <- match.fun(method)
    
    method.special <- function(...)
      method(formula, data=data, weights=weight, ...)
    model <- safe.call(method.special, list(...), names(formals(method)))
    
    predictdf(model, xseq, se, level)
    m = model
    eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2, 
                     list(a = format(coef(m)[1], digits = 3), 
                          b = format(coef(m)[2], digits = 3), 
                          r2 = format(summary(m)$r.squared, digits = 3)))
    func_string = as.character(as.expression(eq))
    
    data.frame(x=min(data$x)*0.9, y=max(data$y)*0.9, label=func_string)
  }
  
  required_aes <- c("x", "y")
  default_geom <- function(.) GeomSmooth
 })
 ###################################################################################

 # Estimate dispersion
 cds <- estimateDispersions(cds)
 cds <- estimateDispersions(cds, method = "blind", fitType = "local", sharingMode = "fit-only")  # for individual comparison

 plotDispEsts(cds)

 #plotPCA
 vsd <- varianceStabilizingTransformation(cds)
 plotPCA(vsd,ntop = 10000)

 # source code for 'plotPCA'
 plotPCA = function(x, intgroup, ntop=500)
 {
  require("lattice")
  require("genefilter")
  rv = rowVars(exprs(x))
  select = order(rv, decreasing=TRUE)[seq_len(ntop)]
  pca = prcomp(t(exprs(x)[select,]))
  fac = factor(apply(pData(vsdFull)[, intgroup], 1, paste, collapse=": "))
  colours = brewer.pal(nlevels(fac), "Paired")   # colors of points
  pcafig = xyplot(PC2 ~ PC1, groups=fac, data=as.data.frame(pca$x), pch=16, cex=2, # can change shapes & sizes of points
    aspect = "iso", col=colours,
    main = draw.key(key = list(
      rect = list(col = colours),
      text = list(levels(fac)),
      rep = FALSE)))
 }

 plotPCA(vsd)
 ##################################################################
 ### MI_vs_MN #####################################################
 ##################################################################
 result_MI_MN = nbinomTest(cds,"MI","MN")
 result_MI_MN = result_MI_MN[order(result_MI_MN$pval), ]
 #write.csv(result_MI_MN,"/home/bio1/00-NGS//RNAseq//bov//niceM//DESeq/DESeq_MI_MN.csv")
 ## Histogram of p-val distribution
 # hist(result_MI_MN$pval, breaks=100, col="skyblue",border="slateblue",main="",xlab="p-value", ylab="Frequency")
 ## MA plot
 with(IVF_TSA, plot(log10(baseMean),log2FoldChange,pch=20,main="MA plot - IVF_TSA",col="lightgrey"))  
 with(subset(IVF_TSA, pval<.05), points(log10(baseMean),log2FoldChange,pch=20, col="red"))
 abline(h=c(-1,1),col="blue")

 #volcano plot
 with(IVF_TSA, plot(log2FoldChange,-log10(pval),pch=19,main="Volcano plot - IVF_TSA",col="lightgrey"))  
 with(subset(IVF_TSA, pval<.05 & abs(log2FoldChange)>1), points(log2FoldChange, -log10(pval), pch=20, col="red"))
 abline(v=c(-1,1),col="blue")

 # Label points with the textxy function from the calibrate plot
 library(calibrate)
 with(subset(IVF_TSA, -log10(pval)>40 & abs(log2FoldChange)>5), textxy(log2FoldChange, -log10(pval), labs=id, offset=-0.5,cex=.7))

 ##################################################################
 ### MI_vs_MS #####################################################
 ##################################################################
 result_MI_MS = nbinomTest(cds,"MI","MS")
 result_MI_MS = result_MI_MS[order(result_MI_MS$pval), ]
 #write.csv(result_MI_MS,"/home/bio1/00-NGS//RNAseq//bov//niceM//DESeq/DESeq_MI_MS.csv")
 # Histogram of p-val distribution
 # hist(result_MI_MS$pval, breaks=100, col="skyblue",border="slateblue",main="",xlab="p-value", ylab="Frequency")
 # MA plot
 plot(
    result_MI_MS$baseMean,
    result_MI_MS$log2FoldChange,
    log="x",pch=20,cex=.7,
    col=ifelse(result_MI_MS$pval < .05 ,"red","grey"),
    xlab="BaseMean", ylab="FoldChange"
    )
 abline(h=c(-1,1),col="blue")

 ##################################################################
 ### MS_vs_MN #####################################################
 ##################################################################
 result_MS_MN = nbinomTest(cds,"MS","MN")
 result_MS_MN = result_MS_MN[order(result_MS_MN$pval), ]
 #write.csv(result_MS_MN,"/home/bio1/00-NGS//RNAseq//bov//niceM//DESeq/DESeq_MS_MN.csv")
 # Histogram of p-val distribution
 # hist(result_MI_MS$pval, breaks=100, col="skyblue",border="slateblue",main="",xlab="p-value", ylab="Frequency")
 # MA plot
 plot(
  result_MS_MN$baseMean,
  result_MS_MN$log2FoldChange,
  log="x",pch=20,cex=.7,
  col=ifelse(result_MS_MN$pval < .05 ,"red","grey"),
  xlab="BaseMean", ylab="FoldChange"
 )
 abline(h=c(-1,1),col="blue")

 ##################################################################
 ### FI_vs_FN #####################################################
 ##################################################################
 result_FI_FN = nbinomTest(cds,"FI","FN")
 result_FI_FN = result_FI_FN[order(result_FI_FN$pval), ]
 #write.csv(result_FI_FN,"/home/bio1/00-NGS//RNAseq//bov//niceM//DESeq/DESeq_FI_FN.csv")
 # Histogram of p-val distribution
 # hist(result_FI_FN$pval, breaks=100, col="skyblue",border="slateblue",main="",xlab="p-value", ylab="Frequency")
 # MA plot
 plot(
  result_FI_FN$baseMean,
  result_FI_FN$log2FoldChange,
  log="x",pch=20,cex=.7,
  col=ifelse(result_FI_FN$pval < .05 ,"red","grey"),
  xlab="BaseMean", ylab="FoldChange"
 )
 abline(h=c(-1,1),col="blue")

 ##################################################################
 ### FI_vs_FS #####################################################
 ##################################################################
 result_FI_FS = nbinomTest(cds,"FI","FS")
 result_FI_FS = result_FI_FS[order(result_FI_FS$pval), ]
 write.csv(result_FI_FS,"/home/bio1/00-NGS//RNAseq//bov//niceM//DESeq/DESeq_FI_FS.csv")
 # Histogram of p-val distribution
 # hist(result_FI_FS$pval, breaks=100, col="skyblue",border="slateblue",main="",xlab="p-value", ylab="Frequency")
 # MA plot
 plot(
  result_FI_FS$baseMean,
  result_FI_FS$log2FoldChange,
  log="x",pch=20,cex=.7,
  col=ifelse(result_FI_FS$pval < .05 ,"red","grey"),
  xlab="BaseMean", ylab="FoldChange"
 )
 abline(h=c(-1,1),col="blue")


 ##################################################################
 ### FS_vs_FN #####################################################
 ##################################################################
 result_FS_FN = nbinomTest(cds,"FS","FN")
 result_FS_FN = result_FS_FN[order(result_FS_FN$pval), ]
 write.csv(result_FS_FN,"/home/bio1/00-NGS//RNAseq//bov//niceM//DESeq/DESeq_FS_FN.csv")
 # Histogram of p-val distribution
 # hist(result_FS_FN$pval, breaks=100, col="skyblue",border="slateblue",main="",xlab="p-value", ylab="Frequency")
 # MA plot
 plot(
  result_FS_FN$baseMean,
  result_FS_FN$log2FoldChange,
  log="x",pch=20,cex=.7,
  col=ifelse(result_FS_FN$pval < .05 ,"red","grey"),
  xlab="BaseMean", ylab="FoldChange"
 )
 abline(h=c(-1,1),col="blue")


 ##################################################################
 ### MI_vs_FI #####################################################
 ##################################################################
 result_MI_FI = nbinomTest(cds,"MI","FI")
 result_MI_FI = result_MI_FI[order(result_MI_FI$pval), ]
 write.csv(result_MI_FI,"/home/bio1/00-NGS//RNAseq//bov//niceM//DESeq/DESeq_MI_FI.csv")
 # Histogram of p-val distribution
 # hist(result_FS_FN$pval, breaks=100, col="skyblue",border="slateblue",main="",xlab="p-value", ylab="Frequency")
 # MA plot
 plot(
  result_MI_FI$baseMean,
  result_MI_FI$log2FoldChange,
  log="x",pch=20,cex=.7,
  col=ifelse(result_MI_FI$pval < .05 ,"red","grey"),
  xlab="BaseMean", ylab="FoldChange"
 )
 abline(h=c(-1,1),col="blue")

 ##################################################################
 ### MN_vs_FN #####################################################
 ##################################################################
 result_MN_FN = nbinomTest(cds,"MN","FN")
 result_MN_FN = result_MN_FN[order(result_MN_FN$pval), ]
 write.csv(result_MN_FN,"/home/bio1/00-NGS//RNAseq//bov//niceM//DESeq/DESeq_MN_FN.csv")
 # Histogram of p-val distribution
 # hist(result_FS_FN$pval, breaks=100, col="skyblue",border="slateblue",main="",xlab="p-value", ylab="Frequency")
 # MA plot
 plot(
  result_MN_FN$baseMean,
  result_MN_FN$log2FoldChange,
  log="x",pch=20,cex=.7,
  col=ifelse(result_MN_FN$pval < .05 ,"red","grey"),
  xlab="BaseMean", ylab="FoldChange"
 )
 abline(h=c(-1,1),col="blue")

 ##################################################################
 ### MS_vs_FS #####################################################
 ##################################################################
 result_MS_FS = nbinomTest(cds,"MS","FS")
 result_MS_FS = result_MS_FS[order(result_MS_FS$pval), ]
 write.csv(result_MS_FS,"/home/bio1/00-NGS//RNAseq//bov//niceM//DESeq/DESeq_MS_FS.csv")
 # Histogram of p-val distribution
 # hist(result_FS_FN$pval, breaks=100, col="skyblue",border="slateblue",main="",xlab="p-value", ylab="Frequency")
 # MA plot
 plot(
  result_MS_FS$baseMean,
  result_MS_FS$log2FoldChange,
  log="x",pch=20,cex=.7,
  col=ifelse(result_MS_FS$pval < .05 ,"red","grey"),
  xlab="BaseMean", ylab="FoldChange"
 )
 abline(h=c(-1,1),col="blue")

 ## END ##########################################################

 #####################################################################
 ## Merge DESeq_results + cntNorm                                   ##
 ## Extract rows with p <0.05                                       ##
 #####################################################################

 cntNorm <- read.csv('~/00-NGS//RNAseq//bov//niceM//DESeq/cntNorm_ALL.csv')
 head(cntNorm)
 nrow(cntNorm)

 # within Males
 MI_MN.merged <- merge(x=result_MI_MN, y=cntNorm, by.x="id", by.y="geneSymbol")
 MI_MN.merged.p05 <- MI_MN.merged[MI_MN.merged$pval < 0.05, ]
 MI_MN.merged.p05 = MI_MN.merged.p05[order(MI_MN.merged.p05$pval), ]
 nrow(MI_MN.merged.p05)
 write.table(MI_MN.merged.p05,"~/00-NGS//RNAseq/bov/niceM/DESeq/MI_MN.merged.p05")

 MI_MS.merged <- merge(x=result_MI_MS, y=cntNorm, by.x="id", by.y="geneSymbol")
 MI_MS.merged.p05 <- MI_MS.merged[MI_MS.merged$pval < 0.05, ]
 MI_MS.merged.p05 = MI_MS.merged.p05[order(MI_MS.merged.p05$pval), ]
 nrow(MI_MS.merged.p05)
 write.table(MI_MS.merged.p05,"~/00-NGS//RNAseq/bov/niceM/DESeq/MI_MS.merged.p05")

 MS_MN.merged <- merge(x=result_MS_MN, y=cntNorm, by.x="id", by.y="geneSymbol")
 MS_MN.merged.p05 <- MS_MN.merged[MS_MN.merged$pval < 0.05, ]
 MS_MN.merged.p05 = MS_MN.merged.p05[order(MS_MN.merged.p05$pval), ]
 nrow(MS_MN.merged.p05)
 write.table(MS_MN.merged.p05,"~/00-NGS//RNAseq/bov/niceM/DESeq/MS_MN.merged.p05")

 # within Females
 FI_FN.merged <- merge(x=result_FI_FN, y=cntNorm, by.x="id", by.y="geneSymbol")
 FI_FN.merged.p05 <- FI_FN.merged[FI_FN.merged$pval < 0.05, ]
 FI_FN.merged.p05 = FI_FN.merged.p05[order(FI_FN.merged.p05$pval), ]
 nrow(FI_FN.merged.p05)
 write.table(FI_FN.merged.p05,"~/00-NGS//RNAseq/bov/niceM/DESeq/FI_FN.merged.p05")

 FI_FS.merged <- merge(x=result_FI_FS, y=cntNorm, by.x="id", by.y="geneSymbol")
 FI_FS.merged.p05 <- FI_FS.merged[FI_FS.merged$pval < 0.05, ]
 FI_FS.merged.p05 = FI_FS.merged.p05[order(FI_FS.merged.p05$pval), ]
 nrow(FI_FS.merged.p05)
 write.table(FI_FS.merged.p05,"~/00-NGS//RNAseq/bov/niceM/DESeq/FI_FS.merged.p05")

 FS_FN.merged <- merge(x=result_FS_FN, y=cntNorm, by.x="id", by.y="geneSymbol")
 FS_FN.merged.p05 <- FS_FN.merged[FS_FN.merged$pval < 0.05, ]
 FS_FN.merged.p05 = FS_FN.merged.p05[order(FS_FN.merged.p05$pval), ]
 nrow(FS_FN.merged.p05)
 write.table(FS_FN.merged.p05,"~/00-NGS//RNAseq/bov/niceM/DESeq/FS_FN.merged.p05")

 ## Male vs Female
 MI_FI.merged <- merge(x=result_MI_FI, y=cntNorm, by.x="id", by.y="geneSymbol")
 MI_FI.merged.p05 <- MI_FI.merged[MI_FI.merged$pval < 0.05, ]
 MI_FI.merged.p05 = MI_FI.merged.p05[order(MI_FI.merged.p05$pval), ]
 nrow(MI_FI.merged.p05)
 write.table(MI_FI.merged.p05,"~/00-NGS//RNAseq/bov/niceM/DESeq/MI_FI.merged.p05")

 MN_FN.merged <- merge(x=result_MN_FN, y=cntNorm, by.x="id", by.y="geneSymbol")
 MN_FN.merged.p05 <- MN_FN.merged[MN_FN.merged$pval < 0.05, ]
 MN_FN.merged.p05 = MN_FN.merged.p05[order(MN_FN.merged.p05$pval), ]
 nrow(MN_FN.merged.p05)
 write.table(MN_FN.merged.p05,"~/00-NGS//RNAseq/bov/niceM/DESeq/MN_FN.merged.p05")

 MS_FS.merged <- merge(x=result_MS_FS, y=cntNorm, by.x="id", by.y="geneSymbol")
 MS_FS.merged.p05 <- MS_FS.merged[MS_FS.merged$pval < 0.05, ]
 MS_FS.merged.p05 = MS_FS.merged.p05[order(MS_FS.merged.p05$pval), ]
 nrow(MS_FS.merged.p05)
 write.table(MS_FS.merged.p05,"~/00-NGS//RNAseq/bov/niceM/DESeq/MS_FS.merged.p05")

 #############################################################
 ##  Male Common DEGs                                       ##
 #############################################################

 MI_MN.MI_MS.merged <- merge(x=result_MI_MN, y=result_MI_MN, by.x="id", by.y="id")
 MI_MN_MS.merged <- merge(x=MI_MN.MI_MS.merged, y=result_MS_MN, by.x="id", by.y="id")
 common_DEG_Male <- MI_MN_MS.merged[MI_MN_MS.merged$pval.x < 0.05, ]
 common_DEG_Male <- common_DEG_Male[common_DEG_Male$pval.y < 0.05, ]
 common_DEG_Male <- common_DEG_Male[common_DEG_Male$pval < 0.05, ]
 write.table(common_DEG_Male,'~/00-NGS//RNAseq//bov//niceM//DESeq/commonDEG_Male_p05.txt')


 #############################################################
 ##  Feale Common DEGs                                      ##
 #############################################################

 FI_FN.FI_FS.merged <- merge(x=result_FI_FN, y=result_FI_FN, by.x="id", by.y="id")
 FI_FN_FS.merged <- merge(x=FI_FN.FI_FS.merged, y=result_FS_FN, by.x="id", by.y="id")
 common_DEG_Female <- FI_FN_FS.merged[FI_FN_FS.merged$pval.x < 0.05, ]
 common_DEG_Female <- common_DEG_Female[common_DEG_Female$pval.y < 0.05, ]
 common_DEG_Female <- common_DEG_Female[common_DEG_Female$pval < 0.05, ]
 write.table(common_DEG_Female,'~/00-NGS//RNAseq//bov//niceM//DESeq/commonDEG_Female_p05.txt')
	library(DESeq)
	library(LSD)
	library(ggplot2)
	library(gridExtra)
	library(RColorBrewer)

	# open a count file
	setwd('/home/bio1/00-NGS/RNAseq/bov/GSE44023_Chitwood_et_al/DESeq')
	setwd('/home/bio1/00-NGS/Cancers/cancer_met_Kyuheum')
	countsTable<- read.csv("final.final.csv",row.names=1)
	countsTable <- read.delim("/home/bio2/00-NGS/Hairpin/Bismark_results/Read1_only/All.genomecov.intersect.txt",row.names=1)
	head(countsTable)
	colnames(countsTable) = c("MEF","P0","P1","P2","P6","J1")
	head(countsTable)
	conds<- factor(c("Biliary","Biliary","Biliary","Biliary","Biliary","Biliary",
	"Other","Other","Other","Other","Other","Other","Other","Other","Other","Other",
	"Other","Other","Other","Other","Other","Other","Other","Other","Other","Other",
	"Other","Other","Other","Other","Other","Other","Other","Other","Other","Other",
	"Other","Other","Other","Other","Other","Other","Other","Other","Other","Other",
	"Other","Other","Other","Other","Other","Other","Other","Other","Other","Other",
	"Other","Other","Other","Other","Other","Other","Other","Other","Other","Other",
	"Other","Other","Other","Other","Other","Other","Other","Other","Other","Other",
	"Other","Other","Other","Other","Other","Other","Other","Other","Other","Other",
	"Other","Other","Other","Other","Other","Other"))
	conds<- factor(c("IVF","IVF","IVF",
	"cSCNT","cSCNT","cSCNT","cSCNT",
	"fSCNT","fSCNT","fSCNT","fSCNT",
	"TSA_IVF","TSA_SCNT","BM","BF"))

	heatboxplot(log2(countsTable[,1:2]+1),ylim=c(-0.3,20), notch=TRUE,boxonly=F,
	ylab = "log2 counts", main = "Count Distribution")


	# Define column names
	#colnames(countsTable) = c("geneSymbol","MI-1","MI-2","MI-3","MI-4","MI-5","MI-6",
	"MN-1","MN-2","MN-3","MN-4","MN-5","MN-6",
	"MS-1","MS-2","MS-3","MS-4","MS-5","MS-6",
	"FI-1","FI-2","FI-3","FI-4","FI-5","FI-6",
	"FN-1","FN-2","FN-3","FN-4","FN-5","FN-6",
	"FS-1","FS-2","FS-3","FS-4","FS-5"
	,"BM","BF")
	#head(countsTable)

	# Define conditons of each column (sample)
	conds<- factor(c("MI","MI","MI","MI","MI","MI",
	"MN","MN","MN","MN","MN","MN",
	"MS","MS","MS","MS","MS","MS",
	"FI","FI","FI","FI","FI","FI",
	"FN","FN","FN","FN","FN","FN",
	"FS","FS","FS","FS","FS",
	"BM","BF"))
	conds<- factor(c("MI","MI","MI","MI","MI","MI",
	"MN","MN","MN","MN","MN","MN",
	"FI","FI","FI","FI","FI","FI",
	"FN","FN","FN","FN","FN","FN",
	"BM","BF"))

	# Set up cds (count data set)
	cds <- newCountDataSet(countsTable,conds)
	head(cds)

	# Estimate sample size of each sample
	cds <- estimateSizeFactors(cds)
	sizeFactors(cds)
	write.table(sizeFactors(cds),'/home/bio1/00-NGS//RNAseq//bov//niceM//DESeq/sizeFactor.txt')
	countsNormalized <- counts(cds,normalized=T)
	head(countsNormalized)
	write.csv(countsNormalized,"cntNorm_Old.TSA.BESF.csv")

	# Heatscatter plots
	heatboxplot(log2(countsNormalized+1),ylim=c(-0.3,20), notch=TRUE,boxonly=F,
	ylab = "log2 counts", main = "Count Normalized Distribution")

	# scatter plot - individuals
	heatscatter(log2(countsNormalized[,36]),log2(countsNormalized[,14]),
	xlab="BM (log2 expression)",ylab="MS_2 (log2 expression)",
	cor = TRUE, method = "spearman",
	xlim=c(-0.5,17),ylim=c(-0.5,17))
	abline(-1,1,col="black",lwd=2)
	abline(1,1,col="black",lwd=2)
	identify(log(countsNormalized[,1]),log(countsNormalized[,2]),labels=names(countsNormalized[,1]))

	####################################################
	# Scatter plots (ggplot2)
	head(countsNormalized)
	p <- qplot(log2(countsNormalized[ ,5]+1),log2(countsNormalized[ ,3]+1),xlab="TSA(+) SCNT", ylab="TSA(-) SCNT")
	p + geom_smooth(method="lm", formula = y ~ x,se=F) + geom_abline(intercept = 0,linetype="dotted") + geom_point(size=2, alpha=0.1) + coord_flip() #+ stat_smooth_func(geom="text",method="lm",hjust=0,parse=TRUE)

	# Stat Smooth Function
	library(proto)
	stat_smooth_func <- function (mapping = NULL, data = NULL, geom = "smooth", position = "identity",
	method = "auto", formula = y ~ x, se = TRUE, n = 80, fullrange = FALSE,
	level = 0.95, na.rm = FALSE, ...) {
	StatSmoothFunc$new(mapping = mapping, data = data, geom = geom, position = position,
	method = method, formula = formula, se = se, n = n, fullrange = fullrange,
	level = level, na.rm = na.rm, ...)
	}

	StatSmoothFunc <- proto(ggplot2:::Stat, {
	objname <- "smooth"

	calculate_groups <- function(., data, scales, method="auto", formula=y~x, ...) {
	rows <- daply(data, .(group), function(df) length(unique(df$x)))

	if (all(rows == 1) && length(rows) > 1) {
	message("geom_smooth: Only one unique x value each group.",
	"Maybe you want aes(group = 1)?")
	return(data.frame())
	}

	# Figure out what type of smoothing to do: loess for small datasets,
	# gam with a cubic regression basis for large data
	# This is based on the size of the _largest_ group.
	if (identical(method, "auto")) {
	groups <- count(data, "group")

	if (max(groups$freq) < 1000) {
	method <- "loess"
	message('geom_smooth: method="auto" and size of largest group is <1000,',
	' so using loess.',
	' Use \'method = x\' to change the smoothing method.')
	} else {
	method <- "gam"
	formula <- y ~ s(x, bs = "cs")
	message('geom_smooth: method="auto" and size of largest group is >=1000,',
	' so using gam with formula: y ~ s(x, bs = "cs").',
	' Use \'method = x\' to change the smoothing method.')
	}
	}
	if (identical(method, "gam")) try_require("mgcv")

	.super$calculate_groups(., data, scales, method = method, formula = formula, ...)
	}

	calculate <- function(., data, scales, method="auto", formula=y~x, se = TRUE, n=80, fullrange=FALSE, xseq = NULL, level=0.95, na.rm = FALSE, ...) {
	data <- remove_missing(data, na.rm, c("x", "y"), name="stat_smooth")
	if (length(unique(data$x)) < 2) {
	# Not enough data to perform fit
	return(data.frame())
	}

	if (is.null(data$weight)) data$weight <- 1

	if (is.null(xseq)) {
	if (is.integer(data$x)) {
	if (fullrange) {
	xseq <- scale_dimension(scales$x, c(0, 0))
	} else {
	xseq <- sort(unique(data$x))
	}
	} else {
	if (fullrange) {
	range <- scale_dimension(scales$x, c(0, 0))
	} else {
	range <- range(data$x, na.rm=TRUE)
	}
	xseq <- seq(range[1], range[2], length=n)
	}
	}
	if (is.character(method)) method <- match.fun(method)

	method.special <- function(...)
	method(formula, data=data, weights=weight, ...)
	model <- safe.call(method.special, list(...), names(formals(method)))

	predictdf(model, xseq, se, level)
	m = model
	eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2,
	list(a = format(coef(m)[1], digits = 3),
	b = format(coef(m)[2], digits = 3),
	r2 = format(summary(m)$r.squared, digits = 3)))
	func_string = as.character(as.expression(eq))

	data.frame(x=min(data$x)0.9, y=max(data$y)0.9, label=func_string)
	}

	required_aes <- c("x", "y")
	default_geom <- function(.) GeomSmooth
	})
	###################################################################################

	# Estimate dispersion
	cds <- estimateDispersions(cds)
	cds <- estimateDispersions(cds, method = "blind", fitType = "local", sharingMode = "fit-only") # for individual comparison

	plotDispEsts(cds)

	#plotPCA
	vsd <- varianceStabilizingTransformation(cds)
	plotPCA(vsd,ntop = 10000)

	# source code for 'plotPCA'
	plotPCA = function(x, intgroup, ntop=500)
	{
	require("lattice")
	require("genefilter")
	rv = rowVars(exprs(x))
	select = order(rv, decreasing=TRUE)[seq_len(ntop)]
	pca = prcomp(t(exprs(x)[select,]))
	fac = factor(apply(pData(vsdFull)[, intgroup], 1, paste, collapse=": "))
	colours = brewer.pal(nlevels(fac), "Paired") # colors of points
	pcafig = xyplot(PC2 ~ PC1, groups=fac, data=as.data.frame(pca$x), pch=16, cex=2, # can change shapes & sizes of points
	aspect = "iso", col=colours,
	main = draw.key(key = list(
	rect = list(col = colours),
	text = list(levels(fac)),
	rep = FALSE)))
	}

	plotPCA(vsd)
	##################################################################
	### MI_vs_MN #####################################################
	##################################################################
	result_MI_MN = nbinomTest(cds,"MI","MN")
	result_MI_MN = result_MI_MN[order(result_MI_MN$pval), ]
	#write.csv(result_MI_MN,"/home/bio1/00-NGS//RNAseq//bov//niceM//DESeq/DESeq_MI_MN.csv")
	## Histogram of p-val distribution
	# hist(result_MI_MN$pval, breaks=100, col="skyblue",border="slateblue",main="",xlab="p-value", ylab="Frequency")
	## MA plot
	with(IVF_TSA, plot(log10(baseMean),log2FoldChange,pch=20,main="MA plot - IVF_TSA",col="lightgrey"))
	with(subset(IVF_TSA, pval<.05), points(log10(baseMean),log2FoldChange,pch=20, col="red"))
	abline(h=c(-1,1),col="blue")

	#volcano plot
	with(IVF_TSA, plot(log2FoldChange,-log10(pval),pch=19,main="Volcano plot - IVF_TSA",col="lightgrey"))
	with(subset(IVF_TSA, pval<.05 & abs(log2FoldChange)>1), points(log2FoldChange, -log10(pval), pch=20, col="red"))
	abline(v=c(-1,1),col="blue")

	# Label points with the textxy function from the calibrate plot
	library(calibrate)
	with(subset(IVF_TSA, -log10(pval)>40 & abs(log2FoldChange)>5), textxy(log2FoldChange, -log10(pval), labs=id, offset=-0.5,cex=.7))

	##################################################################
	### MI_vs_MS #####################################################
	##################################################################
	result_MI_MS = nbinomTest(cds,"MI","MS")
	result_MI_MS = result_MI_MS[order(result_MI_MS$pval), ]
	#write.csv(result_MI_MS,"/home/bio1/00-NGS//RNAseq//bov//niceM//DESeq/DESeq_MI_MS.csv")
	# Histogram of p-val distribution
	# hist(result_MI_MS$pval, breaks=100, col="skyblue",border="slateblue",main="",xlab="p-value", ylab="Frequency")
	# MA plot
	plot(
	result_MI_MS$baseMean,
	result_MI_MS$log2FoldChange,
	log="x",pch=20,cex=.7,
	col=ifelse(result_MI_MS$pval < .05 ,"red","grey"),
	xlab="BaseMean", ylab="FoldChange"
	)
	abline(h=c(-1,1),col="blue")

	##################################################################
	### MS_vs_MN #####################################################
	##################################################################
	result_MS_MN = nbinomTest(cds,"MS","MN")
	result_MS_MN = result_MS_MN[order(result_MS_MN$pval), ]
	#write.csv(result_MS_MN,"/home/bio1/00-NGS//RNAseq//bov//niceM//DESeq/DESeq_MS_MN.csv")
	# Histogram of p-val distribution
	# hist(result_MI_MS$pval, breaks=100, col="skyblue",border="slateblue",main="",xlab="p-value", ylab="Frequency")
	# MA plot
	plot(
	result_MS_MN$baseMean,
	result_MS_MN$log2FoldChange,
	log="x",pch=20,cex=.7,
	col=ifelse(result_MS_MN$pval < .05 ,"red","grey"),
	xlab="BaseMean", ylab="FoldChange"
	)
	abline(h=c(-1,1),col="blue")

	##################################################################
	### FI_vs_FN #####################################################
	##################################################################
	result_FI_FN = nbinomTest(cds,"FI","FN")
	result_FI_FN = result_FI_FN[order(result_FI_FN$pval), ]
	#write.csv(result_FI_FN,"/home/bio1/00-NGS//RNAseq//bov//niceM//DESeq/DESeq_FI_FN.csv")
	# Histogram of p-val distribution
	# hist(result_FI_FN$pval, breaks=100, col="skyblue",border="slateblue",main="",xlab="p-value", ylab="Frequency")
	# MA plot
	plot(
	result_FI_FN$baseMean,
	result_FI_FN$log2FoldChange,
	log="x",pch=20,cex=.7,
	col=ifelse(result_FI_FN$pval < .05 ,"red","grey"),
	xlab="BaseMean", ylab="FoldChange"
	)
	abline(h=c(-1,1),col="blue")

	##################################################################
	### FI_vs_FS #####################################################
	##################################################################
	result_FI_FS = nbinomTest(cds,"FI","FS")
	result_FI_FS = result_FI_FS[order(result_FI_FS$pval), ]
	write.csv(result_FI_FS,"/home/bio1/00-NGS//RNAseq//bov//niceM//DESeq/DESeq_FI_FS.csv")
	# Histogram of p-val distribution
	# hist(result_FI_FS$pval, breaks=100, col="skyblue",border="slateblue",main="",xlab="p-value", ylab="Frequency")
	# MA plot
	plot(
	result_FI_FS$baseMean,
	result_FI_FS$log2FoldChange,
	log="x",pch=20,cex=.7,
	col=ifelse(result_FI_FS$pval < .05 ,"red","grey"),
	xlab="BaseMean", ylab="FoldChange"
	)
	abline(h=c(-1,1),col="blue")


	##################################################################
	### FS_vs_FN #####################################################
	##################################################################
	result_FS_FN = nbinomTest(cds,"FS","FN")
	result_FS_FN = result_FS_FN[order(result_FS_FN$pval), ]
	write.csv(result_FS_FN,"/home/bio1/00-NGS//RNAseq//bov//niceM//DESeq/DESeq_FS_FN.csv")
	# Histogram of p-val distribution
	# hist(result_FS_FN$pval, breaks=100, col="skyblue",border="slateblue",main="",xlab="p-value", ylab="Frequency")
	# MA plot
	plot(
	result_FS_FN$baseMean,
	result_FS_FN$log2FoldChange,
	log="x",pch=20,cex=.7,
	col=ifelse(result_FS_FN$pval < .05 ,"red","grey"),
	xlab="BaseMean", ylab="FoldChange"
	)
	abline(h=c(-1,1),col="blue")


	##################################################################
	### MI_vs_FI #####################################################
	##################################################################
	result_MI_FI = nbinomTest(cds,"MI","FI")
	result_MI_FI = result_MI_FI[order(result_MI_FI$pval), ]
	write.csv(result_MI_FI,"/home/bio1/00-NGS//RNAseq//bov//niceM//DESeq/DESeq_MI_FI.csv")
	# Histogram of p-val distribution
	# hist(result_FS_FN$pval, breaks=100, col="skyblue",border="slateblue",main="",xlab="p-value", ylab="Frequency")
	# MA plot
	plot(
	result_MI_FI$baseMean,
	result_MI_FI$log2FoldChange,
	log="x",pch=20,cex=.7,
	col=ifelse(result_MI_FI$pval < .05 ,"red","grey"),
	xlab="BaseMean", ylab="FoldChange"
	)
	abline(h=c(-1,1),col="blue")

	##################################################################
	### MN_vs_FN #####################################################
	##################################################################
	result_MN_FN = nbinomTest(cds,"MN","FN")
	result_MN_FN = result_MN_FN[order(result_MN_FN$pval), ]
	write.csv(result_MN_FN,"/home/bio1/00-NGS//RNAseq//bov//niceM//DESeq/DESeq_MN_FN.csv")
	# Histogram of p-val distribution
	# hist(result_FS_FN$pval, breaks=100, col="skyblue",border="slateblue",main="",xlab="p-value", ylab="Frequency")
	# MA plot
	plot(
	result_MN_FN$baseMean,
	result_MN_FN$log2FoldChange,
	log="x",pch=20,cex=.7,
	col=ifelse(result_MN_FN$pval < .05 ,"red","grey"),
	xlab="BaseMean", ylab="FoldChange"
	)
	abline(h=c(-1,1),col="blue")

	##################################################################
	### MS_vs_FS #####################################################
	##################################################################
	result_MS_FS = nbinomTest(cds,"MS","FS")
	result_MS_FS = result_MS_FS[order(result_MS_FS$pval), ]
	write.csv(result_MS_FS,"/home/bio1/00-NGS//RNAseq//bov//niceM//DESeq/DESeq_MS_FS.csv")
	# Histogram of p-val distribution
	# hist(result_FS_FN$pval, breaks=100, col="skyblue",border="slateblue",main="",xlab="p-value", ylab="Frequency")
	# MA plot
	plot(
	result_MS_FS$baseMean,
	result_MS_FS$log2FoldChange,
	log="x",pch=20,cex=.7,
	col=ifelse(result_MS_FS$pval < .05 ,"red","grey"),
	xlab="BaseMean", ylab="FoldChange"
	)
	abline(h=c(-1,1),col="blue")

	## END ##########################################################

	#####################################################################
	## Merge DESeq_results + cntNorm ##
	## Extract rows with p <0.05 ##
	#####################################################################

	cntNorm <- read.csv('~/00-NGS//RNAseq//bov//niceM//DESeq/cntNorm_ALL.csv')
	head(cntNorm)
	nrow(cntNorm)

	# within Males
	MI_MN.merged <- merge(x=result_MI_MN, y=cntNorm, by.x="id", by.y="geneSymbol")
	MI_MN.merged.p05 <- MI_MN.merged[MI_MN.merged$pval < 0.05, ]
	MI_MN.merged.p05 = MI_MN.merged.p05[order(MI_MN.merged.p05$pval), ]
	nrow(MI_MN.merged.p05)
	write.table(MI_MN.merged.p05,"~/00-NGS//RNAseq/bov/niceM/DESeq/MI_MN.merged.p05")

	MI_MS.merged <- merge(x=result_MI_MS, y=cntNorm, by.x="id", by.y="geneSymbol")
	MI_MS.merged.p05 <- MI_MS.merged[MI_MS.merged$pval < 0.05, ]
	MI_MS.merged.p05 = MI_MS.merged.p05[order(MI_MS.merged.p05$pval), ]
	nrow(MI_MS.merged.p05)
	write.table(MI_MS.merged.p05,"~/00-NGS//RNAseq/bov/niceM/DESeq/MI_MS.merged.p05")

	MS_MN.merged <- merge(x=result_MS_MN, y=cntNorm, by.x="id", by.y="geneSymbol")
	MS_MN.merged.p05 <- MS_MN.merged[MS_MN.merged$pval < 0.05, ]
	MS_MN.merged.p05 = MS_MN.merged.p05[order(MS_MN.merged.p05$pval), ]
	nrow(MS_MN.merged.p05)
	write.table(MS_MN.merged.p05,"~/00-NGS//RNAseq/bov/niceM/DESeq/MS_MN.merged.p05")

	# within Females
	FI_FN.merged <- merge(x=result_FI_FN, y=cntNorm, by.x="id", by.y="geneSymbol")
	FI_FN.merged.p05 <- FI_FN.merged[FI_FN.merged$pval < 0.05, ]
	FI_FN.merged.p05 = FI_FN.merged.p05[order(FI_FN.merged.p05$pval), ]
	nrow(FI_FN.merged.p05)
	write.table(FI_FN.merged.p05,"~/00-NGS//RNAseq/bov/niceM/DESeq/FI_FN.merged.p05")

	FI_FS.merged <- merge(x=result_FI_FS, y=cntNorm, by.x="id", by.y="geneSymbol")
	FI_FS.merged.p05 <- FI_FS.merged[FI_FS.merged$pval < 0.05, ]
	FI_FS.merged.p05 = FI_FS.merged.p05[order(FI_FS.merged.p05$pval), ]
	nrow(FI_FS.merged.p05)
	write.table(FI_FS.merged.p05,"~/00-NGS//RNAseq/bov/niceM/DESeq/FI_FS.merged.p05")

	FS_FN.merged <- merge(x=result_FS_FN, y=cntNorm, by.x="id", by.y="geneSymbol")
	FS_FN.merged.p05 <- FS_FN.merged[FS_FN.merged$pval < 0.05, ]
	FS_FN.merged.p05 = FS_FN.merged.p05[order(FS_FN.merged.p05$pval), ]
	nrow(FS_FN.merged.p05)
	write.table(FS_FN.merged.p05,"~/00-NGS//RNAseq/bov/niceM/DESeq/FS_FN.merged.p05")

	## Male vs Female
	MI_FI.merged <- merge(x=result_MI_FI, y=cntNorm, by.x="id", by.y="geneSymbol")
	MI_FI.merged.p05 <- MI_FI.merged[MI_FI.merged$pval < 0.05, ]
	MI_FI.merged.p05 = MI_FI.merged.p05[order(MI_FI.merged.p05$pval), ]
	nrow(MI_FI.merged.p05)
	write.table(MI_FI.merged.p05,"~/00-NGS//RNAseq/bov/niceM/DESeq/MI_FI.merged.p05")

	MN_FN.merged <- merge(x=result_MN_FN, y=cntNorm, by.x="id", by.y="geneSymbol")
	MN_FN.merged.p05 <- MN_FN.merged[MN_FN.merged$pval < 0.05, ]
	MN_FN.merged.p05 = MN_FN.merged.p05[order(MN_FN.merged.p05$pval), ]
	nrow(MN_FN.merged.p05)
	write.table(MN_FN.merged.p05,"~/00-NGS//RNAseq/bov/niceM/DESeq/MN_FN.merged.p05")

	MS_FS.merged <- merge(x=result_MS_FS, y=cntNorm, by.x="id", by.y="geneSymbol")
	MS_FS.merged.p05 <- MS_FS.merged[MS_FS.merged$pval < 0.05, ]
	MS_FS.merged.p05 = MS_FS.merged.p05[order(MS_FS.merged.p05$pval), ]
	nrow(MS_FS.merged.p05)
	write.table(MS_FS.merged.p05,"~/00-NGS//RNAseq/bov/niceM/DESeq/MS_FS.merged.p05")

	#############################################################
	## Male Common DEGs ##
	#############################################################

	MI_MN.MI_MS.merged <- merge(x=result_MI_MN, y=result_MI_MN, by.x="id", by.y="id")
	MI_MN_MS.merged <- merge(x=MI_MN.MI_MS.merged, y=result_MS_MN, by.x="id", by.y="id")
	common_DEG_Male <- MI_MN_MS.merged[MI_MN_MS.merged$pval.x < 0.05, ]
	common_DEG_Male <- common_DEG_Male[common_DEG_Male$pval.y < 0.05, ]
	common_DEG_Male <- common_DEG_Male[common_DEG_Male$pval < 0.05, ]
	write.table(common_DEG_Male,'~/00-NGS//RNAseq//bov//niceM//DESeq/commonDEG_Male_p05.txt')


	#############################################################
	## Feale Common DEGs ##
	#############################################################

	FI_FN.FI_FS.merged <- merge(x=result_FI_FN, y=result_FI_FN, by.x="id", by.y="id")
	FI_FN_FS.merged <- merge(x=FI_FN.FI_FS.merged, y=result_FS_FN, by.x="id", by.y="id")
	common_DEG_Female <- FI_FN_FS.merged[FI_FN_FS.merged$pval.x < 0.05, ]
	common_DEG_Female <- common_DEG_Female[common_DEG_Female$pval.y < 0.05, ]
	common_DEG_Female <- common_DEG_Female[common_DEG_Female$pval < 0.05, ]
	write.table(common_DEG_Female,'~/00-NGS//RNAseq//bov//niceM//DESeq/commonDEG_Female_p05.txt')