Skip to content

Instantly share code, notes, and snippets.

View mdozmorov's full-sized avatar

Mikhail Dozmorov mdozmorov

View GitHub Profile
# PCA: Check for batch effects. Select one batch, to color points by its assignment
pca <- mtx %>% varFilter(., var.cutoff = 0.75) %>% scale %>% t %>% prcomp
colorby <- "Group" # covariates[2]
pt <- ggplot(data = data.frame(pca$x, annot),
aes(x = as.numeric(PC1), y = as.numeric(PC2), label = Sample)) +
theme(plot.title = element_text(lineheight = 0.8, face="bold")) +
ggtitle(paste("PCA with batch, coloring by ", colorby)) +
geom_point(aes(color = eval(parse(text = colorby))), size = 3) +
geom_text_repel(colour = "black", size = 3) +
@mdozmorov
mdozmorov / check_phred.sh
Created May 18, 2019 01:55
Check Phred offset
# https://www.biostars.org/p/63225/
FILE=VLI9_AA_S60_L008_R1_001.fastq.gz
zcat $FILE | head -n 40 | awk '{if(NR%4==0) printf("%s",$0);}' | od -A n -t u1 | awk 'BEGIN{min=100;max=0;}{for(i=1;i<=NF;i++) {if($i>max) max=$i; if($i<min) min=$i;}}END{if(max<=74 && min<59) print "Phred+33"; else if(max>73 && min>=64) print "Phred+64"; else if(min>=59 && min<64 && max>73) print "Solexa+64"; else print "Unknown score encoding";}'
# Exploratory data analysis of SCSig collection: Signatures of Single Cell Identities
# http://www.gsea-msigdb.org/gsea/msigdb/supplementary_genesets.jsp
library(dplyr)
library(clusterProfiler)
# Read in gene sets
mtx <- read.gmt("http://software.broadinstitute.org/gsea/msigdb/supplemental/scsig.all.v1.0.symbols.gmt")
# Number of unique gene signatures
mtx$ont %>% unique() %>% length()
# Summary statistics on size of gene signatures
mtx %>% group_by(ont) %>% summarise(size = n()) %>% select(size) %>% summary()
# Tesseract Intro: https://cran.r-project.org/web/packages/tesseract/vignettes/intro.html
# Image source: https://twitter.com/davidsuter_epfl/status/1141959559654846464?s=20
library(tesseract)
eng <- tesseract("eng")
text <- tesseract::ocr("https://pbs.twimg.com/media/D9kOESnW4AEs90l?format=jpg&name=900x900", engine = eng)
cat(text)
@mdozmorov
mdozmorov / liftOver.R
Created November 27, 2020 01:33
How to liftOver Paired BED data
# How to liftOver Paired BED data
library(dplyr)
library(tidyr)
library(readxl)
library(rtracklayer)
# Landscape of Cohesin-Mediated Chromatin Loops in the Human Genome
# https://www.nature.com/articles/s41586-020-2151-x
# Supplementary Table 4 | Pan-cell type cohesin-mediated chromatin loops, hg19 coordinates, paired BED data
url1 <- "https://static-content.springer.com/esm/art%3A10.1038%2Fs41586-020-2151-x/MediaObjects/41586_2020_2151_MOESM5_ESM.xlsx"
@mdozmorov
mdozmorov / gist_T2T_excluderanges.R
Last active September 19, 2022 19:59
T2T excluderanges download
# Download a list of problematic regions (aka blacklist) for the T2T-CHM13
# telomere-to-telomere human genome assembly. Defined by the Boyle-Lab/Blacklist
# software, High Signal and Low Mappability regions.
# See https://github.com/dozmorovlab/excluderanges for more information.
suppressMessages(library(httr)) # https://CRAN.R-project.org/package=httr
suppressMessages(library(GenomicRanges)) # https://bioconductor.org/packages/GenomicRanges/
# bedbase_id
bedbase_id <- "6548a002754cc1e882035293541b59a8"
# Construct output file name
@mdozmorov
mdozmorov / gist_mm39_excluderanges.R
Created September 19, 2022 23:39
mm39 excluderanges download
# Download a list of problematic regions (aka blacklist) for the GRCm39/mm39
# mouse genome assembly. Defined by the Boyle-Lab/Blacklist
# software, High Signal and Low Mappability regions.
# See https://github.com/dozmorovlab/excluderanges for more information.
suppressMessages(library(httr)) # https://CRAN.R-project.org/package=httr
suppressMessages(library(GenomicRanges)) # https://bioconductor.org/packages/GenomicRanges/
# bedbase_id
bedbase_id <- "edc716833d4b5ee75c34a0692fc353d5"
# Construct output file name