Skip to content

Instantly share code, notes, and snippets.

View wdecoster's full-sized avatar
🐗
Fork me, and then just push me, until I get your, contribution

Wouter De Coster wdecoster

🐗
Fork me, and then just push me, until I get your, contribution
View GitHub Profile
library(ggfortify)
GenesAndModules <- as.data.frame(t(rbind(datExpr, module=moduleColors)))
modules = unique(GenesAndModules[,c("module")])
for (mod in modules) {
moduledata <- subset(GenesAndModules, module == mod)
moduledata <- t(as.data.frame(lapply(moduledata[, !names(moduledata) %in% c("symbol", "module")], as.numeric)))
Traits$threegroups <- as.factor(Traits$threegroups)
levels(Traits$threegroups) <- c("CON", "uPAT", "kPAT")
pca <- autoplot(prcomp(moduledata), data = Traits, colour='threegroups', frame = TRUE, frame.type = 'norm') + ggtitle(mod)
#Make venn diagram out of two or three lists
#wdecoster
import sys, os, collections, matplotlib, subprocess
matplotlib.use('Agg')
from matplotlib_venn import venn3, venn2
import matplotlib.pyplot as plt
if '-h=1' in sys.argv:
header=True
#wdecoster
import sys, os
from Bio import SeqIO
def createDict(tsvfile):
'''Input is a tsv file and returned is a dictionary with key=chromosome and value=list of tuples with positions'''
if os.path.isfile(tsvfile):
chrdict = { 'chr1' : [], 'chr2' : [], 'chr3' : [], 'chr4' : [], 'chr5' : [], 'chr6' : [],
'chr7' : [], 'chr8' : [], 'chr9' : [], 'chr10' : [],'chr11' : [], 'chr12' : [],
from Bio import Entrez
Entrez.email = "youremail@example.com" # Always tell NCBI who you are
import sys
with open(sys.argv[1]) as genefile:
genes = [line.strip() for line in genefile.readlines() if not line == ""]
for gene in genes:
handle = Entrez.esearch(db='gene', term=gene + '[sym] AND "Homo sapiens"[porgn]')
result = Entrez.read(handle)
import sys
from Bio import SeqIO
def findstart(sequence):
"""
Takes a sequence, locates the start codon, returns the rest of the sequence split by codons
"""
try: #We are not sure a start codon is present, therefore I use the try-except construct to catch potential errors without crashing the script
start = sequence.index("ATG") #the string method .index() returns the first position of "ATG" if any is present, if not it gives a ValueError
pseudoCDS = sequence[start:] #Create a new string starting from the start codon
#Convert gene identifiers in a countfile to those mined from a gtf file, swapping gene_id to gene_name attribute
#Execute as `python renameCountIdentifiers.py yourannotation.gtf yourcountfile.txt`
#Question for which this was written: https://www.biostars.org/p/221278/
#Did I mention I like dict comprehensions, thereby sacrificing code readability?
#wdecoster
import sys
def gtf2dict(gtfF):
"""
import sys, os, glob
from Bio import SeqIO
def getfiles():
files = glob.glob("*.csv") + glob.glob(".tfa")
print("Found {} files to filter".format(len(files)))
return(files)
def filter(inputfile):
countAll = 0
@wdecoster
wdecoster / DEA.R
Last active December 9, 2016 17:26
#!/usr/bin/Rscript
#Script to automate differential expression analysis using one of the following R algorithms: DESeq2, edgeR, DEXSeq or limma
#Based on manuals, pieces of code found on the internet and helpful comments of colleagues
###Required input is:###
#1) Either a matrix of counts (features*samples) with features (genes) on lines and samples on columns OR a directory of bam files to use featureCounts on
#2) A sampleInfo File matching the samples containing at least the fields 'condition' and 'gender'. Reference level is 'CON'
######
#Author: wdecoster
#Email: wouter.decoster AT molgen DOT vib-ua DOT be
#Usage: python Script2Extract.py valuesfile.txt intervalsfile.txt
#Written for https://www.biostars.org/p/228655/
#For now unclear whether intervals have to be inclusive or exclusive begin and end position
#wdecoster
import sys
with open(sys.argv[1]) as data:
#datadict = {line.split('\t')[1] : (line.split('\t')[2], line.strip().split('\t')[3]) for line in data if not line == ""} #Create a dictionary mapping the position to the values
datadict = {}
library(ggplot2)
dups = read.table("coordinates.duplicates1")
names(dups) <- c("X", "Y")
uniq = read.table("coordinates.unique1")
names(uniq) <- names(dups)
df <- rbind(cbind(group= "unique", uniq), cbind(group= "dups", dups))
scatter <-ggplot(data=df, aes(x=X, y=Y)) + geom_point(aes(colour = group), size=0.1)
ggsave(file="DuplicatesNextSeq_Scatter.jpeg", scatter)