I have reused the code enough to make a package out of it.
The python package is deposited at Github. with an example in Jupyter notebook.
| import pandas as pd | |
| import rpy2.robjects as robjects | |
| from rpy2.robjects import pandas2ri, Formula | |
| pandas2ri.activate() | |
| from rpy2.robjects.packages import importr | |
| deseq = importr('DESeq2') | |
| ''' | |
| Adopted from: https://stackoverflow.com/questions/41821100/running-deseq2-through-rpy2 | |
| ''' | |
| to_dataframe = robjects.r('function(x) data.frame(x)') | |
| class py_DESeq2: | |
| ''' | |
| DESeq2 object through rpy2 | |
| input: | |
| count_matrix: should be a pandas dataframe with each column as count, and a id column for gene id | |
| example: | |
| id sampleA sampleB | |
| geneA 5 1 | |
| geneB 4 5 | |
| geneC 1 2 | |
| design_matrix: an design matrix in the form of pandas dataframe, see DESeq2 manual, samplenames as rownames | |
| treatment | |
| sampleA1 A | |
| sampleA2 A | |
| sampleB1 B | |
| sampleB2 B | |
| design_formula: see DESeq2 manual, example: "~ treatment"" | |
| gene_column: column name of gene id columns, exmplae "id" | |
| ''' | |
| def __init__(self, count_matrix, design_matrix, design_formula, gene_column='id'): | |
| try: | |
| assert gene_column in count_matrix.columns, 'Wrong gene id column name' | |
| gene_id = count_matrix[gene_column] | |
| except AttributeError: | |
| sys.exit('Wrong Pandas dataframe?') | |
| self.dds = None | |
| self.deseq_result = None | |
| self.resLFC = None | |
| self.comparison = None | |
| self.normalized_count_matrix = None | |
| self.gene_column = gene_column | |
| self.gene_id = count_matrix[self.gene_column] | |
| self.count_matrix = pandas2ri.py2ri(count_matrix.drop(gene_column,axis=1)) | |
| self.design_matrix = pandas2ri.py2ri(design_matrix) | |
| self.design_formula = Formula(design_formula) | |
| def run_deseq(self, **kwargs): | |
| self.dds = deseq.DESeqDataSetFromMatrix(countData=self.count_matrix, | |
| colData=self.design_matrix, | |
| design=self.design_formula) | |
| self.dds = deseq.DESeq(self.dds, **kwargs) | |
| self.normalized_count_matrix = deseq.counts(self.dds, normalized=True) | |
| def get_deseq_result(self, **kwargs): | |
| self.comparison = deseq.resultsNames(self.dds) | |
| self.deseq_result = deseq.results(self.dds, **kwargs) | |
| self.deseq_result = to_dataframe(self.deseq_result) | |
| self.deseq_result = pandas2ri.ri2py(self.deseq_result) ## back to pandas dataframe | |
| self.deseq_result[self.gene_column] = self.gene_id.values |
| from __future__ import print_function | |
| import pandas as pd | |
| import rpy2.robjects as robjects | |
| from rpy2.robjects import pandas2ri, Formula | |
| pandas2ri.activate() | |
| from rpy2.robjects.packages import importr | |
| dexseq = importr('DEXSeq') | |
| bp = importr('BiocParallel') | |
| ''' | |
| Adopted from: https://stackoverflow.com/questions/41821100/running-deseq2-through-rpy2 | |
| ''' | |
| to_dataframe = robjects.r('function(x) data.frame(x)') | |
| class py_DEXSeq: | |
| ''' | |
| DEXSeq2 object through rpy2 | |
| input: | |
| count_matrix: should be a pandas dataframe with each column as count, and a id column for exon id | |
| example: | |
| exonID sampleA sampleB | |
| geneA 5 1 | |
| geneB 4 5 | |
| geneC 1 2 | |
| design_matrix: an design matrix in the form of pandas dataframe, see DESeq2 manual, samplenames as rownames | |
| treatment | |
| sampleA1 A | |
| sampleA2 A | |
| sampleB1 B | |
| sampleB2 B | |
| design_formula: see DEXSeq manual, example: "~ sample + exon + exon:treatment"" | |
| feature_column: column name of exon id columns, example "id" | |
| var_column: will pass to fitExpToVar for DEXSeq exon fold change | |
| exons: exon id | |
| genes: gene id for dexseq grouping | |
| threads: number of threads to use | |
| ''' | |
| def __init__(self, count_matrix, design_matrix, design_formula, | |
| feature_column='id', var_column = 'condition', | |
| exons=None, genes=None, threads=1): | |
| try: | |
| assert feature_column in count_matrix.columns, 'Wrong gene id column name' | |
| assert var_column in design_matrix.columns, 'Wrong var column for DEXSeq' | |
| except AttributeError: | |
| sys.exit('Wrong Pandas dataframe?') | |
| self.dxd = None | |
| self.dxd_res = None | |
| self.dexseq_result = None | |
| self.comparison = None | |
| self.normalized_count_matrix = None | |
| self.feature_column = feature_column | |
| self.exons = exons | |
| self.genes = genes | |
| self.gene_id = count_matrix[self.feature_column] | |
| self.count_matrix = pandas2ri.py2ri(count_matrix.drop(feature_column,axis=1)) | |
| self.design_matrix = pandas2ri.py2ri(design_matrix) | |
| self.design_formula = Formula(design_formula) | |
| self.BPPARAM = bp.MulticoreParam(workers=threads) | |
| self.var_column = var_column | |
| def run_dexseq(self, **kwargs): | |
| self.dxd = dexseq.DEXSeqDataSet(countData=self.count_matrix, | |
| sampleData=self.design_matrix, | |
| design=self.design_formula, | |
| featureID = self.exons, | |
| groupID = self.genes) | |
| print('Constructed DXD object') | |
| self.dxd = dexseq.estimateSizeFactors_DEXSeqDataSet(self.dxd) | |
| self.dxd = dexseq.estimateDispersions_DEXSeqDataSet(self.dxd, BPPARAM=self.BPPARAM) | |
| print('Starting DEXSeq test') | |
| self.dxd = dexseq.testForDEU(self.dxd, BPPARAM=self.BPPARAM) | |
| self.dxd = dexseq.estimateExonFoldChanges(self.dxd, | |
| fitExpToVar=self.var_column, | |
| BPPARAM=self.BPPARAM) | |
| print('Finished DEXSeq fold change') | |
| def get_dexseq_result(self, **kwargs): | |
| self.dexseq_result = to_dataframe(dexseq.DEXSeqResults(self.dxd), **kwargs) | |
| self.dexseq_result = pandas2ri.ri2py(self.dexseq_result) ## back to pandas dataframe | |
| self.dexseq_result['exons'] = self.exons | |
| self.dexseq_result['genes'] = self.genes | |
| self.dexseq_result.drop('genomicData', axis=1, inplace=True) | |
| def normalized_count(self): | |
| self.normalized_count_matrix = dexseq.counts(self.dxd,normalized=True) | |
| return normalized_count_matrix |
Hi @borisstojilkovic, I'm sorry it didn't work for you out-of-the-box! I'm not exactly sure what the problem is from the error message, but I did spot some differences between the gist version and the current implementation here that eliminate the use of resultsNames in get_deseq_result. The gist version here is a little out-of-date, maybe go ahead and use the maintained code in diffexpr would help! please let me know if that works, I'd be interested whether it works on a windows machine!
Hi, I tried to run your code (deseq) on example data, but it seems that I am getting an error. Do you know how to solve this?