Created
February 15, 2019 17:18
-
-
Save flying-sheep/ba12b71f0424f84dda9ed188fd48cfcc to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "- https://rpy2.readthedocs.io/en/latest/generated_rst/s4class.html#custom-conversion\n", | |
| "- https://bitbucket.org/rpy2/rpy2/src/default/rpy/robjects/numpy2ri.py\n", | |
| "- https://bitbucket.org/rpy2/rpy2/src/default/rpy/robjects/pandas2ri.py" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import rpy2\n", | |
| "\n", | |
| "# Everything below relies on the rpy2 3.x conversion API, so fail early with a readable message.\n", | |
| "assert rpy2.__version_vector__[0] >= (3, 0), 'rpy2 >= 3.0 is required, found ' + rpy2.__version__" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# anndata2ri.py\n", | |
| "\n", | |
| "import numpy as np\n", | |
| "import pandas as pd\n", | |
| "from anndata import AnnData\n", | |
| "\n", | |
| "from rpy2.rinterface import NULL, Sexp, RTYPES\n", | |
| "from rpy2.robjects import conversion, default_converter, numpy2ri, pandas2ri\n", | |
| "from rpy2.robjects.methods import RS4\n", | |
| "from rpy2.robjects.packages import importr\n", | |
| "from rpy2.robjects.vectors import Matrix, FloatMatrix\n", | |
| "\n", | |
| "\n", | |
| "# Converter that only holds the AnnData-specific rules registered below;\n", | |
| "# activate() copies these rules onto the converter it installs.\n", | |
| "converter = conversion.Converter('original anndata conversion')\n", | |
| "\n", | |
| "\n", | |
| "# Python to R\n", | |
| "\n", | |
| "\n", | |
| "@converter.py2rpy.register(AnnData)\n", | |
| "def py2rpy_anndata(obj) -> RS4:\n", | |
| "    \"\"\"\n", | |
| "    Convert an AnnData object into an R S4 object (not implemented yet).\n", | |
| "\n", | |
| "    Raises:\n", | |
| "        NotImplementedError: always, instead of handing the ``...`` placeholder\n", | |
| "            to the ``RS4`` constructor.\n", | |
| "    \"\"\"\n", | |
| "    # TODO: build the S4 object from obj\n", | |
| "    raise NotImplementedError('Converting AnnData to an R object is not implemented yet')\n", | |
| "\n", | |
| "\n", | |
| "# R to Python\n", | |
| "\n", | |
| "\n", | |
| "# Also hooked into numpy2ri's own rpy2py dispatcher. It is unclear why this is\n", | |
| "# necessary; one would expect the dispatcher to catch this earlier.\n", | |
| "@numpy2ri.rpy2py.register(Sexp)\n", | |
| "@converter.rpy2py.register(Matrix)\n", | |
| "@converter.rpy2py.register(FloatMatrix)\n", | |
| "def rpy2py_matrix_ad(obj):\n", | |
| "    \"\"\"\n", | |
| "    Turn an R vector or matrix into a numpy array, reading through ``memoryview`` when possible.\n", | |
| "\n", | |
| "    The stock matrix conversion turned out to be very slow; going through the\n", | |
| "    memoryview avoids that. R lists (VECSXP) and non-vector types are handed\n", | |
| "    to ``default_converter`` instead.\n", | |
| "    \"\"\"\n", | |
| "    r_type = obj.typeof\n", | |
| "    if r_type not in numpy2ri._vectortypes or r_type == RTYPES.VECSXP:\n", | |
| "        return default_converter.rpy2py(obj)\n", | |
| "\n", | |
| "    buffer_source = obj.memoryview() if hasattr(obj, 'memoryview') else obj\n", | |
| "    return np.asarray(buffer_source)\n", | |
| "\n", | |
| "\n", | |
| "@converter.rpy2py.register(RS4)\n", | |
| "def rpy2py_s4(obj):\n", | |
| "    \"\"\"\n", | |
| "    Dispatch an S4 object on its R class names.\n", | |
| "\n", | |
| "    Objects whose ``rclass`` contains ``DataFrame`` go to ``rpy2py_data_frame``,\n", | |
| "    those containing ``SingleCellExperiment`` go to ``rpy2py_single_cell_experiment``.\n", | |
| "    Every other S4 object is returned unchanged instead of silently becoming ``None``.\n", | |
| "\n", | |
| "    See here for the slots: https://bioconductor.org/packages/release/bioc/vignettes/SingleCellExperiment/inst/doc/intro.html\n", | |
| "    \"\"\"\n", | |
| "    if 'DataFrame' in obj.rclass:\n", | |
| "        return rpy2py_data_frame(obj)\n", | |
| "    if 'SingleCellExperiment' in obj.rclass:\n", | |
| "        return rpy2py_single_cell_experiment(obj)\n", | |
| "    # Not a class handled here: pass the R object through untouched.\n", | |
| "    return obj\n", | |
| "\n", | |
| "\n", | |
| "def rpy2py_data_frame(obj):\n", | |
| "    \"\"\"\n", | |
| "    Convert an S4 DataFrame (not a data.frame) into a ``pandas.DataFrame``.\n", | |
| "\n", | |
| "    Columns come from the ``listData`` slot, the index from the ``rownames`` slot\n", | |
| "    (when it is not NULL). A DataFrame that has row names but no columns is\n", | |
| "    returned as an empty frame carrying that index.\n", | |
| "    \"\"\"\n", | |
| "    columns = {\n", | |
| "        k: converter.rpy2py(v) if isinstance(v, Sexp) else v\n", | |
| "        for k, v in obj.slots['listData'].items()\n", | |
| "    }\n", | |
| "    rownames = obj.slots['rownames']\n", | |
| "    has_rownames = rownames is not NULL\n", | |
| "\n", | |
| "    if not columns:\n", | |
| "        # Without columns, from_dict() yields a zero-row frame and assigning the row names\n", | |
| "        # afterwards fails with 'ValueError: Length mismatch'. Build the frame on the index instead.\n", | |
| "        return pd.DataFrame(index=rownames) if has_rownames else pd.DataFrame()\n", | |
| "\n", | |
| "    res = pd.DataFrame.from_dict(columns)\n", | |
| "    if has_rownames:\n", | |
| "        res.index = rownames\n", | |
| "    return res\n", | |
| "\n", | |
| "\n", | |
| "def rpy2py_single_cell_experiment(obj):\n", | |
| "    \"\"\"\n", | |
| "    Convert an R SingleCellExperiment into an ``AnnData`` object.\n", | |
| "\n", | |
| "    The first assay becomes the expression matrix and the remaining assays become\n", | |
| "    layers (each assay is transposed). ``colData`` becomes ``obs`` and ``rowData``\n", | |
| "    becomes ``var``. If ``rowData`` cannot be converted, the traceback is printed\n", | |
| "    and ``var`` is left as ``None``.\n", | |
| "    \"\"\"\n", | |
| "    se = importr('SummarizedExperiment')\n", | |
| "    # Result is not used below; the call is kept as in the original,\n", | |
| "    # presumably so that the R package is loaded.\n", | |
| "    importr('SingleCellExperiment')\n", | |
| "\n", | |
| "    assay_names = se.assayNames(obj)\n", | |
| "    # Also treat an empty name vector as 'no assays', otherwise assays[0] raises IndexError.\n", | |
| "    if assay_names is not NULL and len(assay_names) > 0:\n", | |
| "        # The assays can be stored in an env or elsewise so we don't use obj.slots['assays']\n", | |
| "        assays = [se.assay(obj, str(n)).T for n in assay_names]\n", | |
| "        exprs, layers = assays[0], dict(zip(assay_names[1:], assays[1:]))\n", | |
| "        assert len(exprs.shape) == 2, exprs.shape\n", | |
| "    else:\n", | |
| "        # There are SingleCellExperiments with no assays\n", | |
| "        exprs, layers = None, {}\n", | |
| "\n", | |
| "    obs = converter.rpy2py(se.colData(obj))\n", | |
| "    assert isinstance(obs, pd.DataFrame), type(obs)\n", | |
| "\n", | |
| "    try:\n", | |
| "        # TODO: why does this happen? The recorded run fails here with\n", | |
| "        # 'ValueError: Length mismatch' raised while setting the row names.\n", | |
| "        var = converter.rpy2py(se.rowData(obj))\n", | |
| "        assert isinstance(var, pd.DataFrame), type(var)\n", | |
| "    except Exception:\n", | |
| "        # Deliberately best effort: report the problem and continue without var.\n", | |
| "        import traceback\n", | |
| "        traceback.print_exc()\n", | |
| "        var = None\n", | |
| "\n", | |
| "    # TODO: se.metadata, se.dimnames\n", | |
| "\n", | |
| "    return AnnData(exprs, obs, var, layers=layers)\n", | |
| "\n", | |
| "\n", | |
| "# Activation / deactivation\n", | |
| "\n", | |
| "\n", | |
| "# Re-executing the module/cell must not wipe a converter that activate() has saved:\n", | |
| "# keep the current value if the name already exists, otherwise start with None.\n", | |
| "original_converter = globals().get('original_converter')\n", | |
| "\n", | |
| "\n", | |
| "def activate():\n", | |
| "    \"\"\"\n", | |
| "    Install a converter that layers the rules registered on ``converter`` over the pandas conversion.\n", | |
| "\n", | |
| "    Does nothing when already active. The converter that was active before is\n", | |
| "    stored in ``original_converter`` so that ``deactivate()`` can restore it.\n", | |
| "    \"\"\"\n", | |
| "    global original_converter\n", | |
| "    if original_converter is not None:\n", | |
| "        return\n", | |
| "\n", | |
| "    previous_converter = conversion.converter\n", | |
| "\n", | |
| "    # Switch the pandas conversion on just long enough to use it as a template.\n", | |
| "    pandas2ri.activate()\n", | |
| "    try:\n", | |
| "        new_converter = conversion.Converter('anndata conversion', template=conversion.converter)\n", | |
| "    finally:\n", | |
| "        # Switch it off again even if building the converter failed.\n", | |
| "        pandas2ri.deactivate()\n", | |
| "\n", | |
| "    # Copy our registrations in both directions; the `object` entry is skipped\n", | |
| "    # so the template's fallback handler stays in place.\n", | |
| "    for direction in ('py2rpy', 'rpy2py'):\n", | |
| "        target = getattr(new_converter, direction)\n", | |
| "        for k, v in getattr(converter, direction).registry.items():\n", | |
| "            if k is not object:\n", | |
| "                target.register(k, v)\n", | |
| "\n", | |
| "    conversion.set_conversion(new_converter)\n", | |
| "    # Only mark the module as active once the switch has actually happened.\n", | |
| "    original_converter = previous_converter\n", | |
| "\n", | |
| "\n", | |
| "def deactivate():\n", | |
| "    \"\"\"\n", | |
| "    Put back the converter stored in ``original_converter``; a no-op when nothing is stored.\n", | |
| "    \"\"\"\n", | |
| "    global original_converter\n", | |
| "    if original_converter is None:\n", | |
| "        return\n", | |
| "\n", | |
| "    conversion.set_conversion(original_converter)\n", | |
| "    original_converter = None" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "SCE Slots : int_metadata reducedDims int_elementMetadata int_colData rowRanges colData assays NAMES elementMetadata metadata class\n", | |
| "DataFrame Slots: rownames nrows listData elementType elementMetadata metadata class\n", | |
| "\n", | |
| "SE Module: Assays KALLISTO_ASSAYS SummarizedExperiment assay assayNames assays colData f geneRangeMapper identicalVals makeSummarizedExperimentFromDataFrame makeSummarizedExperimentFromExpressionSet makeSummarizedExperimentFromLoom naiveRangeMapper new_SummarizedExperiment probeRangeMapper readKallisto readKallistoBootstrap rowData\n", | |
| "SCE Module: GET_FUN LinearEmbeddingMatrix SET_FUN SingleCellExperiment as_matrix_LinearEmbeddingMatrix clearSizeFactors clearSpikes cpm factorData featureLoadings int_colData int_elementMetadata int_metadata isSpike logcounts normcounts objectVersion reducedDim reducedDimNames reducedDims sampleFactors scat sig sizeFactorNames spikeNames tpm\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "from rpy2.robjects import r\n", | |
| "from rpy2.robjects.packages import importr\n", | |
| "\n", | |
| "# Load the R packages and the example data set\n", | |
| "se = importr('SummarizedExperiment')\n", | |
| "sce = importr('SingleCellExperiment')\n", | |
| "sc_rna_seq = importr('scRNAseq')\n", | |
| "r.data('allen')\n", | |
| "\n", | |
| "ex_empty = sce.SingleCellExperiment()\n", | |
| "ex_allen = r['as'](r.allen, 'SingleCellExperiment')\n", | |
| "\n", | |
| "\n", | |
| "def _public_names(module):\n", | |
| "    \"\"\"List the names in dir(module) that neither start with '_' nor end in '<-'.\"\"\"\n", | |
| "    return [name for name in dir(module) if name[0] != '_' and name[-2:] != '<-']\n", | |
| "\n", | |
| "\n", | |
| "# Things to work with\n", | |
| "_obs_allen = se.colData(ex_allen)\n", | |
| "print('SCE Slots :', *ex_empty.slots.keys())\n", | |
| "print('DataFrame Slots:', *_obs_allen.slots.keys())\n", | |
| "print()\n", | |
| "print('SE Module:', *_public_names(se))\n", | |
| "print('SCE Module:', *_public_names(sce))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "anndata conversion ('numpy conversion', 'snapshot before pandas conversion', 'anndata conversion')\n", | |
| "AnnData object with n_obs × n_vars = 0 × 0 \n", | |
| "AnnData object with n_obs × n_vars = 379 × 20908 \n", | |
| " obs: 'NREADS', 'NALIGNED', 'RALIGN', 'TOTAL_DUP', 'PRIMER', 'PCT_RIBOSOMAL_BASES', 'PCT_CODING_BASES', 'PCT_UTR_BASES', 'PCT_INTRONIC_BASES', 'PCT_INTERGENIC_BASES', 'PCT_MRNA_BASES', 'MEDIAN_CV_COVERAGE', 'MEDIAN_5PRIME_BIAS', 'MEDIAN_3PRIME_BIAS', 'MEDIAN_5PRIME_TO_3PRIME_BIAS', 'driver_1_s', 'dissection_s', 'Core.Type', 'Primary.Type', 'Secondary.Type', 'Animal.ID', 'passes_qc_checks_s'\n", | |
| " layers: 'cufflinks_fpkm', 'rsem_counts', 'rsem_tpm'\n" | |
| ] | |
| }, | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "Traceback (most recent call last):\n", | |
| " File \"<ipython-input-2-d30490b2b06f>\", line 88, in rpy2py_single_cell_experiment\n", | |
| " var = converter.rpy2py(se.rowData(obj))\n", | |
| " File \"/usr/lib/python3.7/functools.py\", line 824, in wrapper\n", | |
| " return dispatch(args[0].__class__)(*args, **kw)\n", | |
| " File \"<ipython-input-2-d30490b2b06f>\", line 52, in rpy2py_s4\n", | |
| " return rpy2py_data_frame(obj)\n", | |
| " File \"<ipython-input-2-d30490b2b06f>\", line 67, in rpy2py_data_frame\n", | |
| " res.index = rownames\n", | |
| " File \"/usr/lib/python3.7/site-packages/pandas/core/generic.py\", line 5080, in __setattr__\n", | |
| " return object.__setattr__(self, name, value)\n", | |
| " File \"pandas/_libs/properties.pyx\", line 69, in pandas._libs.properties.AxisProperty.__set__\n", | |
| " File \"/usr/lib/python3.7/site-packages/pandas/core/generic.py\", line 638, in _set_axis\n", | |
| " self._data.set_axis(axis, labels)\n", | |
| " File \"/usr/lib/python3.7/site-packages/pandas/core/internals/managers.py\", line 155, in set_axis\n", | |
| " 'values have {new} elements'.format(old=old_len, new=new_len))\n", | |
| "ValueError: Length mismatch: Expected axis has 0 elements, new values have 20908 elements\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# During development the conversion is rebuilt from scratch on every run\n", | |
| "deactivate()\n", | |
| "assert conversion.converter.name == 'base empty converter'\n", | |
| "activate()\n", | |
| "\n", | |
| "active_converter = conversion.converter\n", | |
| "print(active_converter.name, active_converter.lineage)\n", | |
| "\n", | |
| "# Convert both example objects and show the resulting AnnData summaries\n", | |
| "for ex in (ex_empty, ex_allen):\n", | |
| "    ad = converter.rpy2py(ex)\n", | |
| "    print(ad)" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.7.2" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 2 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment