Skip to content

Instantly share code, notes, and snippets.

@flying-sheep
Created February 15, 2019 17:18
Show Gist options
  • Select an option

  • Save flying-sheep/ba12b71f0424f84dda9ed188fd48cfcc to your computer and use it in GitHub Desktop.

Select an option

Save flying-sheep/ba12b71f0424f84dda9ed188fd48cfcc to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- https://rpy2.readthedocs.io/en/latest/generated_rst/s4class.html#custom-conversion\n",
"- https://bitbucket.org/rpy2/rpy2/src/default/rpy/robjects/numpy2ri.py\n",
"- https://bitbucket.org/rpy2/rpy2/src/default/rpy/robjects/pandas2ri.py"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import rpy2\n",
"assert rpy2.__version_vector__[0] >= (3, 0)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# anndata2ri.py\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"from anndata import AnnData\n",
"\n",
"from rpy2.robjects import conversion, default_converter, numpy2ri, pandas2ri\n",
"from rpy2.rinterface import NULL, Sexp, RTYPES\n",
"from rpy2.robjects.vectors import Matrix, FloatMatrix\n",
"from rpy2.robjects.methods import RS4\n",
"\n",
"\n",
"converter = conversion.Converter('original anndata conversion')\n",
"\n",
"\n",
"# Python to R\n",
"\n",
"\n",
"@converter.py2rpy.register(AnnData)\n",
"def py2rpy_anndata(obj) -> RS4:\n",
" # TODO\n",
" return RS4(...)\n",
"\n",
"\n",
"# R to Python\n",
"\n",
"\n",
"@numpy2ri.rpy2py.register(Sexp) # No idea why this is necessary. I’d think the dispatcher catches this earlier\n",
"@converter.rpy2py.register(Matrix)\n",
"@converter.rpy2py.register(FloatMatrix)\n",
"def rpy2py_matrix_ad(obj):\n",
" \"\"\"\n",
" For some reason the original matrix conversion is extremely slow.\n",
" Going through the object’s memoryview, when available, fixes that.\n",
" \"\"\"\n",
" if obj.typeof in numpy2ri._vectortypes and obj.typeof != RTYPES.VECSXP:\n",
" if hasattr(obj, 'memoryview'):\n",
" res = np.asarray(obj.memoryview())\n",
" else:\n",
" res = np.asarray(obj)\n",
" else:\n",
" res = default_converter.rpy2py(obj)\n",
" return res\n",
"\n",
"\n",
"@converter.rpy2py.register(RS4)\n",
"def rpy2py_s4(obj):\n",
" \"\"\"\n",
" See here for the slots: https://bioconductor.org/packages/release/bioc/vignettes/SingleCellExperiment/inst/doc/intro.html\n",
" \"\"\"\n",
" if 'DataFrame' in obj.rclass:\n",
" return rpy2py_data_frame(obj)\n",
" elif 'SingleCellExperiment' in obj.rclass:\n",
" return rpy2py_single_cell_experiment(obj)\n",
"\n",
"\n",
"def rpy2py_data_frame(obj):\n",
" \"\"\"\n",
" S4 DataFrame class, not data.frame\n",
" \"\"\"\n",
" res = pd.DataFrame.from_dict({\n",
" k: converter.rpy2py(v) if isinstance(v, Sexp) else v\n",
" for k, v in obj.slots['listData'].items()\n",
" })\n",
" rownames = obj.slots['rownames']\n",
" if rownames is not NULL:\n",
" res.index = rownames\n",
" return res\n",
"\n",
"\n",
"def rpy2py_single_cell_experiment(obj):\n",
" se = importr('SummarizedExperiment')\n",
" sce = importr('SingleCellExperiment')\n",
" \n",
" assay_names = se.assayNames(obj)\n",
" if assay_names is not NULL:\n",
" # The assays can be stored in an environment or in some other way, so we don’t use obj.slots['assays']\n",
" assays = [se.assay(obj, str(n)).T for n in assay_names]\n",
" # There are SingleCellExperiment objects with no assays\n",
" exprs, layers = assays[0], dict(zip(assay_names[1:], assays[1:]))\n",
" assert len(exprs.shape) == 2, exprs.shape\n",
" else:\n",
" exprs, layers = None, {}\n",
" \n",
" obs = converter.rpy2py(se.colData(obj))\n",
" assert isinstance(obs, pd.DataFrame), type(obs)\n",
" try: # TODO: why does this fail? (observed: a length-mismatch ValueError when rpy2py_data_frame sets the index)\n",
" var = converter.rpy2py(se.rowData(obj))\n",
" assert isinstance(var, pd.DataFrame), type(var)\n",
" except Exception as e:\n",
" import traceback\n",
" traceback.print_exc()\n",
" var = None\n",
" \n",
" # TODO: se.metadata, se.dimnames\n",
" \n",
" return AnnData(exprs, obs, var, layers=layers)\n",
"\n",
"\n",
"# Activation / deactivation\n",
"\n",
"\n",
"# Guard so that re-executing the module/cell does not reset this variable\n",
"if 'original_converter' not in globals():\n",
" original_converter = None\n",
"\n",
"\n",
"def activate():\n",
" global original_converter\n",
" if original_converter is not None:\n",
" return\n",
"\n",
" original_converter = conversion.converter\n",
" pandas2ri.activate()\n",
" new_converter = conversion.Converter('anndata conversion', template=conversion.converter)\n",
" pandas2ri.deactivate()\n",
"\n",
" for k, v in converter.py2rpy.registry.items():\n",
" if k is not object:\n",
" new_converter.py2rpy.register(k, v)\n",
"\n",
" for k, v in converter.rpy2py.registry.items():\n",
" if k is not object:\n",
" new_converter.rpy2py.register(k, v)\n",
"\n",
" conversion.set_conversion(new_converter)\n",
"\n",
"\n",
"def deactivate():\n",
" global original_converter\n",
" if original_converter is not None:\n",
" conversion.set_conversion(original_converter)\n",
" original_converter = None"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"SCE Slots : int_metadata reducedDims int_elementMetadata int_colData rowRanges colData assays NAMES elementMetadata metadata class\n",
"DataFrame Slots: rownames nrows listData elementType elementMetadata metadata class\n",
"\n",
"SE Module: Assays KALLISTO_ASSAYS SummarizedExperiment assay assayNames assays colData f geneRangeMapper identicalVals makeSummarizedExperimentFromDataFrame makeSummarizedExperimentFromExpressionSet makeSummarizedExperimentFromLoom naiveRangeMapper new_SummarizedExperiment probeRangeMapper readKallisto readKallistoBootstrap rowData\n",
"SCE Module: GET_FUN LinearEmbeddingMatrix SET_FUN SingleCellExperiment as_matrix_LinearEmbeddingMatrix clearSizeFactors clearSpikes cpm factorData featureLoadings int_colData int_elementMetadata int_metadata isSpike logcounts normcounts objectVersion reducedDim reducedDimNames reducedDims sampleFactors scat sig sizeFactorNames spikeNames tpm\n"
]
}
],
"source": [
"from rpy2.robjects import r\n",
"from rpy2.robjects.packages import importr\n",
"\n",
"# data\n",
"se = importr('SummarizedExperiment')\n",
"sce = importr('SingleCellExperiment')\n",
"sc_rna_seq = importr('scRNAseq')\n",
"r.data('allen')\n",
"\n",
"ex_empty = sce.SingleCellExperiment()\n",
"ex_allen = r['as'](r.allen, 'SingleCellExperiment')\n",
"\n",
"# Things to work with\n",
"_obs_allen = se.colData(ex_allen)\n",
"print('SCE Slots :', *ex_empty.slots.keys())\n",
"print('DataFrame Slots:', *_obs_allen.slots.keys())\n",
"print()\n",
"print('SE Module:', *(f for f in dir(se) if not f[0] == '_' and not f[-2:] == '<-'))\n",
"print('SCE Module:', *(f for f in dir(sce) if not f[0] == '_' and not f[-2:] == '<-'))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"anndata conversion ('numpy conversion', 'snapshot before pandas conversion', 'anndata conversion')\n",
"AnnData object with n_obs × n_vars = 0 × 0 \n",
"AnnData object with n_obs × n_vars = 379 × 20908 \n",
" obs: 'NREADS', 'NALIGNED', 'RALIGN', 'TOTAL_DUP', 'PRIMER', 'PCT_RIBOSOMAL_BASES', 'PCT_CODING_BASES', 'PCT_UTR_BASES', 'PCT_INTRONIC_BASES', 'PCT_INTERGENIC_BASES', 'PCT_MRNA_BASES', 'MEDIAN_CV_COVERAGE', 'MEDIAN_5PRIME_BIAS', 'MEDIAN_3PRIME_BIAS', 'MEDIAN_5PRIME_TO_3PRIME_BIAS', 'driver_1_s', 'dissection_s', 'Core.Type', 'Primary.Type', 'Secondary.Type', 'Animal.ID', 'passes_qc_checks_s'\n",
" layers: 'cufflinks_fpkm', 'rsem_counts', 'rsem_tpm'\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Traceback (most recent call last):\n",
" File \"<ipython-input-2-d30490b2b06f>\", line 88, in rpy2py_single_cell_experiment\n",
" var = converter.rpy2py(se.rowData(obj))\n",
" File \"/usr/lib/python3.7/functools.py\", line 824, in wrapper\n",
" return dispatch(args[0].__class__)(*args, **kw)\n",
" File \"<ipython-input-2-d30490b2b06f>\", line 52, in rpy2py_s4\n",
" return rpy2py_data_frame(obj)\n",
" File \"<ipython-input-2-d30490b2b06f>\", line 67, in rpy2py_data_frame\n",
" res.index = rownames\n",
" File \"/usr/lib/python3.7/site-packages/pandas/core/generic.py\", line 5080, in __setattr__\n",
" return object.__setattr__(self, name, value)\n",
" File \"pandas/_libs/properties.pyx\", line 69, in pandas._libs.properties.AxisProperty.__set__\n",
" File \"/usr/lib/python3.7/site-packages/pandas/core/generic.py\", line 638, in _set_axis\n",
" self._data.set_axis(axis, labels)\n",
" File \"/usr/lib/python3.7/site-packages/pandas/core/internals/managers.py\", line 155, in set_axis\n",
" 'values have {new} elements'.format(old=old_len, new=new_len))\n",
"ValueError: Length mismatch: Expected axis has 0 elements, new values have 20908 elements\n"
]
}
],
"source": [
"# For development, let’s reset the active converter on every run\n",
"deactivate()\n",
"assert conversion.converter.name == 'base empty converter'\n",
"activate()\n",
"print(conversion.converter.name, conversion.converter.lineage)#, *conversion.converter.rpy2py.registry.items(), sep='\\n')\n",
"\n",
"for ex in [ex_empty, ex_allen]:\n",
" ad = converter.rpy2py(ex)\n",
" print(ad)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment