Last active
October 21, 2024 07:50
-
-
Save slowkow/d3c4b77c9bf2a75f6dad4843d7d3aefc to your computer and use it in GitHub Desktop.
Read Cell Ranger HDF5 .h5 files in R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# install.packages(c("Matrix", "rhdf5", "tidyverse")) | |
library(Matrix) | |
library(rhdf5) | |
library(tidyverse) | |
library(glue) | |
my_h5_files <- Sys.glob( | |
"path/to/cellranger-per-channel/output/*/filtered_feature_bc_matrix.h5" | |
) | |
my_counts_list <- lapply(my_h5_files, function(my_h5_file) { | |
# Each cell gets a barcode nucleotide sequence. | |
# e.g. "AAACCTGAGACCGGAT-1" | |
my_barcodes <- as.character(h5read(my_h5_file, "matrix/barcodes")) | |
# We'll assume that the name of the folder containing the .h5 file | |
# is the name of the channel (sample). | |
my_channel <- basename(dirname(my_h5_file)) | |
# Prepend the sample name to the barcodes, so we can safely | |
# merge multiple samples into a single matrix. | |
my_barcodes <- sprintf("%s|%s", my_channel, my_barcodes) | |
# Read the data into a dgCMatrix (sparse matrix). | |
h5 <- h5read(my_h5_file, "matrix") | |
counts <- sparseMatrix( | |
dims = h5$shape, | |
i = as.numeric(h5$indices), | |
p = as.numeric(h5$indptr), | |
x = as.numeric(h5$data), | |
index1 = FALSE | |
) | |
colnames(counts) <- my_barcodes | |
rownames(counts) <- as.data.frame(h5[["features"]])$id | |
return(counts) | |
}) | |
# Make sure all the count matrices have the same dimensions | |
stopifnot(length(unique(lapply(my_counts_list, dim))) == 1) | |
# Concatenate the matrices into one big matrix. | |
counts <- do.call(cbind, my_counts_list) | |
rm(my_counts_list) | |
# Compute the sparsity of the matrix. | |
sparsity <- 1 - length(counts@x) / (as.numeric(nrow(counts)) * ncol(counts)) | |
n_genes <- nrow(counts) | |
n_cells <- ncol(counts) | |
print(glue("Counts matrix has {comma(n_genes)} genes and {comma(n_cells)} cells")) | |
print(glue("Counts matrix is {signif(100 * sparsity, 3)}% sparse")) | |
# We'll also want to get the corresponding gene information from the .h5 file. | |
genes <- h5read(my_h5_files[1], "matrix/features") | |
genes <- tibble(ensembl_id = genes$id, symbol = genes$name) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment