Skip to content

Instantly share code, notes, and snippets.

@bicycle1885
Created January 14, 2020 08:53
Show Gist options
  • Select an option

  • Save bicycle1885/8fe674ea085a7414ea4d6417be0a4a75 to your computer and use it in GitHub Desktop.

Select an option

Save bicycle1885/8fe674ea085a7414ea4d6417be0a4a75 to your computer and use it in GitHub Desktop.
Utilities to read output files of Alevin.
# Utilities to read output files of Alevin.
# -----------------------------------------
#
# Copyright 2020 Kenta Sato
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
using SparseArrays: sparse
using CodecZlib: GzipDecompressorStream
"""
    load_alevin(path::AbstractString)

Load Alevin's output located at `path`.

`path` is a directory that contains the `alevin` directory, which contains the
quantification result files (e.g., `quants_mat.gz`). The return value is a
named tuple with the following elements:

1. `rows`: cell barcodes
2. `cols`: transcript names
3. `whitelist`: cell barcodes that pass the whitelist criteria
4. `quants_mat`: quantification result (sparse matrix of `Float32`)
5. `quants_tier_mat`: quantification tier (sparse matrix of `UInt8`)
"""
function load_alevin(path::AbstractString)
    # Every file consumed below lives in the `alevin` subdirectory of `path`.
    outdir = joinpath(path, "alevin")
    readlabels(name) = readlines(joinpath(outdir, name))

    rows = readlabels("quants_mat_rows.txt")
    cols = readlabels("quants_mat_cols.txt")
    whitelist = readlabels("whitelist.txt")

    # The matrix readers are given their dimensions from the label counts.
    nrows = length(rows)
    ncols = length(cols)

    # Open `name` through a gzip-decompressing stream and pass it to `reader`.
    function decode(reader, name)
        return open(GzipDecompressorStream, joinpath(outdir, name)) do stream
            reader(stream, nrows, ncols)
        end
    end

    quants_mat = decode(read_quants_mat, "quants_mat.gz")
    quants_tier_mat = decode(read_quants_tier_mat, "quants_tier_mat.gz")

    return (
        rows = rows,
        cols = cols,
        whitelist = whitelist,
        quants_mat = quants_mat,
        quants_tier_mat = quants_tier_mat,
    )
end
"""
    read_quants_mat(input::IO, m::Int, n::Int)

Read an `m`-by-`n` sparse matrix of `Float32` values from `input` by delegating to
`read_quants_sparse`.
"""
function read_quants_mat(input::IO, m::Int, n::Int)
    return read_quants_sparse(Float32, input, m, n)
end
"""
    read_quants_tier_mat(input::IO, m::Int, n::Int)

Read an `m`-by-`n` sparse matrix of `UInt8` values from `input` by delegating to
`read_quants_sparse`.
"""
function read_quants_tier_mat(input::IO, m::Int, n::Int)
    return read_quants_sparse(UInt8, input, m, n)
end
"""
    read_quants_sparse(::Type{T}, input::IO, m::Int, n::Int) where T

Read `m` rows from `input` and assemble them into an `m`-by-`n` sparse matrix with
element type `T`.

Each row is stored as a header of `cld(n, 8)` bytes followed by one value of type `T`
for every bit set in that header. The header is decoded by `parse_header!`, and the
position of each set bit is used as the column index of the corresponding value.

Values are read with `read!`, so they are interpreted in the host's byte order, and
truncated input surfaces as whatever error `read!` raises (normally `EOFError`).
"""
function read_quants_sparse(::Type{T}, input::IO, m::Int, n::Int) where T
    I = Int[]
    J = Int[]
    V = T[]
    # The header has the same size for every row and `read!` overwrites it in full,
    # so a single buffer is allocated once and reused.
    header = Vector{UInt8}(undef, cld(n, 8))
    for i in 1:m
        read!(input, header)
        n_expressed = parse_header!(J, header)
        # Every value of this row shares row index `i`; append it lazily instead of
        # materializing a temporary array.
        append!(I, Iterators.repeated(i, n_expressed))
        # `read!` fills the whole buffer (or throws), so no zero-initialization is needed.
        data = Vector{T}(undef, n_expressed)
        read!(input, data)
        append!(V, data)
    end
    return sparse(I, J, V, m, n)
end
"""
    parse_header!(cols::Vector{Int}, header::Vector{UInt8})

Append to `cols` the 1-based position of every set bit in `header` and return the
number of positions appended.

Bits are numbered across the whole of `header`, starting from the most significant
bit of the first byte: bit 1 is `header[1] & 0x80`, bit 8 is `header[1] & 0x01`, bit 9
is `header[2] & 0x80`, and so on. All eight bits of every byte are examined.
"""
function parse_header!(cols::Vector{Int}, header::Vector{UInt8})
    appended = 0
    for (k, byte) in enumerate(header)
        # Position of the bit just before this byte's most significant bit.
        offset = 8 * (k - 1)
        for bit in 1:8
            # `bit == 1` selects the most significant bit, `bit == 8` the least.
            if byte & (0x80 >> (bit - 1)) != 0
                push!(cols, offset + bit)
                appended += 1
            end
        end
    end
    return appended
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment