#include "../external/pthash/external/essentials/include/essentials.hpp"
#include "../include/dictionary.hpp"
#include "../include/query/streaming_query_canonical_parsing.hpp"
#include "../include/util.hpp"
#include <sstream>
namespace piscem {
rob-p /
Created January 4, 2025 03:03
use memchr
fn create_record(line: &str) -> Record {
let finder = memchr::memmem::Finder::new("bc");
let mut iter = line.split('\t').peekable();
let name = iter.peek().unwrap().to_string();
let count = iter
.filter(|s| finder.find(s[1..4].as_bytes()).is_some())
Record::new(name, count)
rob-p /
Created October 26, 2024 14:44
parallel decompression speed test
# This is a rather minimal example of Argbash's potential
# Example taken from
# ARG_OPTIONAL_BOOLEAN([delete-tmp],[],[delete output directory])
# ARG_POSITIONAL_SINGLE([num-threads],[number of threads to run with],[])
# ARG_POSITIONAL_SINGLE([input],[input file list],[])
# ARG_HELP([Test how long it takes to decompress many files in parallel])
use anyhow;
use ndarray::Array2;
use std::fs;
use std::cmp::Ordering;
fn naive_table(text: &str) -> Vec<u32> {
let text = text.as_bytes();
assert!(text.len() <= u32::MAX as usize);
let mut table = vec![0u32; text.len()];
for (i, element) in table.iter_mut().enumerate() {
rob-p /
Created June 5, 2024 15:42
Decompression speeds with some different libraries / interfaces
$ uname -a
Linux fern 6.2.0-33-generic #33~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Sep  7 10:33:52 UTC 2 x86_64 x86_64 x86_64 GNU/Linux


$ zcat data/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr21.recalibrated_variants.vcf.gz|head -c $(( 10 * 1024 * 1024 * 1024 )) | pv >/dev/null
10.0GiB 0:00:47 [ 216MiB/s] [                                                                                                                                                           <=>       ]
rob-p /
Created February 4, 2024 16:16
A simple builder in Rust (using the `derive_builder` crate).
extern crate derive_builder;
#[derive(Default, Builder, Debug)]
struct House {
floors: u32,
rooms: u32,
#[builder(default = "false")]
has_garage: bool,
rob-p /
Last active July 10, 2023 19:56
Rust Iterator Question

Note: This is cross-posted from Reddit.

I've been trying to determine (a) whether it's possible to achieve something in Rust and (b) if so, how. I will try to abstract the problem as much as possible, since the details of this in my code are rather boring and inessential to describing the problem.

I have a program that extracts information from the combination of a Polars data frame and a paired file. This is for a genomics application; for those interested, the data frame contains the locations of sequence features (exons, transcripts, etc.) and the file contains the genome sequence (chromosome by chromosome). The genome is large, so we prefer not to load the whole thing into memory, and instead iterate over it chromosome by chromosome and then feature by feature, yielding each sequence feature one at a time.

It turns out it's relatively simple to write a "single sequence" iterator (i.e. an iterato

use antisequence::*;
fn main() {
// 1{b[9-10]f[CAGAGC]u[8]b[10]}2{r:}
let patterns = r#"
name: anchor
- pattern: "CAGAGC"
rob-p / avx2_lcp2.cpp
Created April 25, 2023 03:13
AVX2 LCP take 2
#include <immintrin.h>
const char* longest_common_prefix(const char* str1, const char* str2, int len) {
int i = 0;
for (; i <= len - 32; i += 32) {
__m256i v1 = _mm256_loadu_si256((__m256i*)(str1 + i));
__m256i v2 = _mm256_loadu_si256((__m256i*)(str2 + i));
__m256i cmp = _mm256_cmpeq_epi8(v1, v2);
int mask = _mm256_movemask_epi8(cmp);
if (mask != 0xFFFFFFFF) {
rob-p / avx2_lcp.cpp
Last active April 25, 2023 03:05
#include <immintrin.h> // for AVX2 intrinsics
#include <cstring> // for strlen
size_t longestCommonPrefixAVX2(const char* str1, const char* str2) {
const size_t len1 = strlen(str1);
const size_t len2 = strlen(str2);
const size_t len = std::min(len1, len2);
const __m256i* p1 = reinterpret_cast<const __m256i*>(str1);
const __m256i* p2 = reinterpret_cast<const __m256i*>(str2);