Paul Masurel (fulmicoton)

from heapq import heappush, heappop
"""
Our internal analytics system relies heavily on producing sorted streams of
(key: string, payload: bytearray).
In what follows, I will ignore the payloads, as they are not important here.
We will only consider a stream of sorted keys.
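
For context, here is a minimal sketch (my own, not part of the gist) of the kind of k-way merge these heapq imports are typically used for, assuming each input stream is an iterator yielding already-sorted keys; the function name and signature are hypothetical:

from heapq import heappush, heappop

def merge_sorted_key_streams(streams):
    # Seed the heap with the first key of each non-empty stream.
    heap = []
    for stream_id, stream in enumerate(streams):
        it = iter(stream)
        first = next(it, None)
        if first is not None:
            heappush(heap, (first, stream_id, it))
    # Repeatedly emit the smallest pending key and refill from its stream.
    while heap:
        key, stream_id, it = heappop(heap)
        yield key
        nxt = next(it, None)
        if nxt is not None:
            heappush(heap, (nxt, stream_id, it))

Using the unique stream index as a tie-breaker means equal keys never force a comparison of the iterator objects themselves.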
fulmicoton / fstbug.rs
Created June 26, 2017 05:18
Reproduction with the fst crate
#![feature(alloc_system)]
extern crate alloc_system;
extern crate fst;
use fst::set::*;

struct A<'a> {
    stream: StreamBuilder<'a>,
}

impl<'a> A<'a> {
#[derive(Default)]
pub struct Token {
    pub offset_from: usize,
    pub offset_to: usize,
    pub position: usize,
    pub term: String,
}

pub trait TokenStream {
fulmicoton / branchless.asm
Last active May 22, 2017 02:39
branchless binary search
.type _ZN8rust_out13binary_search17hd76cf76916892235E,@function
_ZN8rust_out13binary_search17hd76cf76916892235E:
.cfi_startproc
movl 256(%rdi), %ecx
movl $64, %edx
xorl %eax, %eax
cmpl %ecx, %esi
cmovgeq %rdx, %rax
movl 128(%rdi,%rax,4), %ecx
leaq 32(%rax), %rdx
cmpl %ecx, %esi

let mut start = 0;
let mut end = NUM_DOCS_PER_BLOCK;
let mut half = NUM_DOCS_PER_BLOCK / 2;
for _ in 0..7 {
    let mid = start + half;
    let doc = block_docs[mid];
    // Both arms are plain data selects, which the compiler can lower to
    // conditional moves (see the cmov in the assembly above).
    (start, end) = if doc <= target {
        (mid, end)
    } else {
        (start, mid)
    };
    half = (end - start) / 2;
}
use tantivy::*;
use tantivy::postings::SkipResult;

// Position a postings cursor on the document and read its term frequency.
fn extract_termfreq(searcher: &Searcher, term: &Term, doc_address: &DocAddress) -> Option<u32> {
    searcher
        .segment_reader(doc_address.segment_ord() as usize)
        .read_postings(term, SegmentPostingsOption::Freq)
        .and_then(|mut postings| {
            if postings.skip_next(doc_address.doc()) == SkipResult::Reached {
                Some(postings.term_freq())
            } else {
                None
            }
        })
}
import re
from collections import Counter, defaultdict
import hashlib
import time
from multiprocessing import Pool

WHITESPACE = re.compile(r"\W+")

def iter_shingles(desc):
    md5 = hashlib.md5()
import random
from collections import defaultdict
from copy import deepcopy
import itertools
from math import log

c = 1.22179

def make_str_to_edge(salts, n_vertices):
import random

def rotate(circle):
    # Move the first element of the circle to the end.
    return circle[1:] + circle[:1]

def _swap(circle_list, i, j):
    # Swap the elements at positions i and j in place.
    circle_list[i], circle_list[j] = circle_list[j], circle_list[i]
import re
from collections import Counter, defaultdict
import hashlib
import time

WHITESPACE = re.compile(r"\W+")

def iter_shingles(desc):
    md5 = hashlib.md5()