robert-king · July 26, 2024 08:54
diff --git a/cheating_detection.rs b/cheating_detection.rs
 /*
 - Blazing fast, highly accurate cheating detection algorithm
 - by Robert King, https://www.linkedin.com/in/robertkingnz/
 - Youtube walkthrough: https://youtu.be/CnIQkIseLGs
 - X https://x.com/robertkingNZ

 - BACKGROUND
 - The Google Code Jam team presented their 90% accurate algorithm after google code jam 2021.
 - During the contest, Rob discovered a ~100% accurate algorithm and so thought it was worth sharing.

 - THE PROBLEM (https://zibada.guru/gcj/2021qr/problems/#E):
 - A cheater flips a coin, if it's heads, they cheat, if it's tails they don't.
 - if they cheat their answer will be correct.
 - how can we catch the cheater?
 - Solutions need to be efficient as there's 100_000_000 quiz results.
 - Question difficulty and player skill level is evenly distributed
 - given boolean matrix of quiz results
 - matrix[i][j] == 1 with probability sigmoid(skill[i]-difficulty[j])


 - INNOVATIVE SOLUTION: (try solve it yourself first)
 - We can see the difficulty of a question by looking at percent of errors.
 - We know the cheater didn't cheat on the problems they got wrong
 - Therefore we can estimate a player's skill by looking at the average difficulty of the problems they got wrong.
 - with this proxy for skill, we can rank players exactly by skill (due to monotonically increasing curves)
 - e.g. if x < y, f(x) < f(y), so our sorting is accurate
 - after ranking by skill, we can compare a cheater to their neighbours to detect anomalies
 - the anomaly to look for is: the average difficulty of problems the cheater got correct is much higher relative to their neighbours
 - This is because cheating is more likely to flip results on harder problems, so it moves the average difficulty of correct answers higher.
 - we calculate this anomaly (cheating enhancement) for all players, and pick the max.


 - HIGH PERFORMANCE CODE:
 - Processes 50MB file in 0.26 seconds (100_000_000 quiz results) (my fast go lang implementation in the contest takes 9 seconds!)
 - Processes 1GB file in 0.4 seconds (1_000_000_000 quiz results) (just on my laptop)
 - uses a pool of buffers and a pool of ndarrays which are allocated up front and reused.
 - BufReader reads file at max speed in main thread, reading entire matrix at a time
 - As soon as a matrix is read, the next matrix is read (concurrently the previous buffer is processed by another thread)
 - pool items are CachePadded to the length of a cache line for efficient parallel computation.
 - the threadpool converts the buffer to an ndarray and passes the buffer back using sync_channel.
 - ndarrays are manipulated in place using SIMD, i.e. dot products, sums, inversions
 - SIMD allows a single CPU instruction to process multiple data points simultaneously
 - ndarray is passed back to the pool
 - matrix isn't large enough to make GPU acceleration worthwhile
 - no need to check for newline characters, these are treated as zero column in matrix and dont affect results.
 */

 use crossbeam_utils::CachePadded;
 use ndarray;
 use std::fs::File;
 use std::io::{BufRead, BufReader, Read};
 use std::sync::mpsc::{channel, sync_channel};
 use threadpool::ThreadPool;

 const N: usize = 100; // num people (per test case)
 const M: usize = 10_000; // num questions (per test case)
 const NDARRAY_POOL_SIZE: usize = 20;
 const BUFF_POOL_SIZE: usize = 5;
 const N_WORKERS: usize = 20;

 fn read_metadata(reader: &mut BufReader<File>) -> usize {
    let mut buf = String::new();
    reader.read_line(&mut buf).unwrap();
    let t: usize = buf.trim_end().parse().unwrap();
    buf.clear();
    reader.read_line(&mut buf).unwrap();
    let _accuracy: usize = buf.trim().parse().unwrap();
    buf.clear();
    t
 }

 #[derive(Default, Clone, Debug)]
 struct Person {
    id: usize,
    avg_right: f32, // on average how difficult were the problems they got right
    avg_wrong: f32, // on average how difficult were the problems they got wrong
    predicted_avg_right: f32, // ranked by avg_wrong, predict avg_right using neighbours
 }

 impl Person {
    fn cheating_enhancement(&self) -> f32 {
        // the avg difficulty of what they got right
        // minus
        // the avg difficulty of what their neighbours (ranked by average difficulty of what they got wrong) got right
        self.avg_right - self.predicted_avg_right
    }
 }

 fn main() {
    // let f = File::open("test_data/sample_test_set_1/sample_ts1_input.txt").unwrap();
    // let f = File::open("test_data/test_set_2/ts2_input.txt").unwrap();
    let f = File::open("tst.txt").unwrap();
    let mut reader = BufReader::new(f);
    let (matrix_tx, matrix_rx) = sync_channel(NDARRAY_POOL_SIZE);
    for matrix in
        vec![CachePadded::new(ndarray::Array2::<f32>::default((N, M + 1))); NDARRAY_POOL_SIZE]
    {
        matrix_tx.send(matrix).unwrap();
    }
    let (buff_tx, buff_rx) = sync_channel(BUFF_POOL_SIZE);
    for buff in vec![CachePadded::new(vec![0; N * (M + 1)]); BUFF_POOL_SIZE] {
        buff_tx.send(buff).unwrap();
    }
    let (result_tx, result_rx) = channel();
    let thread_pool = ThreadPool::new(N_WORKERS);
    let test_cases = read_metadata(&mut reader);
    for case_num in 1..=test_cases {
        let mut buf = buff_rx.recv().unwrap();
        reader.read_exact(buf.as_mut_slice()).unwrap();
        let buff_tx = buff_tx.clone();
        let matrix_tx = matrix_tx.clone();
        let result_tx = result_tx.clone();
        let mut matrix = matrix_rx.recv().unwrap();
        thread_pool.execute(move || {
            for (x, y) in matrix.iter_mut().zip(buf.iter()) {
                // y could be '1' | '0' | '\n'
                *x = y.saturating_sub(b'0') as f32;
            }
            let _ = buff_tx.send(buf); // done with buf
            let column_sums = matrix.sum_axis(ndarray::Axis(0)).map(|x| 1.0 - x) / (N as f32);
            let avg_right_dot_products: Vec<f32> = matrix
                .rows()
                .into_iter()
                .map(|row| row.dot(&column_sums) / row.sum())
                .collect();
            let avg_wrong_dot_products: Vec<f32> = matrix
                .rows()
                .into_iter()
                .map(|row| row.map(|x| 1.0 - *x).dot(&column_sums) / row.map(|x| 1.0 - *x).sum())
                .collect();
            let _ = matrix_tx.send(matrix); // done with matrix
            let mut ppl = vec![Person::default(); N];
            for i in 0..N {
                ppl[i].id = i + 1;
                ppl[i].avg_right = avg_right_dot_products[i];
                ppl[i].avg_wrong = avg_wrong_dot_products[i];
            }
            ppl.sort_by(|a, b| a.avg_wrong.partial_cmp(&b.avg_wrong).unwrap());
            for i in 0..N {
                let a = i.saturating_sub(2);
                let b = (i + 3).min(N);
                for i2 in a..b {
                    if i2 != i {
                        ppl[i].predicted_avg_right += ppl[i2].avg_right;
                    }
                }
                ppl[i].predicted_avg_right /= (b - a - 1) as f32;
            }
            let cheater = ppl
                .into_iter()
                .max_by(|a, b| {
                    a.cheating_enhancement()
                        .partial_cmp(&b.cheating_enhancement())
                        .unwrap()
                })
                .unwrap();
            result_tx.send((case_num, cheater.id)).unwrap();
        });
    }
    drop(result_tx);
    let mut results = vec![];
    while let Ok(res) = result_rx.recv() {
        results.push(res);
    }
    results.sort();
    let out = results
        .into_iter()
        .map(|(case_num, id)| format!("Case #{case_num}: {id}").to_string())
        .collect::<Vec<String>>()
        .join("\n");
    println!("{out}");
 }
diff --git a/generate_test_file.rs b/generate_test_file.rs
 use rand::{thread_rng, Rng};

 fn sigmoid(x: f64) -> f64 {
    1.0 / (1.0 + (-x).exp())
 }
 fn main() {
    let test_cases = 50usize;
    let cheat_rate = 0.5;
    let mut t = thread_rng();
    println!("{test_cases}");
    println!("86");
    for _ in 0..test_cases {
        let mut ppl = vec![0.0; 100];
        for p in &mut ppl {
            *p = t.gen_range(-3.0..=3.0);
        }
        let mut questions = vec![0.0; 10_000];
        for q in &mut questions {
            *q = t.gen_range(-3.0..3.0);
        }
        let mut matrix = vec![vec!["0"; 10_000]; 100];
        for i in 0..100 {
            for j in 0..10_000 {
                if t.gen_bool(sigmoid(ppl[i] - questions[j])) {
                    matrix[i][j] = "1";
                }
                if i == 0 {
                    // 0 is the cheater
                    if t.gen_bool(cheat_rate) {
                        // cheat
                        matrix[i][j] = "1";
                    }
                }
            }
        }

        for row in matrix {
            println!("{}", row.join(""));
        }
    }
 }
	/*
	- Blazing fast, highly accurate cheating detection algorithm
	- by Robert King, https://www.linkedin.com/in/robertkingnz/
	- Youtube walkthrough: https://youtu.be/CnIQkIseLGs
	- X https://x.com/robertkingNZ

	- BACKGROUND
	- The Google Code Jam team presented their 90% accurate algorithm after google code jam 2021.
	- During the contest, Rob discovered a ~100% accurate algorithm and so thought it was worth sharing.

	- THE PROBLEM (https://zibada.guru/gcj/2021qr/problems/#E):
	- A cheater flips a coin, if it's heads, they cheat, if it's tails they don't.
	- if they cheat their answer will be correct.
	- how can we catch the cheater?
	- Solutions need to be efficient as there's 100_000_000 quiz results.
	- Question difficulty and player skill level is evenly distributed
	- given boolean matrix of quiz results
	- matrix[i][j] == 1 with probability sigmoid(skill[i]-difficulty[j])


	- INNOVATIVE SOLUTION: (try solve it yourself first)
	- We can see the difficulty of a question by looking at percent of errors.
	- We know the cheater didn't cheat on the problems they got wrong
	- Therefore we can estimate a player's skill by looking at the average difficulty of the problems they got wrong.
	- with this proxy for skill, we can rank players exactly by skill (due to monotonically increasing curves)
	- e.g. if x < y, f(x) < f(y), so our sorting is accurate
	- after ranking by skill, we can compare a cheater to their neighbours to detect anomalies
	- the anomaly to look for is: the average difficulty of problems the cheater got correct is much higher relative to their neighbours
	- This is because cheating is more likely to flip results on harder problems, so it moves the average difficulty of correct answers higher.
	- we calculate this anomaly (cheating enhancement) for all players, and pick the max.


	- HIGH PERFORMANCE CODE:
	- Processes 50MB file in 0.26 seconds (100_000_000 quiz results) (my fast go lang implementation in the contest takes 9 seconds!)
	- Processes 1GB file in 0.4 seconds (1_000_000_000 quiz results) (just on my laptop)
	- uses a pool of buffers and a pool of ndarrays which are allocated up front and reused.
	- BufReader reads file at max speed in main thread, reading entire matrix at a time
	- As soon as a matrix is read, the next matrix is read (concurrently the previous buffer is processed by another thread)
	- pool items are CachePadded to the length of a cache line for efficient parallel computation.
	- the threadpool converts the buffer to an ndarray and passes the buffer back using sync_channel.
	- ndarrays are manipulated in place using SIMD, i.e. dot products, sums, inversions
	- SIMD allows a single CPU instruction to process multiple data points simultaneously
	- ndarray is passed back to the pool
	- matrix isn't large enough to make GPU acceleration worthwhile
	- no need to check for newline characters, these are treated as zero column in matrix and dont affect results.
	*/

	use crossbeam_utils::CachePadded;
	use ndarray;
	use std::fs::File;
	use std::io::{BufRead, BufReader, Read};
	use std::sync::mpsc::{channel, sync_channel};
	use threadpool::ThreadPool;

	const N: usize = 100; // num people (per test case)
	const M: usize = 10_000; // num questions (per test case)
	const NDARRAY_POOL_SIZE: usize = 20;
	const BUFF_POOL_SIZE: usize = 5;
	const N_WORKERS: usize = 20;

	fn read_metadata(reader: &mut BufReader<File>) -> usize {
	let mut buf = String::new();
	reader.read_line(&mut buf).unwrap();
	let t: usize = buf.trim_end().parse().unwrap();
	buf.clear();
	reader.read_line(&mut buf).unwrap();
	let _accuracy: usize = buf.trim().parse().unwrap();
	buf.clear();
	t
	}

	#[derive(Default, Clone, Debug)]
	struct Person {
	id: usize,
	avg_right: f32, // on average how difficult were the problems they got right
	avg_wrong: f32, // on average how difficult were the problems they got wrong
	predicted_avg_right: f32, // ranked by avg_wrong, predict avg_right using neighbours
	}

	impl Person {
	fn cheating_enhancement(&self) -> f32 {
	// the avg difficulty of what they got right
	// minus
	// the avg difficulty of what their neighbours (ranked by average difficulty of what they got wrong) got right
	self.avg_right - self.predicted_avg_right
	}
	}

	fn main() {
	// let f = File::open("test_data/sample_test_set_1/sample_ts1_input.txt").unwrap();
	// let f = File::open("test_data/test_set_2/ts2_input.txt").unwrap();
	let f = File::open("tst.txt").unwrap();
	let mut reader = BufReader::new(f);
	let (matrix_tx, matrix_rx) = sync_channel(NDARRAY_POOL_SIZE);
	for matrix in
	vec![CachePadded::new(ndarray::Array2::<f32>::default((N, M + 1))); NDARRAY_POOL_SIZE]
	{
	matrix_tx.send(matrix).unwrap();
	}
	let (buff_tx, buff_rx) = sync_channel(BUFF_POOL_SIZE);
	for buff in vec![CachePadded::new(vec![0; N * (M + 1)]); BUFF_POOL_SIZE] {
	buff_tx.send(buff).unwrap();
	}
	let (result_tx, result_rx) = channel();
	let thread_pool = ThreadPool::new(N_WORKERS);
	let test_cases = read_metadata(&mut reader);
	for case_num in 1..=test_cases {
	let mut buf = buff_rx.recv().unwrap();
	reader.read_exact(buf.as_mut_slice()).unwrap();
	let buff_tx = buff_tx.clone();
	let matrix_tx = matrix_tx.clone();
	let result_tx = result_tx.clone();
	let mut matrix = matrix_rx.recv().unwrap();
	thread_pool.execute(move \|\| {
	for (x, y) in matrix.iter_mut().zip(buf.iter()) {
	// y could be '1' \| '0' \| '\n'
	*x = y.saturating_sub(b'0') as f32;
	}
	let _ = buff_tx.send(buf); // done with buf
	let column_sums = matrix.sum_axis(ndarray::Axis(0)).map(\|x\| 1.0 - x) / (N as f32);
	let avg_right_dot_products: Vec<f32> = matrix
	.rows()
	.into_iter()
	.map(\|row\| row.dot(&column_sums) / row.sum())
	.collect();
	let avg_wrong_dot_products: Vec<f32> = matrix
	.rows()
	.into_iter()
	.map(\|row\| row.map(\|x\| 1.0 - x).dot(&column_sums) / row.map(\|x\| 1.0 - x).sum())
	.collect();
	let _ = matrix_tx.send(matrix); // done with matrix
	let mut ppl = vec![Person::default(); N];
	for i in 0..N {
	ppl[i].id = i + 1;
	ppl[i].avg_right = avg_right_dot_products[i];
	ppl[i].avg_wrong = avg_wrong_dot_products[i];
	}
	ppl.sort_by(\|a, b\| a.avg_wrong.partial_cmp(&b.avg_wrong).unwrap());
	for i in 0..N {
	let a = i.saturating_sub(2);
	let b = (i + 3).min(N);
	for i2 in a..b {
	if i2 != i {
	ppl[i].predicted_avg_right += ppl[i2].avg_right;
	}
	}
	ppl[i].predicted_avg_right /= (b - a - 1) as f32;
	}
	let cheater = ppl
	.into_iter()
	.max_by(\|a, b\| {
	a.cheating_enhancement()
	.partial_cmp(&b.cheating_enhancement())
	.unwrap()
	})
	.unwrap();
	result_tx.send((case_num, cheater.id)).unwrap();
	});
	}
	drop(result_tx);
	let mut results = vec![];
	while let Ok(res) = result_rx.recv() {
	results.push(res);
	}
	results.sort();
	let out = results
	.into_iter()
	.map(\|(case_num, id)\| format!("Case #{case_num}: {id}").to_string())
	.collect::<Vec<String>>()
	.join("\n");
	println!("{out}");
	}
	use rand::{thread_rng, Rng};

	fn sigmoid(x: f64) -> f64 {
	1.0 / (1.0 + (-x).exp())
	}
	fn main() {
	let test_cases = 50usize;
	let cheat_rate = 0.5;
	let mut t = thread_rng();
	println!("{test_cases}");
	println!("86");
	for _ in 0..test_cases {
	let mut ppl = vec![0.0; 100];
	for p in &mut ppl {
	*p = t.gen_range(-3.0..=3.0);
	}
	let mut questions = vec![0.0; 10_000];
	for q in &mut questions {
	*q = t.gen_range(-3.0..3.0);
	}
	let mut matrix = vec![vec!["0"; 10_000]; 100];
	for i in 0..100 {
	for j in 0..10_000 {
	if t.gen_bool(sigmoid(ppl[i] - questions[j])) {
	matrix[i][j] = "1";
	}
	if i == 0 {
	// 0 is the cheater
	if t.gen_bool(cheat_rate) {
	// cheat
	matrix[i][j] = "1";
	}
	}
	}
	}

	for row in matrix {
	println!("{}", row.join(""));
	}
	}
	}