Skip to content

Instantly share code, notes, and snippets.

@robert-king
Last active July 26, 2024 08:54
Show Gist options
  • Save robert-king/8f70f162a8a6231f0cfeec8e832ea1a7 to your computer and use it in GitHub Desktop.
Save robert-king/8f70f162a8a6231f0cfeec8e832ea1a7 to your computer and use it in GitHub Desktop.
Incredible cheating detection system
/*
- Blazing fast, highly accurate cheating detection algorithm
- by Robert King, https://www.linkedin.com/in/robertkingnz/
- Youtube walkthrough: https://youtu.be/CnIQkIseLGs
- X https://x.com/robertkingNZ
- BACKGROUND
- The Google Code Jam team presented their 90% accurate algorithm after google code jam 2021.
- During the contest, Rob discovered a ~100% accurate algorithm and so thought it was worth sharing.
- THE PROBLEM (https://zibada.guru/gcj/2021qr/problems/#E):
- A cheater flips a coin, if it's heads, they cheat, if it's tails they don't.
- if they cheat their answer will be correct.
- how can we catch the cheater?
- Solutions need to be efficient as there's 100_000_000 quiz results.
- Question difficulty and player skill level is evenly distributed
- given boolean matrix of quiz results
- matrix[i][j] == 1 with probability sigmoid(skill[i]-difficulty[j])
- INNOVATIVE SOLUTION: (try solve it yourself first)
- We can see the difficulty of a question by looking at percent of errors.
- We know the cheater didn't cheat on the problems they got wrong
- Therefore we can estimate a player's skill by looking at the average difficulty of the problems they got wrong.
- with this proxy for skill, we can rank players exactly by skill (due to monotonically increasing curves)
- e.g. if x < y, f(x) < f(y), so our sorting is accurate
- after ranking by skill, we can compare a cheater to their neighbours to detect anomalies
- the anomaly to look for is: the average difficulty of problems the cheater got correct is much higher relative to their neighbours
- This is because cheating is more likely to flip results on harder problems, so it moves the average difficulty of correct answers higher.
- we calculate this anomaly (cheating enhancement) for all players, and pick the max.
- HIGH PERFORMANCE CODE:
- Processes 50MB file in 0.26 seconds (100_000_000 quiz results) (my fast go lang implementation in the contest takes 9 seconds!)
- Processes 1GB file in 0.4 seconds (1_000_000_000 quiz results) (just on my laptop)
- uses a pool of buffers and a pool of ndarrays which are allocated up front and reused.
- BufReader reads file at max speed in main thread, reading entire matrix at a time
- As soon as a matrix is read, the next matrix is read (concurrently the previous buffer is processed by another thread)
- pool items are CachePadded to the length of a cache line for efficient parallel computation.
- the threadpool converts the buffer to an ndarray and passes the buffer back using sync_channel.
- ndarrays are manipulated in place using SIMD, i.e. dot products, sums, inversions
- SIMD allows a single CPU instruction to process multiple data points simultaneously
- ndarray is passed back to the pool
- matrix isn't large enough to make GPU acceleration worthwhile
- no need to check for newline characters, these are treated as zero column in matrix and dont affect results.
*/
use crossbeam_utils::CachePadded;
use ndarray;
use std::fs::File;
use std::io::{BufRead, BufReader, Read};
use std::sync::mpsc::{channel, sync_channel};
use threadpool::ThreadPool;
const N: usize = 100; // num people (per test case)
const M: usize = 10_000; // num questions (per test case)
const NDARRAY_POOL_SIZE: usize = 20;
const BUFF_POOL_SIZE: usize = 5;
const N_WORKERS: usize = 20;
fn read_metadata(reader: &mut BufReader<File>) -> usize {
let mut buf = String::new();
reader.read_line(&mut buf).unwrap();
let t: usize = buf.trim_end().parse().unwrap();
buf.clear();
reader.read_line(&mut buf).unwrap();
let _accuracy: usize = buf.trim().parse().unwrap();
buf.clear();
t
}
#[derive(Default, Clone, Debug)]
struct Person {
id: usize,
avg_right: f32, // on average how difficult were the problems they got right
avg_wrong: f32, // on average how difficult were the problems they got wrong
predicted_avg_right: f32, // ranked by avg_wrong, predict avg_right using neighbours
}
impl Person {
fn cheating_enhancement(&self) -> f32 {
// the avg difficulty of what they got right
// minus
// the avg difficulty of what their neighbours (ranked by average difficulty of what they got wrong) got right
self.avg_right - self.predicted_avg_right
}
}
fn main() {
// let f = File::open("test_data/sample_test_set_1/sample_ts1_input.txt").unwrap();
// let f = File::open("test_data/test_set_2/ts2_input.txt").unwrap();
let f = File::open("tst.txt").unwrap();
let mut reader = BufReader::new(f);
let (matrix_tx, matrix_rx) = sync_channel(NDARRAY_POOL_SIZE);
for matrix in
vec![CachePadded::new(ndarray::Array2::<f32>::default((N, M + 1))); NDARRAY_POOL_SIZE]
{
matrix_tx.send(matrix).unwrap();
}
let (buff_tx, buff_rx) = sync_channel(BUFF_POOL_SIZE);
for buff in vec![CachePadded::new(vec![0; N * (M + 1)]); BUFF_POOL_SIZE] {
buff_tx.send(buff).unwrap();
}
let (result_tx, result_rx) = channel();
let thread_pool = ThreadPool::new(N_WORKERS);
let test_cases = read_metadata(&mut reader);
for case_num in 1..=test_cases {
let mut buf = buff_rx.recv().unwrap();
reader.read_exact(buf.as_mut_slice()).unwrap();
let buff_tx = buff_tx.clone();
let matrix_tx = matrix_tx.clone();
let result_tx = result_tx.clone();
let mut matrix = matrix_rx.recv().unwrap();
thread_pool.execute(move || {
for (x, y) in matrix.iter_mut().zip(buf.iter()) {
// y could be '1' | '0' | '\n'
*x = y.saturating_sub(b'0') as f32;
}
let _ = buff_tx.send(buf); // done with buf
let column_sums = matrix.sum_axis(ndarray::Axis(0)).map(|x| 1.0 - x) / (N as f32);
let avg_right_dot_products: Vec<f32> = matrix
.rows()
.into_iter()
.map(|row| row.dot(&column_sums) / row.sum())
.collect();
let avg_wrong_dot_products: Vec<f32> = matrix
.rows()
.into_iter()
.map(|row| row.map(|x| 1.0 - *x).dot(&column_sums) / row.map(|x| 1.0 - *x).sum())
.collect();
let _ = matrix_tx.send(matrix); // done with matrix
let mut ppl = vec![Person::default(); N];
for i in 0..N {
ppl[i].id = i + 1;
ppl[i].avg_right = avg_right_dot_products[i];
ppl[i].avg_wrong = avg_wrong_dot_products[i];
}
ppl.sort_by(|a, b| a.avg_wrong.partial_cmp(&b.avg_wrong).unwrap());
for i in 0..N {
let a = i.saturating_sub(2);
let b = (i + 3).min(N);
for i2 in a..b {
if i2 != i {
ppl[i].predicted_avg_right += ppl[i2].avg_right;
}
}
ppl[i].predicted_avg_right /= (b - a - 1) as f32;
}
let cheater = ppl
.into_iter()
.max_by(|a, b| {
a.cheating_enhancement()
.partial_cmp(&b.cheating_enhancement())
.unwrap()
})
.unwrap();
result_tx.send((case_num, cheater.id)).unwrap();
});
}
drop(result_tx);
let mut results = vec![];
while let Ok(res) = result_rx.recv() {
results.push(res);
}
results.sort();
let out = results
.into_iter()
.map(|(case_num, id)| format!("Case #{case_num}: {id}").to_string())
.collect::<Vec<String>>()
.join("\n");
println!("{out}");
}
use rand::{thread_rng, Rng};
fn sigmoid(x: f64) -> f64 {
1.0 / (1.0 + (-x).exp())
}
fn main() {
let test_cases = 50usize;
let cheat_rate = 0.5;
let mut t = thread_rng();
println!("{test_cases}");
println!("86");
for _ in 0..test_cases {
let mut ppl = vec![0.0; 100];
for p in &mut ppl {
*p = t.gen_range(-3.0..=3.0);
}
let mut questions = vec![0.0; 10_000];
for q in &mut questions {
*q = t.gen_range(-3.0..3.0);
}
let mut matrix = vec![vec!["0"; 10_000]; 100];
for i in 0..100 {
for j in 0..10_000 {
if t.gen_bool(sigmoid(ppl[i] - questions[j])) {
matrix[i][j] = "1";
}
if i == 0 {
// 0 is the cheater
if t.gen_bool(cheat_rate) {
// cheat
matrix[i][j] = "1";
}
}
}
}
for row in matrix {
println!("{}", row.join(""));
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment