Last active
July 26, 2024 08:54
-
-
Save robert-king/8f70f162a8a6231f0cfeec8e832ea1a7 to your computer and use it in GitHub Desktop.
Incredible cheating detection system
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
- Blazing fast, highly accurate cheating detection algorithm | |
- by Robert King, https://www.linkedin.com/in/robertkingnz/ | |
- Youtube walkthrough: https://youtu.be/CnIQkIseLGs | |
- X https://x.com/robertkingNZ | |
- BACKGROUND | |
- The Google Code Jam team presented their 90% accurate algorithm after google code jam 2021. | |
- During the contest, Rob discovered a ~100% accurate algorithm and so thought it was worth sharing. | |
- THE PROBLEM (https://zibada.guru/gcj/2021qr/problems/#E): | |
- A cheater flips a coin, if it's heads, they cheat, if it's tails they don't. | |
- if they cheat their answer will be correct. | |
- how can we catch the cheater? | |
- Solutions need to be efficient as there's 100_000_000 quiz results. | |
- Question difficulty and player skill level is evenly distributed | |
- given boolean matrix of quiz results | |
- matrix[i][j] == 1 with probability sigmoid(skill[i]-difficulty[j]) | |
- INNOVATIVE SOLUTION: (try solve it yourself first) | |
- We can see the difficulty of a question by looking at percent of errors. | |
- We know the cheater didn't cheat on the problems they got wrong | |
- Therefore we can estimate a player's skill by looking at the average difficulty of the problems they got wrong. | |
- with this proxy for skill, we can rank players exactly by skill (due to monotonically increasing curves) | |
- e.g. if x < y, f(x) < f(y), so our sorting is accurate | |
- after ranking by skill, we can compare a cheater to their neighbours to detect anomalies | |
- the anomaly to look for is: the average difficulty of problems the cheater got correct is much higher relative to their neighbours | |
- This is because cheating is more likely to flip results on harder problems, so it moves the average difficulty of correct answers higher. | |
- we calculate this anomaly (cheating enhancement) for all players, and pick the max. | |
- HIGH PERFORMANCE CODE: | |
- Processes 50MB file in 0.26 seconds (100_000_000 quiz results) (my fast go lang implementation in the contest takes 9 seconds!) | |
- Processes 1GB file in 0.4 seconds (1_000_000_000 quiz results) (just on my laptop) | |
- uses a pool of buffers and a pool of ndarrays which are allocated up front and reused. | |
- BufReader reads file at max speed in main thread, reading entire matrix at a time | |
- As soon as a matrix is read, the next matrix is read (concurrently the previous buffer is processed by another thread) | |
- pool items are CachePadded to the length of a cache line for efficient parallel computation. | |
- the threadpool converts the buffer to an ndarray and passes the buffer back using sync_channel. | |
- ndarrays are manipulated in place using SIMD, i.e. dot products, sums, inversions | |
- SIMD allows a single CPU instruction to process multiple data points simultaneously | |
- ndarray is passed back to the pool | |
- matrix isn't large enough to make GPU acceleration worthwhile | |
- no need to check for newline characters, these are treated as zero column in matrix and dont affect results. | |
*/ | |
use crossbeam_utils::CachePadded; | |
use ndarray; | |
use std::fs::File; | |
use std::io::{BufRead, BufReader, Read}; | |
use std::sync::mpsc::{channel, sync_channel}; | |
use threadpool::ThreadPool; | |
const N: usize = 100; // num people (per test case) | |
const M: usize = 10_000; // num questions (per test case) | |
const NDARRAY_POOL_SIZE: usize = 20; | |
const BUFF_POOL_SIZE: usize = 5; | |
const N_WORKERS: usize = 20; | |
fn read_metadata(reader: &mut BufReader<File>) -> usize { | |
let mut buf = String::new(); | |
reader.read_line(&mut buf).unwrap(); | |
let t: usize = buf.trim_end().parse().unwrap(); | |
buf.clear(); | |
reader.read_line(&mut buf).unwrap(); | |
let _accuracy: usize = buf.trim().parse().unwrap(); | |
buf.clear(); | |
t | |
} | |
#[derive(Default, Clone, Debug)] | |
struct Person { | |
id: usize, | |
avg_right: f32, // on average how difficult were the problems they got right | |
avg_wrong: f32, // on average how difficult were the problems they got wrong | |
predicted_avg_right: f32, // ranked by avg_wrong, predict avg_right using neighbours | |
} | |
impl Person { | |
fn cheating_enhancement(&self) -> f32 { | |
// the avg difficulty of what they got right | |
// minus | |
// the avg difficulty of what their neighbours (ranked by average difficulty of what they got wrong) got right | |
self.avg_right - self.predicted_avg_right | |
} | |
} | |
fn main() { | |
// let f = File::open("test_data/sample_test_set_1/sample_ts1_input.txt").unwrap(); | |
// let f = File::open("test_data/test_set_2/ts2_input.txt").unwrap(); | |
let f = File::open("tst.txt").unwrap(); | |
let mut reader = BufReader::new(f); | |
let (matrix_tx, matrix_rx) = sync_channel(NDARRAY_POOL_SIZE); | |
for matrix in | |
vec![CachePadded::new(ndarray::Array2::<f32>::default((N, M + 1))); NDARRAY_POOL_SIZE] | |
{ | |
matrix_tx.send(matrix).unwrap(); | |
} | |
let (buff_tx, buff_rx) = sync_channel(BUFF_POOL_SIZE); | |
for buff in vec![CachePadded::new(vec![0; N * (M + 1)]); BUFF_POOL_SIZE] { | |
buff_tx.send(buff).unwrap(); | |
} | |
let (result_tx, result_rx) = channel(); | |
let thread_pool = ThreadPool::new(N_WORKERS); | |
let test_cases = read_metadata(&mut reader); | |
for case_num in 1..=test_cases { | |
let mut buf = buff_rx.recv().unwrap(); | |
reader.read_exact(buf.as_mut_slice()).unwrap(); | |
let buff_tx = buff_tx.clone(); | |
let matrix_tx = matrix_tx.clone(); | |
let result_tx = result_tx.clone(); | |
let mut matrix = matrix_rx.recv().unwrap(); | |
thread_pool.execute(move || { | |
for (x, y) in matrix.iter_mut().zip(buf.iter()) { | |
// y could be '1' | '0' | '\n' | |
*x = y.saturating_sub(b'0') as f32; | |
} | |
let _ = buff_tx.send(buf); // done with buf | |
let column_sums = matrix.sum_axis(ndarray::Axis(0)).map(|x| 1.0 - x) / (N as f32); | |
let avg_right_dot_products: Vec<f32> = matrix | |
.rows() | |
.into_iter() | |
.map(|row| row.dot(&column_sums) / row.sum()) | |
.collect(); | |
let avg_wrong_dot_products: Vec<f32> = matrix | |
.rows() | |
.into_iter() | |
.map(|row| row.map(|x| 1.0 - *x).dot(&column_sums) / row.map(|x| 1.0 - *x).sum()) | |
.collect(); | |
let _ = matrix_tx.send(matrix); // done with matrix | |
let mut ppl = vec![Person::default(); N]; | |
for i in 0..N { | |
ppl[i].id = i + 1; | |
ppl[i].avg_right = avg_right_dot_products[i]; | |
ppl[i].avg_wrong = avg_wrong_dot_products[i]; | |
} | |
ppl.sort_by(|a, b| a.avg_wrong.partial_cmp(&b.avg_wrong).unwrap()); | |
for i in 0..N { | |
let a = i.saturating_sub(2); | |
let b = (i + 3).min(N); | |
for i2 in a..b { | |
if i2 != i { | |
ppl[i].predicted_avg_right += ppl[i2].avg_right; | |
} | |
} | |
ppl[i].predicted_avg_right /= (b - a - 1) as f32; | |
} | |
let cheater = ppl | |
.into_iter() | |
.max_by(|a, b| { | |
a.cheating_enhancement() | |
.partial_cmp(&b.cheating_enhancement()) | |
.unwrap() | |
}) | |
.unwrap(); | |
result_tx.send((case_num, cheater.id)).unwrap(); | |
}); | |
} | |
drop(result_tx); | |
let mut results = vec![]; | |
while let Ok(res) = result_rx.recv() { | |
results.push(res); | |
} | |
results.sort(); | |
let out = results | |
.into_iter() | |
.map(|(case_num, id)| format!("Case #{case_num}: {id}").to_string()) | |
.collect::<Vec<String>>() | |
.join("\n"); | |
println!("{out}"); | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use rand::{thread_rng, Rng}; | |
fn sigmoid(x: f64) -> f64 { | |
1.0 / (1.0 + (-x).exp()) | |
} | |
fn main() { | |
let test_cases = 50usize; | |
let cheat_rate = 0.5; | |
let mut t = thread_rng(); | |
println!("{test_cases}"); | |
println!("86"); | |
for _ in 0..test_cases { | |
let mut ppl = vec![0.0; 100]; | |
for p in &mut ppl { | |
*p = t.gen_range(-3.0..=3.0); | |
} | |
let mut questions = vec![0.0; 10_000]; | |
for q in &mut questions { | |
*q = t.gen_range(-3.0..3.0); | |
} | |
let mut matrix = vec![vec!["0"; 10_000]; 100]; | |
for i in 0..100 { | |
for j in 0..10_000 { | |
if t.gen_bool(sigmoid(ppl[i] - questions[j])) { | |
matrix[i][j] = "1"; | |
} | |
if i == 0 { | |
// 0 is the cheater | |
if t.gen_bool(cheat_rate) { | |
// cheat | |
matrix[i][j] = "1"; | |
} | |
} | |
} | |
} | |
for row in matrix { | |
println!("{}", row.join("")); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment