Last active
April 9, 2022 18:39
-
-
Save mohe2015/acc0f6daef95f3fcb1f402c2a71d2433 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use std::io::{self, BufReader}; | |
use flate2::read::GzDecoder; | |
use std::io::prelude::*; | |
use std::fs::File; | |
use rustc_hash::FxHashMap; | |
use base64::{decode_config_slice, STANDARD}; | |
// cat /home/pi/2022_place_canvas_history.csv.gzip | gunzip | wc -c | |
// 21714634193 | |
// ~22 GB | |
fn main() -> io::Result<()> { | |
let stdout = io::stdout(); | |
let mut handle = stdout.lock(); | |
let f = File::open("/home/pi/2022_place_canvas_history.csv.gzip")?; | |
let mut gz = BufReader::new(GzDecoder::new(f)); | |
let mut next_user_id = -1; | |
let mut next_pixel_color = -1; | |
let mut user_ids = FxHashMap::default(); | |
let mut pixel_colors = FxHashMap::default(); | |
let mut line = Vec::<u8>::new(); | |
let mut number_of_bytes_read = 0; | |
gz.read_until(b'\n', &mut line)?; // drop header | |
line.clear(); | |
loop { | |
let line_length = gz.read_until(b'\n', &mut line)?; | |
if line_length == 0 { | |
break; | |
} | |
number_of_bytes_read += line_length; | |
if number_of_bytes_read % (16 * 1024) == 0 { | |
eprintln!("{} %", f64::from(u32::try_from(number_of_bytes_read).unwrap()) / 21714634193f64); | |
} | |
if number_of_bytes_read >= 2171463419 { | |
break; // TESTING | |
} | |
// almost everything in a line should be at a fixed offset so maybe cheat | |
let mut it = line.split(|c| *c == b','); | |
let timestamp = it.next().unwrap(); | |
let mut user_id = vec![0; 64]; | |
assert_eq!(64, decode_config_slice(it.next().unwrap(), STANDARD, &mut user_id).unwrap()); | |
let user_id = match user_ids.get(&user_id) { | |
Some(v) => *v, | |
None => { | |
next_user_id += 1; | |
user_ids.insert(user_id.clone(), next_user_id); | |
next_user_id | |
} | |
}; | |
let pixel_color = u32::from_str_radix(std::str::from_utf8(&it.next().unwrap()[1..]).unwrap(), 16).unwrap(); | |
// TODO maybe hex decode or store statically as the colors should be known | |
let pixel_color = match pixel_colors.get(&pixel_color) { | |
Some(v) => *v, | |
None => { | |
next_pixel_color += 1; | |
pixel_colors.insert(pixel_color, next_pixel_color); | |
next_pixel_color | |
} | |
}; | |
let coordinate_x = std::str::from_utf8(&it.next().unwrap()[1..]).unwrap().parse::<u16>().unwrap(); | |
let coordinate_y = it.next().unwrap(); | |
let coordinate_y = std::str::from_utf8(&coordinate_y[..coordinate_y.len()-2]).unwrap().parse::<u16>().unwrap(); | |
writeln!(handle, "{},{},{},{},{}", std::str::from_utf8(timestamp).unwrap(), user_id, pixel_color, coordinate_x, coordinate_y)?; | |
line.clear(); | |
} | |
eprintln!("{:#?}", pixel_colors); | |
eprintln!("user count: {}", next_user_id); | |
Ok(()) | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
sudo apt install -y postgresql | |
curl -O https://placedata.reddit.com/data/canvas-history/2022_place_canvas_history.csv.gzip | |
sudo -u postgres psql --user postgres | |
CREATE TABLE rplace (coordinate POINT NOT NULL, pixel_color CHAR(7) NOT NULL, user_id BYTEA NOT NULL, timestamp TIMESTAMP WITHOUT TIME ZONE NOT NULL); | |
COPY rplace (timestamp, user_id, pixel_color, coordinate) FROM PROGRAM 'gunzip -c /home/pi/2022_place_canvas_history.csv.gzip | grep -v -P "\"\d+,\d+,\d+,\d+\""' WITH CSV HEADER; | |
VACUUM (VERBOSE, ANALYZE) rplace; | |
EXPLAIN ANALYZE SELECT COUNT(*) FROM rplace; | |
set maintenance_work_mem TO "4GB"; | |
CREATE INDEX timestamp_idx ON rplace USING BRIN (timestamp); | |
CREATE INDEX coordinate_idx ON rplace USING GIST (coordinate); | |
CREATE INDEX user_id_idx ON rplace USING HASH (user_id); | |
CREATE INDEX pixel_color_idx ON rplace USING HASH (pixel_color); | |
ALTER SYSTEM SET maintenance_work_mem = '4GB'; | |
SELECT pg_reload_conf(); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment