Created
October 6, 2023 08:43
-
-
Save sangelxyz/6d7c177ad521d6dd158bed97b45fbc29 to your computer and use it in GitHub Desktop.
data processing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
fn main() { | |
let json_str = std::fs::read_to_string("../posts.json").unwrap(); | |
let posts: Vec<Post> = from_str(&json_str).unwrap(); | |
let start = Instant::now(); | |
let mut post_tags_map: FxHashMap<&str, Vec<u16>> = FxHashMap::default(); | |
for (post_idx, post) in posts.iter().enumerate() { | |
for tag in post.tags.iter() { | |
post_tags_map.entry(tag).or_default().push(post_idx as u16); | |
} | |
} | |
let start2 = Instant::now(); | |
let related_posts: Vec<RelatedPosts<'_>> = posts | |
.iter() | |
.enumerate() | |
.map(|(post_idx, post)| { | |
// faster than allocating outside the loop | |
let mut tagged_post_count = vec![0u16; posts.len()]; | |
post.tags | |
.iter() | |
.flat_map(|tag| post_tags_map.get::<str>(tag.as_ref()).into_iter().flatten()) | |
.for_each(|&other_post_idx| tagged_post_count[other_post_idx as usize] += 1); | |
tagged_post_count[post_idx] = 0; // don't recommend the same post | |
let top = least_n( | |
NUM_TOP_ITEMS, | |
tagged_post_count | |
.iter() | |
.enumerate() | |
.map(|(post, &count)| PostCount { | |
post: post as u16, | |
count, | |
}), | |
); | |
//let related = top.map(|it| &posts[it.post as usize]).collect(); | |
let related: Vec<&Post> = top.map(|it| &posts[it.post as usize]).collect(); | |
RelatedPosts { | |
_id: &post._id, | |
tags: &post.tags, | |
related, | |
} | |
}) | |
.collect(); | |
let end2 = Instant::now(); | |
let end = Instant::now(); | |
// I have no explanation for why, but doing this before the print improves performance pretty | |
// significantly (15%) when using slices in the hashmap key and RelatedPosts | |
let json_str = serde_json::to_string(&related_posts).unwrap(); | |
print!( | |
"Processing time (w/o IO): {:?} {:?}\n", | |
end.duration_since(start), | |
end2.duration_since(start2), | |
); | |
std::fs::write("../related_posts_rust.json", json_str).unwrap(); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment