Created
July 6, 2024 22:51
-
-
Save trevorbernard/8ea34911b7aa4d6f1f93de1b318dd5b2 to your computer and use it in GitHub Desktop.
Ingestion Experiment
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use glob::glob; | |
use std::{ | |
fs, | |
path::{Path, PathBuf}, time::Instant, | |
}; | |
/// The BlockFileMetaData represents the parsed metadata from a
/// Precomputed Block filename.
///
/// The lifetime `'a` ties `network` and `state_hash` back to the
/// filename string they were sliced from, so no allocation is needed.
///
/// Each mainnet precomputed block filename has the following grammar:
///
/// filename ::= network "-" block_height "-" state_hash ".json" ;
/// network ::= "mainnet" ;
/// block_height ::= digit+ ;
/// state_hash ::= "3N" alphanumeric{50} ;
/// digit ::= "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" ;
/// alpha ::= "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" | "j" | "k" |
/// "l" | "m" | "n" | "o" | "p" | "q" | "r" | "s" | "t" | "u" | "v" |
/// "w" | "x" | "y" | "z" | "A" | "B" | "C" | "D" | "E" | "F" | "G" |
/// "H" | "I" | "J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" |
/// "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z" | digit ;
#[derive(Debug)]
pub struct BlockFileMetaData<'a> {
    /// The block height of a precomputed block
    pub(crate) height: usize,
    /// The network of the precomputed block (e.g. "mainnet"),
    /// borrowed from the filename
    pub(crate) network: &'a str,
    /// The state hash of the precomputed block ("3N" followed by 50
    /// alphanumerics), borrowed from the filename
    pub(crate) state_hash: &'a str,
}
/// The [BlockFile] represents a precomputed block on disk: the path
/// to the block file paired with the metadata parsed from its
/// filename.
#[derive(Debug)]
pub struct BlockFile<'a> {
    /// The underlying Path reference to the block file
    pub(crate) path: &'a Path,
    /// The metadata parsed from the block's filename
    pub(crate) metadata: BlockFileMetaData<'a>,
}
impl<'a> BlockFileMetaData<'a> { | |
/// Value precomputed block height | |
pub fn height(&self) -> usize { | |
self.height | |
} | |
/// Value precomputed block network | |
pub fn network(&self) -> &'a str { | |
self.network | |
} | |
/// Value precomputed block state hash | |
pub fn state_hash(&self) -> &'a str { | |
self.state_hash | |
} | |
} | |
impl<'a> BlockFileMetaData<'a> { | |
/// Efficiently parse metadata from filename of a precomputed | |
/// block. It does rudimentary but fast validation ensuring the | |
/// block height gets parsed into a [`usize`] and the state_hash | |
/// starts with "3N" and is 52 characters long. | |
/// | |
/// This function keeps reference from the underlying PathBuf | |
/// eliminating unnessary allocations. | |
pub fn from_filename<P: AsRef<Path> + 'a>(path: &'a P) -> Option<Self> { | |
let filename = path.as_ref().file_name()?.to_str()?; | |
// Ensure the filename ends with ".json" | |
if !filename.ends_with(".json") { | |
return None; | |
} | |
// Remove the ".json" extension | |
let name_without_ext = &filename[..filename.len() - 5]; | |
// Find the positions of the hyphens | |
let first_dash = name_without_ext.find('-')?; | |
let second_dash = name_without_ext[first_dash + 1..].find('-')? + first_dash + 1; | |
// Extract the network, block height, and state hash | |
let network = &name_without_ext[..first_dash]; | |
let height_str = &name_without_ext[first_dash + 1..second_dash]; | |
let state_hash = &name_without_ext[second_dash + 1..]; | |
// Parse the block height | |
let height = height_str.parse().ok()?; | |
// Ensure the state hash starts with "3N" and is 52 characters long | |
if !state_hash.starts_with("3N") || state_hash.len() != 52 { | |
return None; | |
} | |
Some(BlockFileMetaData { | |
height, | |
network, | |
state_hash, | |
}) | |
} | |
} | |
/// The [BlockIngestionSummary] represents the results of the
/// Precomputed block ingestion process.
#[derive(Debug, Default, PartialEq)]
pub struct BlockIngestionSummary {
    /// Total number of blocks in the ingestion directory
    pub(crate) total_blocks_count: usize,
    /// Total number of canonical blocks in the ingestion directory
    pub(crate) total_canonical_blocks_count: usize,
    /// Total number of orphaned blocks in the ingestion directory
    pub(crate) total_orphaned_blocks_count: usize,
    /// Total number of pending blocks in the ingestion directory
    pub(crate) total_pending_blocks_count: usize,
}
/// It accepts a slice of [`AsRef<Path>`] and returns a | |
/// [`Vec<BlockFile<'a>`] in ascending order by block height | |
fn get_sorted_block_files<'a, P: AsRef<Path>>( | |
paths: &'a [P], | |
) -> anyhow::Result<Vec<BlockFile<'a>>> { | |
let mut block_files: Vec<_> = paths | |
.iter() | |
.filter_map(|filename| { | |
BlockFileMetaData::from_filename(filename).map(|metadata| BlockFile { | |
path: filename.as_ref(), | |
metadata, | |
}) | |
}) | |
.collect(); | |
// No need to use [`sort_by_cached_key`] here since the key function isn't | |
// expensive. Sort in ascending order | |
block_files.sort_by_key(|bf| bf.metadata.height); | |
Ok(block_files) | |
} | |
/// Ingest a directory of Precomputed Blocks to create an initial | |
/// database. | |
/// | |
/// This is a blocking operation that should be called from an OS | |
/// thread. If you are using a async runtime like [tokio], be aware | |
/// that this operation won't be cancelable since it uses blocking | |
/// I/O and will lock up a Runtime thread. | |
pub fn ingest_blocks<P: AsRef<Path>>(path: P) -> anyhow::Result<BlockIngestionSummary> { | |
// Validate that the path_ref is a none empty directory | |
let path_ref = path.as_ref(); | |
if !path_ref.is_dir() { | |
log::warn!("path must be a directory: {}", path_ref.display()); | |
return Ok(BlockIngestionSummary::default()); | |
} else { | |
let mut entries = fs::read_dir(path_ref)?; | |
if entries.next().is_none() { | |
log::warn!("path must have entries: {}", path_ref.display()); | |
return Ok(BlockIngestionSummary::default()); | |
} | |
} | |
// Find the canonical chain and ingest w/o adding to the witness tree | |
let time = Instant::now(); | |
let pattern = format!("{}/*-*-*.json", path.as_ref().display()); | |
let filenames: Vec<PathBuf> = glob(&pattern)?.filter_map(|x| x.ok()).collect(); | |
let _sorted_block_files = match get_sorted_block_files(filenames.as_slice()) { | |
Ok(sorted) => sorted, | |
Err(e) => panic!("Unable to sort block files: {e}"), | |
}; | |
let elapsed = time.elapsed(); | |
let size = _sorted_block_files.len(); | |
println!("Sorted block files {size} in: {elapsed:?} ms"); | |
Ok(BlockIngestionSummary::default()) | |
} | |
#[cfg(test)]
mod tests {
    use super::BlockFileMetaData;
    use crate::ingestion::{ingest_blocks, BlockIngestionSummary};
    use tempfile::NamedTempFile;

    /// Ad-hoc smoke test against a local block archive. Ignored by
    /// default: it depends on a machine-specific absolute path and
    /// makes no assertions.
    #[test]
    #[ignore = "requires a machine-local directory of precomputed blocks"]
    fn test_foobar() -> anyhow::Result<()> {
        let path = "/Users/tbernard/blocks/100000-blocks";
        let _ = ingest_blocks(path);
        Ok(())
    }

    /// An empty directory yields the default (all-zero) summary.
    #[test]
    fn test_ingestion_on_empty_dir() -> anyhow::Result<()> {
        let tmp_dir = tempfile::tempdir().expect("empty tmp dir");
        let path = tmp_dir.path();
        let summary = ingest_blocks(path)?;
        assert_eq!(BlockIngestionSummary::default(), summary);
        Ok(())
    }

    /// A path that is not a directory yields the default summary.
    #[test]
    fn test_ingestion_on_non_dir() -> anyhow::Result<()> {
        let tmp_file = NamedTempFile::new().expect("tmp file");
        let path = tmp_file.path();
        let summary = ingest_blocks(path)?;
        assert_eq!(BlockIngestionSummary::default(), summary);
        Ok(())
    }

    /// A non-numeric height ("3596b04") must fail to parse.
    #[test]
    fn test_invalid_block_height_filename() -> anyhow::Result<()> {
        let filename = "mainnet-3596b04-3NLRTfY4kZyJtvaP4dFenDcxfoMfT3uEpkWS913KkeXLtziyVd15.json";
        let block_file_meta = BlockFileMetaData::from_filename(&filename);
        assert!(block_file_meta.is_none());
        Ok(())
    }

    /// Anything other than a ".json" extension must fail to parse.
    #[test]
    fn test_invalid_extension_filename() -> anyhow::Result<()> {
        let filename = "mainnet-359604-3NLRTfY4kZyJtvaP4dFenDcxfoMfT3uEpkWS913KkeXLtziyVd15.foobar";
        let block_file_meta = BlockFileMetaData::from_filename(&filename);
        assert!(block_file_meta.is_none());
        Ok(())
    }

    /// A well-formed bare filename parses into its three components.
    #[test]
    fn test_valid_filename() -> anyhow::Result<()> {
        let filename = "mainnet-359604-3NLRTfY4kZyJtvaP4dFenDcxfoMfT3uEpkWS913KkeXLtziyVd15.json";
        if let Some(block_file_meta) = BlockFileMetaData::from_filename(&filename) {
            let network = block_file_meta.network();
            let height = block_file_meta.height();
            let state_hash = block_file_meta.state_hash();
            assert_eq!("mainnet", network);
            assert_eq!(359604_usize, height);
            assert_eq!(
                "3NLRTfY4kZyJtvaP4dFenDcxfoMfT3uEpkWS913KkeXLtziyVd15",
                state_hash
            );
        } else {
            panic!("Unable to parse block file metadata");
        }
        Ok(())
    }

    /// Parsing only looks at the final path component, so leading
    /// directories are irrelevant.
    #[test]
    fn test_valid_long_path() -> anyhow::Result<()> {
        let filename = "tests/data/sequential_blocks/mainnet-105494-3NKXsaznJ6WdyA4PHfXxn25RzVanzQsNMZrxjidbhoBug8R4LZDy.json";
        if let Some(block_file_meta) = BlockFileMetaData::from_filename(&filename) {
            let network = block_file_meta.network();
            let height = block_file_meta.height();
            let state_hash = block_file_meta.state_hash();
            assert_eq!("mainnet", network);
            assert_eq!(105494_usize, height);
            assert_eq!(
                "3NKXsaznJ6WdyA4PHfXxn25RzVanzQsNMZrxjidbhoBug8R4LZDy",
                state_hash
            );
        } else {
            panic!("Unable to parse block file metadata");
        }
        Ok(())
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment