#![allow(warnings)]
#![crate_type = "rlib"]

use std::io::Read;

type Result<T> = std::result::Result<T, ()>;

pub struct TypeSectionReader<'a>(&'a ());
pub struct ImportSectionReader<'a>(&'a ());
pub struct ModuleSectionReader<'a>(&'a ());
pub struct FunctionBody<'a>(&'a ());
pub struct ElementSectionReader<'a>(&'a ());
pub struct ExportSectionReader<'a>(&'a ());
pub struct GlobalSectionReader<'a>(&'a ());
pub struct MemorySectionReader<'a>(&'a ());
pub struct TableSectionReader<'a>(&'a ());
pub struct FunctionSectionReader<'a>(&'a ());
pub struct InstanceSectionReader<'a>(&'a ());
pub struct AliasSectionReader<'a>(&'a ());
pub struct LocalsReader<'a>(&'a ());
pub struct Data<'a>(&'a ());
pub struct Operator<'a>(&'a ());
pub struct OperatorsReader<'a>(&'a ());
pub struct TypeSignature;
pub struct Type;

pub struct Parser {
    // Internally this has a state machine which looks like:
    //
    // * parsing a section - in this state we're looking for the section header
    //   which indicates the section code and how big the section is. Depending
    //   on the section code this will indicate what form of chunk is returned.
    //   This also waits for some sections to be entirely resident before
    //   proceeding.
    //
    // * parsing functions - this has a count of how many functions are
    //   remaining and expects the next item to be a function chunk.
    //
    // * parsing modules - similar to functions, indicates how many nested
    //   modules remain.
    //
    // Additionally there's a field which configures the maximum amount of data
    // which can be parsed. This is used to limit the module/code sections, for
    // example. It's also used when nested `Parser` structures are returned for
    // nested modules, to ensure they consume a precise amount of data.
    //
    // A state machine is always an avenue for complication, but this one is
    // hoped to stay relatively simple: the main complexity is dealing with the
    // streamed sections, and even that is fairly contained.
    _x: (),
}
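
// A minimal sketch, purely illustrative and not part of the proposed API, of
// what the internal state machine described above might look like. All names
// here are hypothetical.
enum ParserStateSketch {
    // Waiting for the next section header (section code plus size).
    SectionHeader,
    // Inside the code section; this many function bodies are still expected.
    FunctionEntries { remaining: u32 },
    // Inside the module code section; this many nested modules remain.
    ModuleEntries { remaining: u32 },
}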

impl Parser {
    // Creates a new module parser.
    //
    // Reports errors relative to the `offset` provided, where `offset` is some
    // logical offset within the input stream that we're parsing.
    pub fn new(offset: usize) -> Parser {
        unimplemented!()
    }

    // Attempts to parse a chunk of data.
    //
    // If a parse error happens, then `Err` is returned. Otherwise the number
    // of bytes consumed and the parsed chunk is returned. See more docs on
    // `Chunk` for what can be successfully parsed.
    //
    // If `eof` is `true` then it indicates that `data` is all the remaining
    // data and no more data will be available for parsing. If `eof` is `false`
    // then it means that more data may be coming in the future.
    //
    // It's expected that you parse until `Payload::End` is reached. For the
    // top-level module that won't get returned until `data` is empty and `eof`
    // is `true`. For sub-modules, however, `data` may have bytes in it or
    // `eof` may be false when `End` is returned.
    //
    // TODO: maybe `&[IoVec]` as input? probably too fancy
    pub fn parse<'a>(&mut self, data: &'a [u8], eof: bool) -> Result<Chunk<'a>> {
        unimplemented!()
    }
}

pub enum Chunk<'a> {
    // This can be returned at any time and indicates that more data is needed
    // to proceed with parsing. Zero bytes were consumed from the input to
    // `parse` and `usize` more bytes are needed before making progress.
    NeedMoreData(usize),
    // A chunk was successfully parsed.
    Parsed {
        // This many bytes of the `data` input to `parse` were consumed to
        // produce `payload`.
        consumed: usize,
        payload: Payload<'a>,
    },
}

pub enum Payload<'a> {
    // Sections which are received in their entirety and then available for
    // parsing.
    //
    // These sections are required to be entirely resident in memory before
    // they're parsed. The payload of each of these variants is a slice into
    // the original `data` passed to `parse` which outlines the entire section.
    //
    // Note that the presence of one of these chunks does not imply that the
    // section is valid or will parse correctly. You'll need to at least
    // iterate over the chunk or validate it to figure that out.
    TypeSection(TypeSectionReader<'a>),
    ImportSection(ImportSectionReader<'a>),
    AliasSection(AliasSectionReader<'a>),
    InstanceSection(InstanceSectionReader<'a>),
    ModuleSection(ModuleSectionReader<'a>),
    FunctionSection(FunctionSectionReader<'a>),
    TableSection(TableSectionReader<'a>),
    MemorySection(MemorySectionReader<'a>),
    GlobalSection(GlobalSectionReader<'a>),
    ExportSection(ExportSectionReader<'a>),
    StartSection(u32),
    ElementSection(ElementSectionReader<'a>),
    DataCount(u32),

    // The code section is a little more interesting since it's intended to be
    // streamed.
    //
    // The purpose of this is so that `parse` doesn't require the entirety of
    // all functions to be resident in memory before we return a chunk. This
    // way we can parse functions as they come off the wire or modules as
    // they're downloaded. Note, however, that functions are not internally
    // incrementally parsed; they're required to be fully resident in memory
    // before a chunk is returned.
    //
    // Note that `CodeSectionStart(u32)` means that the section has started and
    // that many `CodeSectionEntry` chunks will be returned. You do not need to
    // validate that there are `u32` items present; that will be validated
    // internally. You're guaranteed that the next `u32` successful chunks will
    // be `CodeSectionEntry`, and if that doesn't happen an error is returned
    // instead.
    CodeSectionStart(u32),
    CodeSectionEntry(FunctionBody<'a>),

    // This is similar to the code section where we want to allow streaming
    // processing and don't want to require modules to be fully resident in
    // memory.
    //
    // As with the code section, the start entry means that `u32` more
    // `ModuleCodeSectionEntry` entries will be returned. Unlike the code
    // section, though, once you receive a nested `Parser`, further data should
    // *not* be fed into this parser. Instead data should be fed into the
    // `Parser` given until it reports `End`. Once `End` is successfully seen
    // then you should switch back to the original `Parser`.
    ModuleCodeSectionStart(u32),
    ModuleCodeSectionEntry(Parser),

    // Like the code section, but allows streaming each individual data section
    // entry, in case they're large. Again you're guaranteed that after seeing
    // `DataSectionStart` you'll see that many `DataSectionEntry` payloads
    // next, or an error happens.
    DataSectionStart(u32),
    DataSectionEntry(Data<'a>),

    // The end was successfully reached, and the module is entirely parsed.
    End,
}
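
// A minimal sketch of driving `parse` when the entire module is already in
// memory, assuming the module contains no nested modules (the larger
// `validate` example below shows how to handle those and how to stream data
// in). Purely illustrative, not part of the proposal.
fn parse_in_memory_sketch(wasm: &[u8]) -> Result<()> {
    let mut parser = Parser::new(0);
    let mut offset = 0;
    loop {
        // With the full module in memory `eof` is always `true`, so a
        // truncated module comes back as `Err` rather than `NeedMoreData`.
        match parser.parse(&wasm[offset..], true)? {
            Chunk::NeedMoreData(_) => unreachable!(),
            Chunk::Parsed { consumed, payload } => {
                offset += consumed;
                match payload {
                    Payload::End => return Ok(()),
                    // ... handle or validate other payloads here ...
                    _ => {}
                }
            }
        }
    }
}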

pub struct Validator {
    // Imagine that implementation-wise this internally has state that looks
    // like:
    //
    //     state: Internal
    //
    //     enum Internal {
    //         Owned(OwnedArc<InternalState>),
    //         Shared(Arc<InternalState>),
    //     }
    //
    // where `OwnedArc` is a newtype wrapper around `Arc<T>` which statically
    // knows it has a singular reference count, so mutable access is safe. It
    // can then be converted to an `Arc<T>` later. For each of the "header"
    // sections which mutate a module's state we'll be in the `Owned` state.
    // Once we get to the code/module sections, though, we switch to the
    // `Shared` state since we don't mutate anything else (other than perhaps
    // local counts of how many items are expected).
    //
    // The intention here is that the sub-validators (both for modules and
    // functions) can refer to the parent's `Arc` data and are as a result
    // owned values, so we don't have to deal with lifetimes at all.
    _x: (),
}
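
// A minimal sketch, purely illustrative, of the `Owned`/`Shared` arrangement
// described above. `OwnedArc`, `InternalState`, and this module are all
// hypothetical names, not part of the proposed API.
mod validator_state_sketch {
    use std::sync::Arc;

    // Types, imports, function signatures, and so on collected while
    // validating the "header" sections.
    pub struct InternalState;

    // Newtype around `Arc<T>` that statically guarantees a reference count of
    // one, so handing out `&mut T` is sound. It never clones or shares its
    // inner `Arc` until `freeze` is called.
    pub struct OwnedArc<T>(Arc<T>);

    impl<T> OwnedArc<T> {
        pub fn new(value: T) -> OwnedArc<T> {
            OwnedArc(Arc::new(value))
        }

        pub fn get_mut(&mut self) -> &mut T {
            // Sound because the `Arc` is never shared before `freeze`.
            Arc::get_mut(&mut self.0).unwrap()
        }

        // Converts into a plain `Arc` once the header sections are finished
        // and the state becomes read-only for code/module validation.
        pub fn freeze(self) -> Arc<T> {
            self.0
        }
    }

    pub enum Internal {
        // Mutable while parsing the "header" sections.
        Owned(OwnedArc<InternalState>),
        // Shared with sub-validators for the code/module sections.
        Shared(Arc<InternalState>),
    }
}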

impl Validator {
    pub fn new() -> Validator {
        unimplemented!()
    }

    // Methods to validate each payload parsed from a wasm module. One of these
    // gets called for each `Payload` received, and there's a method for each
    // variant.
    pub fn type_section(&mut self, payload: TypeSectionReader<'_>) -> Result<()> {
        unimplemented!()
    }

    pub fn code_section_start(&mut self, amt: u32) -> Result<()> {
        unimplemented!()
    }

    // Returns a validator for the operators of this function body. That
    // validation work isn't done here since it's up to the application to
    // either defer it or do it immediately.
    pub fn code_section_entry<'a>(
        &mut self,
        body: FunctionBody<'a>,
    ) -> Result<(FuncValidator, OperatorsReader<'a>)> {
        unimplemented!()
    }

    // Note that this does not take the `Parser` payload since the data comes
    // from elsewhere.
    //
    // The validator returned should be used to validate all items that come
    // out of the nested `Parser` as data is fed into it.
    pub fn module_code_section_entry(&mut self) -> Result<Validator> {
        unimplemented!()
    }

    pub fn end(&mut self) -> Result<()> {
        unimplemented!()
    }

    // This would also include various accessors to figure out the type of each
    // function, element, global, etc. Internally everything about aliasing
    // with modules would be handled and you'd only have to deal with concrete
    // types and such.
}

pub struct FuncValidator {}

impl FuncValidator {
    // Called for each opcode read from a function.
    //
    // Alternatively the per-opcode methods below can be called instead; there's
    // no need to call both.
    pub fn op(&mut self, op: &Operator<'_>) -> Result<()> {
        unimplemented!()
    }

    // There's a method for each individual opcode in case you're already
    // decoding `Operator` yourself.
    pub fn i32_add(&mut self) -> Result<()> {
        unimplemented!()
    }

    // ... and each method can have a custom type signature, such as returning
    // what type is being tested for null.
    pub fn ref_is_null(&mut self) -> Result<Type> {
        unimplemented!()
    }

    // ... and each method can have custom arguments too.
    //
    // Here a `call` might return a type signature which could then, if needed,
    // be looked up to see what the params/returns are for that signature. For
    // example we'd have a method here on this struct, or on the original
    // `Validator`, to look it up.
    pub fn call(&mut self, func: u32) -> Result<TypeSignature> {
        unimplemented!()
    }

    // includes accessors to get the current stack of types, etc.
}
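
// A minimal sketch of the "validate immediately" option mentioned above, as
// opposed to the deferred, parallel approach in the `validate` example below.
// It assumes an iterator-style `read` method on `OperatorsReader` returning
// `Result<Option<Operator>>`, which is hypothetical and not declared in this
// sketch:
//
//     let (mut func_validator, mut ops) = validator.code_section_entry(body)?;
//     while let Some(op) = ops.read()? {
//         func_validator.op(&op)?;
//     }
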

/// Example code of how these APIs might be used. The goals here are to:
///
/// * Show an example of streaming parsing where we're learning more about the
///   module over time.
///
/// * Validate as we parse instead of parsing twice.
///
/// * Show how nested modules are handled.
///
/// * Show how functions can be validated in parallel.
///
/// Real code probably won't ever call this. This function, modulo its buffer
/// management which is application-specific, should be trivial to write. The
/// intention is that "real" applications will write this in their own crate
/// and sprinkle application-specific code throughout as necessary.
///
/// For example cranelift's wasm-to-clif translation would have this
/// intertwined with what it does today. It might optionally validate since
/// only wasmtime needs that.
fn validate(mut reader: impl Read) -> Result<()> {
    use Payload::*;
    let mut buf = Vec::new();
    let mut parser = Parser::new(0);
    let mut eof = false;
    let mut functions_to_validate = Vec::new();
    let mut validator = Validator::new();
    let mut stack = Vec::new();
    loop {
        let (payload, consumed) = match parser.parse(&buf, eof)? {
            Chunk::NeedMoreData(hint) => {
                assert!(!eof); // otherwise an error would be returned
                // Use the hint to preallocate more space.
                let len = buf.len();
                buf.extend((0..hint).map(|_| 0u8));
                // Actually do the read; this would use `?` if we had real
                // error types here. After the read, truncate our input back to
                // how many bytes were actually read and set the eof flag if
                // we've hit eof.
                let n = reader.read(&mut buf[len..]).unwrap();
                buf.truncate(len + n);
                eof = n == 0;
                continue;
            }
            Chunk::Parsed { consumed, payload } => (payload, consumed),
        };
        match payload {
            TypeSection(s) => validator.type_section(s)?,
            // imagine these are all filled in and pass through to the current
            // validator
            ImportSection(_) => {}
            AliasSection(_) => {}
            InstanceSection(_) => {}
            ModuleSection(_) => {}
            FunctionSection(_) => {}
            TableSection(_) => {}
            MemorySection(_) => {}
            GlobalSection(_) => {}
            ExportSection(_) => {}
            StartSection(_) => {}
            ElementSection(_) => {}
            DataCount(_) => {}

            // imagine this is filled in, nothing fancy other than informing
            // the validator of the count we got
            CodeSectionStart(_) => {}
            CodeSectionEntry(body) => {
                let (func_validator, ops) = validator.code_section_entry(body)?;
                // imagine that we have the accessors here to figure out the
                // byte offsets of `ops` within the original buffer. Then
                // imagine that we translate that into some sort of rope thing
                // where we peel off a section of the rope and push it onto the
                // stack of functions to validate. This should all be easy
                // enough to do in theory, and we could even easily just create
                // a `Vec<u8>`. In any case this is just a shim for now.
                drop(ops);
                let function_data: Vec<u8> = Vec::new();
                // Note that this `functions_to_validate` list could be a
                // work-stealing queue. While we're reading the main module
                // other threads could be stealing off this queue to validate
                // the functions.
                functions_to_validate.push((func_validator, function_data));
            }

            // imagine this is filled in, nothing fancy other than informing
            // the validator of the count we got
            ModuleCodeSectionStart(_) => {}
            // Save our current parser/validator onto the stack of parent
            // modules and then switch our context to a new parser and a new
            // validator.
            ModuleCodeSectionEntry(subparser) => {
                let subvalidator = validator.module_code_section_entry()?;
                stack.push((parser, validator));
                validator = subvalidator;
                parser = subparser;
            }

            // imagine this is filled in, nothing fancy other than informing
            // the validator of the items here
            DataSectionStart(_) => {}
            DataSectionEntry(_) => {}

            // Once we've reached the end of a module we either resume at the
            // parent module or we break out of the loop because we're done.
            End => {
                validator.end()?;
                if let Some((parent_parser, parent_validator)) = stack.pop() {
                    parser = parent_parser;
                    validator = parent_validator;
                } else {
                    break;
                }
            }
        }

        // once we're done processing the payload we can forget the original
        // data.
        buf.drain(..consumed);
    }

    // We in theory could have been work-stealing for the read above, but for
    // now just show off how we can do this in parallel.
    par_iter(functions_to_validate, |(mut validator, data)| {
        // imagine translating `data` to an operator parser
        let ops: OperatorsReader<'_> = loop {};
        loop {
            // imagine this is read out of `ops`
            let op: Operator<'_> = loop {};
            validator.op(&op)?;
        }
    })?;
    Ok(())
}

fn par_iter<T: Send>(t: Vec<T>, f: impl Fn(T) -> Result<()> + Send + Sync) -> Result<()> {
    unimplemented!()
}
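
// A minimal sketch, not part of the proposal, of how `par_iter` could be
// implemented with `std::thread::scope` (assuming a Rust toolchain where
// scoped threads are stable). The extra `'static` bound is only here to keep
// the scoped-thread lifetimes simple; a real application would more likely
// reach for rayon or the work-stealing queue mentioned above.
fn par_iter_sketch<T: Send + 'static>(
    items: Vec<T>,
    f: impl Fn(T) -> Result<()> + Send + Sync,
) -> Result<()> {
    let f = &f;
    std::thread::scope(|scope| {
        // Spawn one scoped thread per item, then propagate the first error.
        let handles: Vec<_> = items
            .into_iter()
            .map(|item| scope.spawn(move || f(item)))
            .collect();
        handles
            .into_iter()
            .map(|handle| handle.join().unwrap())
            .collect()
    })
}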

fn _assert_send_sync() {
    fn _assert<T: Send + Sync>() {}
    _assert::<Parser>();
    _assert::<Validator>();
} |