Created
July 27, 2013 20:01
-
-
Save lifthrasiir/6096084 to your computer and use it in GitHub Desktop.
Character encoding interface for Rust (proof-of-concept)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
mod util { | |
use std::str::CharRange; | |
pub struct StrCharIndexIterator<'self> { | |
priv index: uint, | |
priv string: &'self str, | |
} | |
impl<'self> Iterator<(uint, uint, char)> for StrCharIndexIterator<'self> { | |
#[inline] | |
fn next(&mut self) -> Option<(uint, uint, char)> { | |
if self.index < self.string.len() { | |
let CharRange {ch, next} = self.string.char_range_at(self.index); | |
let index = self.index; | |
self.index = next; | |
Some((index, next, ch)) | |
} else { | |
None | |
} | |
} | |
} | |
pub trait StrCharIndex<'self> { | |
fn index_iter(&self) -> StrCharIndexIterator<'self>; | |
} | |
impl<'self> StrCharIndex<'self> for &'self str { | |
fn index_iter(&self) -> StrCharIndexIterator<'self> { | |
StrCharIndexIterator { index: 0, string: *self } | |
} | |
} | |
} | |
pub mod types { | |
pub struct CodecError<Remaining,Problem> { | |
remaining: Remaining, | |
problem: Problem, | |
cause: ~str, | |
} | |
pub trait Encoder { | |
pub fn encoding(&self) -> ~Encoding; | |
pub fn feed<'r>(&mut self, input: &'r str) -> (~[u8],Option<CodecError<&'r str,~str>>); | |
pub fn flush(~self) -> (~[u8],Option<CodecError<(),~str>>); | |
} | |
pub trait Decoder { | |
pub fn encoding(&self) -> ~Encoding; | |
pub fn feed<'r>(&mut self, input: &'r [u8]) -> (~str,Option<CodecError<&'r [u8],~[u8]>>); | |
pub fn flush(~self) -> (~str,Option<CodecError<(),~[u8]>>); | |
} | |
pub trait Encoding { | |
pub fn name(&self) -> ~str; | |
pub fn encoder(&self) -> ~Encoder; | |
pub fn decoder(&self) -> ~Decoder; | |
pub fn preferred_replacement_seq(&self) -> ~[u8]; | |
} | |
pub trait EncodingUtil<T:Encoding> { | |
pub fn encode<Trap:EncoderTrap<T>>(&self, input: &str, trap: Trap) -> Result<~[u8],~str>; | |
pub fn decode<Trap:DecoderTrap<T>>(&self, input: &[u8], trap: Trap) -> Result<~str,~str>; | |
} | |
impl<T:Encoding> EncodingUtil<T> for T { | |
#[inline] | |
pub fn encode<Trap:EncoderTrap<T>>(&self, input: &str, mut trap: Trap) -> Result<~[u8],~str> { | |
let mut encoder = self.encoder(); | |
let mut remaining = input; | |
let mut ret = ~[]; | |
loop { | |
let (encoded, err) = encoder.feed(remaining); | |
ret.push_all(encoded); | |
match err { | |
Some(err) => { | |
match trap.encoder_trap(self, err.problem) { | |
Some(s) => { ret.push_all(s); } | |
None => { return Err(err.cause); } | |
} | |
remaining = err.remaining; | |
} | |
None => break | |
} | |
} | |
let (encoded, err) = encoder.flush(); | |
ret.push_all(encoded); | |
match err { | |
Some(err) => { | |
match trap.encoder_trap(self, err.problem) { | |
Some(s) => { ret.push_all(s); } | |
None => { return Err(err.cause); } | |
} | |
} | |
None => {} | |
} | |
Ok(ret) | |
} | |
pub fn decode<Trap:DecoderTrap<T>>(&self, input: &[u8], mut trap: Trap) -> Result<~str,~str> { | |
let mut decoder = self.decoder(); | |
let mut remaining = input; | |
let mut ret = ~""; | |
loop { | |
let (decoded, err) = decoder.feed(remaining); | |
ret.push_str(decoded); | |
match err { | |
Some(err) => { | |
match trap.decoder_trap(self, err.problem) { | |
Some(s) => { ret.push_str(s); } | |
None => { return Err(err.cause); } | |
} | |
remaining = err.remaining; | |
} | |
None => break | |
} | |
} | |
let (decoded, err) = decoder.flush(); | |
ret.push_str(decoded); | |
match err { | |
Some(err) => { | |
match trap.decoder_trap(self, err.problem) { | |
Some(s) => { ret.push_str(s); } | |
None => { return Err(err.cause); } | |
} | |
} | |
None => {} | |
} | |
Ok(ret) | |
} | |
} | |
pub trait EncoderTrap<T:Encoding> { | |
pub fn encoder_trap(&mut self, encoding: &T, input: &str) -> Option<~[u8]>; | |
} | |
pub trait DecoderTrap<T:Encoding> { | |
pub fn decoder_trap(&mut self, encoding: &T, input: &[u8]) -> Option<~str>; | |
} | |
impl<'self,T:Encoding> EncoderTrap<T> for &'self fn(&T,&str) -> Option<~[u8]> { | |
pub fn encoder_trap(&mut self, encoding: &T, input: &str) -> Option<~[u8]> { | |
(*self)(encoding, input) | |
} | |
} | |
impl<'self,T:Encoding> DecoderTrap<T> for &'self fn(&T,&[u8]) -> Option<~str> { | |
pub fn decoder_trap(&mut self, encoding: &T, input: &[u8]) -> Option<~str> { | |
(*self)(encoding, input) | |
} | |
} | |
} | |
pub mod trap { | |
use types::*; | |
pub struct Strict; | |
impl<T:Encoding> EncoderTrap<T> for Strict { | |
#[inline] | |
pub fn encoder_trap(&mut self, _encoding: &T, _input: &str) -> Option<~[u8]> { | |
None | |
} | |
} | |
impl<T:Encoding> DecoderTrap<T> for Strict { | |
#[inline] | |
pub fn decoder_trap(&mut self, _encoding: &T, _input: &[u8]) -> Option<~str> { | |
None | |
} | |
} | |
pub struct Replace; | |
impl<T:Encoding> EncoderTrap<T> for Replace { | |
#[inline] | |
pub fn encoder_trap(&mut self, encoding: &T, _input: &str) -> Option<~[u8]> { | |
Some(encoding.preferred_replacement_seq()) | |
} | |
} | |
impl<T:Encoding> DecoderTrap<T> for Replace { | |
#[inline] | |
pub fn decoder_trap(&mut self, _encoding: &T, _input: &[u8]) -> Option<~str> { | |
Some(~"\ufffd") | |
} | |
} | |
pub struct Ignore; | |
impl<T:Encoding> EncoderTrap<T> for Ignore { | |
#[inline] | |
pub fn encoder_trap(&mut self, _encoding: &T, _input: &str) -> Option<~[u8]> { | |
Some(~[]) | |
} | |
} | |
impl<T:Encoding> DecoderTrap<T> for Ignore { | |
#[inline] | |
pub fn decoder_trap(&mut self, _encoding: &T, _input: &[u8]) -> Option<~str> { | |
Some(~"") | |
} | |
} | |
} | |
pub mod codec { | |
pub mod ascii { | |
use std::str; | |
use util::StrCharIndex; | |
use types::*; | |
/// The ASCII encoding; its `Encoding` implementation appears further down in
/// this module.
pub struct ASCII;
pub struct ASCIIEncoder; | |
impl Encoder for ASCIIEncoder { | |
pub fn encoding(&self) -> ~Encoding { ~ASCII as ~Encoding } | |
pub fn feed<'r>(&mut self, input: &'r str) -> (~[u8],Option<CodecError<&'r str,~str>>) { | |
let mut ret = ~[]; | |
let mut err = None; | |
for input.index_iter().advance |(_, j, ch)| { | |
if ch <= '\u007f' { | |
ret.push(ch as u8); | |
} else { | |
err = Some(CodecError { | |
remaining: input.slice_from(j), | |
problem: str::from_char(ch), | |
cause: ~"unrepresentable character", | |
}); | |
break; | |
} | |
} | |
(ret, err) | |
} | |
pub fn flush(~self) -> (~[u8],Option<CodecError<(),~str>>) { | |
(~[], None) | |
} | |
} | |
pub struct ASCIIDecoder; | |
impl Decoder for ASCIIDecoder { | |
pub fn encoding(&self) -> ~Encoding { ~ASCII as ~Encoding } | |
pub fn feed<'r>(&mut self, input: &'r [u8]) -> (~str,Option<CodecError<&'r [u8],~[u8]>>) { | |
let mut ret = ~""; | |
let mut i = 0; | |
let len = input.len(); | |
while i < len { | |
if input[i] <= 0x7f { | |
ret.push_char(input[i] as char); | |
} else { | |
return (ret, Some(CodecError { | |
remaining: input.slice(i+1, input.len()), | |
problem: ~[input[i]], | |
cause: ~"invalid sequence", | |
})); | |
} | |
i += 1; | |
} | |
(ret, None) | |
} | |
pub fn flush(~self) -> (~str,Option<CodecError<(),~[u8]>>) { | |
(~"", None) | |
} | |
} | |
impl Encoding for ASCII { | |
pub fn name(&self) -> ~str { ~"ascii" } | |
pub fn encoder(&self) -> ~Encoder { ~ASCIIEncoder as ~Encoder } | |
pub fn decoder(&self) -> ~Decoder { ~ASCIIDecoder as ~Decoder } | |
pub fn preferred_replacement_seq(&self) -> ~[u8] { ~[0x3f] /* "?" */ } | |
} | |
#[cfg(test)] | |
mod tests { | |
use super::ASCII; | |
use types::*; | |
fn strip_cause<T,Remaining,Problem>(result: (T,Option<CodecError<Remaining,Problem>>)) | |
-> (T,Option<(Remaining,Problem)>) { | |
match result { | |
(processed, None) => (processed, None), | |
(processed, Some(CodecError { remaining, problem, cause: _cause })) => | |
(processed, Some((remaining, problem))) | |
} | |
} | |
macro_rules! assert_result( | |
($lhs:expr, $rhs:expr) => (assert_eq!(strip_cause($lhs), $rhs)) | |
) | |
#[test] | |
fn test_encoder() { | |
let mut e = ASCII.encoder(); | |
assert_result!(e.feed("A"), (~[0x41], None)); | |
assert_result!(e.feed("BC"), (~[0x42, 0x43], None)); | |
assert_result!(e.feed(""), (~[], None)); | |
assert_result!(e.feed("\xa0"), (~[], Some(("", ~"\xa0")))); | |
assert_result!(e.flush(), (~[], None)); | |
} | |
#[test] | |
fn test_decoder() { | |
let mut d = ASCII.decoder(); | |
assert_result!(d.feed(&[0x41]), (~"A", None)); | |
assert_result!(d.feed(&[0x42, 0x43]), (~"BC", None)); | |
assert_result!(d.feed(&[]), (~"", None)); | |
assert_result!(d.feed(&[0xa0]), (~"", Some((&[], ~[0xa0])))); | |
assert_result!(d.flush(), (~"", None)); | |
} | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment