Skip to content

Instantly share code, notes, and snippets.

@lifthrasiir
Created July 27, 2013 20:01
Show Gist options
  • Save lifthrasiir/6096084 to your computer and use it in GitHub Desktop.
Save lifthrasiir/6096084 to your computer and use it in GitHub Desktop.
Character encoding interface for Rust (proof-of-concept)
mod util {
use std::str::CharRange;
/// Iterator over the characters of a string slice that also reports where
/// each character starts and ends.
///
/// Every item is a `(start, next, ch)` triple: `start` is the index at which
/// `ch` begins and `next` is the index just past it, both taken from
/// `char_range_at`.
pub struct StrCharIndexIterator<'self> {
    // Index at which the next character to yield begins.
    priv index: uint,
    // The string slice being walked.
    priv string: &'self str,
}

impl<'self> Iterator<(uint, uint, char)> for StrCharIndexIterator<'self> {
    /// Yields the next `(start, next, ch)` triple, or `None` once the stored
    /// index has reached the length of the string.
    #[inline]
    fn next(&mut self) -> Option<(uint, uint, char)> {
        let start = self.index;
        if start >= self.string.len() {
            return None;
        }
        let CharRange {ch, next} = self.string.char_range_at(start);
        // Remember where the following character begins for the next call.
        self.index = next;
        Some((start, next, ch))
    }
}
/// Extension trait that adds `index_iter` to string slices.
pub trait StrCharIndex<'self> {
    /// Returns an iterator of `(start, next, ch)` triples over the string.
    fn index_iter(&self) -> StrCharIndexIterator<'self>;
}

impl<'self> StrCharIndex<'self> for &'self str {
    /// Builds a `StrCharIndexIterator` positioned at index 0 of this slice.
    fn index_iter(&self) -> StrCharIndexIterator<'self> {
        let whole: &'self str = *self;
        StrCharIndexIterator { string: whole, index: 0 }
    }
}
}
pub mod types {
/// Error report returned by `Encoder`/`Decoder` methods next to their output.
///
/// `Remaining` is the type of the unprocessed input (`()` for `flush`, which
/// has no input) and `Problem` is the type of the offending piece of input.
pub struct CodecError<Remaining,Problem> {
// Input that follows the problem and has not been processed yet;
// `EncodingUtil::encode`/`decode` feed this back in to continue.
remaining: Remaining,
// The piece of input that could not be converted; it is handed to the trap.
problem: Problem,
// Human-readable description of the error (e.g. "invalid sequence").
cause: ~str,
}
/// Incremental encoder: turns string input into bytes, one `feed` at a time.
pub trait Encoder {
/// Returns a boxed `Encoding` associated with this encoder.
pub fn encoding(&self) -> ~Encoding;
/// Encodes `input` and returns the bytes produced together with an optional
/// `CodecError`. In the error, `problem` is the input that could not be
/// encoded and `remaining` is the part of `input` still to be fed.
pub fn feed<'r>(&mut self, input: &'r str) -> (~[u8],Option<CodecError<&'r str,~str>>);
/// Consumes the encoder and returns a last chunk of output together with an
/// optional error. There is no input here, so `remaining` is `()`.
pub fn flush(~self) -> (~[u8],Option<CodecError<(),~str>>);
}
/// Incremental decoder: turns byte input into a string, one `feed` at a time.
pub trait Decoder {
/// Returns a boxed `Encoding` associated with this decoder.
pub fn encoding(&self) -> ~Encoding;
/// Decodes `input` and returns the text produced together with an optional
/// `CodecError`. In the error, `problem` holds the bytes that could not be
/// decoded and `remaining` is the part of `input` still to be fed.
pub fn feed<'r>(&mut self, input: &'r [u8]) -> (~str,Option<CodecError<&'r [u8],~[u8]>>);
/// Consumes the decoder and returns a last chunk of output together with an
/// optional error. There is no input here, so `remaining` is `()`.
pub fn flush(~self) -> (~str,Option<CodecError<(),~[u8]>>);
}
/// A character encoding: a name plus factories for its encoder and decoder.
pub trait Encoding {
/// Returns the name of the encoding (the ASCII codec returns `~"ascii"`).
pub fn name(&self) -> ~str;
/// Returns a new boxed encoder for this encoding.
pub fn encoder(&self) -> ~Encoder;
/// Returns a new boxed decoder for this encoding.
pub fn decoder(&self) -> ~Decoder;
/// Returns the byte sequence the `Replace` trap substitutes for input that
/// cannot be encoded (the ASCII codec returns `?`).
pub fn preferred_replacement_seq(&self) -> ~[u8];
}
/// One-shot convenience methods available on every `Encoding`.
pub trait EncodingUtil<T:Encoding> {
/// Encodes all of `input`, consulting `trap` for each reported problem.
/// Returns the encoded bytes, or the error's cause if the trap gives up.
pub fn encode<Trap:EncoderTrap<T>>(&self, input: &str, trap: Trap) -> Result<~[u8],~str>;
/// Decodes all of `input`, consulting `trap` for each reported problem.
/// Returns the decoded text, or the error's cause if the trap gives up.
pub fn decode<Trap:DecoderTrap<T>>(&self, input: &[u8], trap: Trap) -> Result<~str,~str>;
}
impl<T:Encoding> EncodingUtil<T> for T {
#[inline]
pub fn encode<Trap:EncoderTrap<T>>(&self, input: &str, mut trap: Trap) -> Result<~[u8],~str> {
let mut encoder = self.encoder();
let mut remaining = input;
let mut ret = ~[];
loop {
let (encoded, err) = encoder.feed(remaining);
ret.push_all(encoded);
match err {
Some(err) => {
match trap.encoder_trap(self, err.problem) {
Some(s) => { ret.push_all(s); }
None => { return Err(err.cause); }
}
remaining = err.remaining;
}
None => break
}
}
let (encoded, err) = encoder.flush();
ret.push_all(encoded);
match err {
Some(err) => {
match trap.encoder_trap(self, err.problem) {
Some(s) => { ret.push_all(s); }
None => { return Err(err.cause); }
}
}
None => {}
}
Ok(ret)
}
pub fn decode<Trap:DecoderTrap<T>>(&self, input: &[u8], mut trap: Trap) -> Result<~str,~str> {
let mut decoder = self.decoder();
let mut remaining = input;
let mut ret = ~"";
loop {
let (decoded, err) = decoder.feed(remaining);
ret.push_str(decoded);
match err {
Some(err) => {
match trap.decoder_trap(self, err.problem) {
Some(s) => { ret.push_str(s); }
None => { return Err(err.cause); }
}
remaining = err.remaining;
}
None => break
}
}
let (decoded, err) = decoder.flush();
ret.push_str(decoded);
match err {
Some(err) => {
match trap.decoder_trap(self, err.problem) {
Some(s) => { ret.push_str(s); }
None => { return Err(err.cause); }
}
}
None => {}
}
Ok(ret)
}
}
/// Recovery policy consulted when an encoder reports input it cannot encode.
pub trait EncoderTrap<T:Encoding> {
/// Receives the encoding in use and the problematic input. Returning
/// `Some(bytes)` makes `EncodingUtil::encode` emit `bytes` and continue;
/// returning `None` makes it stop with an error.
pub fn encoder_trap(&mut self, encoding: &T, input: &str) -> Option<~[u8]>;
}
/// Recovery policy consulted when a decoder reports bytes it cannot decode.
pub trait DecoderTrap<T:Encoding> {
/// Receives the encoding in use and the problematic bytes. Returning
/// `Some(text)` makes `EncodingUtil::decode` emit `text` and continue;
/// returning `None` makes it stop with an error.
pub fn decoder_trap(&mut self, encoding: &T, input: &[u8]) -> Option<~str>;
}
/// Lets a borrowed closure of the matching signature serve as an encoder trap.
impl<'self,T:Encoding> EncoderTrap<T> for &'self fn(&T,&str) -> Option<~[u8]> {
pub fn encoder_trap(&mut self, encoding: &T, input: &str) -> Option<~[u8]> {
// Forward both arguments to the closure and return its answer unchanged.
(*self)(encoding, input)
}
}
/// Lets a borrowed closure of the matching signature serve as a decoder trap.
impl<'self,T:Encoding> DecoderTrap<T> for &'self fn(&T,&[u8]) -> Option<~str> {
pub fn decoder_trap(&mut self, encoding: &T, input: &[u8]) -> Option<~str> {
// Forward both arguments to the closure and return its answer unchanged.
(*self)(encoding, input)
}
}
}
pub mod trap {
use types::*;
/// Trap that never recovers: both trap methods return `None`, which makes
/// `EncodingUtil::encode`/`decode` return the error's cause.
pub struct Strict;
impl<T:Encoding> EncoderTrap<T> for Strict {
#[inline]
pub fn encoder_trap(&mut self, _encoding: &T, _input: &str) -> Option<~[u8]> {
// No substitute is offered for input that cannot be encoded.
None
}
}
impl<T:Encoding> DecoderTrap<T> for Strict {
#[inline]
pub fn decoder_trap(&mut self, _encoding: &T, _input: &[u8]) -> Option<~str> {
// No substitute is offered for bytes that cannot be decoded.
None
}
}
/// Trap that substitutes a replacement for every problem and lets the
/// conversion continue.
pub struct Replace;
impl<T:Encoding> EncoderTrap<T> for Replace {
#[inline]
pub fn encoder_trap(&mut self, encoding: &T, _input: &str) -> Option<~[u8]> {
// Use whatever replacement bytes the encoding itself prefers.
Some(encoding.preferred_replacement_seq())
}
}
impl<T:Encoding> DecoderTrap<T> for Replace {
#[inline]
pub fn decoder_trap(&mut self, _encoding: &T, _input: &[u8]) -> Option<~str> {
// Undecodable bytes become U+FFFD regardless of the encoding.
Some(~"\ufffd")
}
}
/// Trap that drops every problem: it substitutes empty output so the
/// conversion continues as if the problematic input were absent.
pub struct Ignore;
impl<T:Encoding> EncoderTrap<T> for Ignore {
#[inline]
pub fn encoder_trap(&mut self, _encoding: &T, _input: &str) -> Option<~[u8]> {
// Contribute no bytes for the unencodable input.
Some(~[])
}
}
impl<T:Encoding> DecoderTrap<T> for Ignore {
#[inline]
pub fn decoder_trap(&mut self, _encoding: &T, _input: &[u8]) -> Option<~str> {
// Contribute no text for the undecodable bytes.
Some(~"")
}
}
}
pub mod codec {
pub mod ascii {
use std::str;
use util::StrCharIndex;
use types::*;
/// The ASCII encoding; its `Encoding` impl hands out `ASCIIEncoder` and
/// `ASCIIDecoder`.
pub struct ASCII;
/// Encoder for `ASCII`; it holds no data.
pub struct ASCIIEncoder;

impl Encoder for ASCIIEncoder {
    /// Returns `ASCII` boxed as an `Encoding`.
    pub fn encoding(&self) -> ~Encoding { ~ASCII as ~Encoding }

    /// Encodes characters up to U+007F as single bytes.
    ///
    /// Stops at the first character above U+007F and reports it as the
    /// `problem`, with `remaining` set to the input that follows it. The bytes
    /// encoded before that point are still returned.
    pub fn feed<'r>(&mut self, input: &'r str) -> (~[u8],Option<CodecError<&'r str,~str>>) {
        let mut encoded = ~[];
        let mut chars = input.index_iter();
        loop {
            match chars.next() {
                None => break,
                Some((_, after, ch)) => {
                    if ch > '\u007f' {
                        // `after` is the index just past `ch`, so the caller
                        // resumes with the input following the problem.
                        return (encoded, Some(CodecError {
                            remaining: input.slice_from(after),
                            problem: str::from_char(ch),
                            cause: ~"unrepresentable character",
                        }));
                    }
                    encoded.push(ch as u8);
                }
            }
        }
        (encoded, None)
    }

    /// Consumes the encoder; there is never anything left to emit.
    pub fn flush(~self) -> (~[u8],Option<CodecError<(),~str>>) {
        (~[], None)
    }
}
/// Decoder for `ASCII`; it holds no data.
pub struct ASCIIDecoder;

impl Decoder for ASCIIDecoder {
    /// Returns `ASCII` boxed as an `Encoding`.
    pub fn encoding(&self) -> ~Encoding { ~ASCII as ~Encoding }

    /// Decodes bytes up to 0x7f as the characters with the same value.
    ///
    /// Stops at the first byte above 0x7f and reports it as the `problem`,
    /// with `remaining` set to the bytes that follow it. The text decoded
    /// before that point is still returned.
    pub fn feed<'r>(&mut self, input: &'r [u8]) -> (~str,Option<CodecError<&'r [u8],~[u8]>>) {
        let total = input.len();
        let mut decoded = ~"";
        let mut pos = 0;
        while pos < total {
            let byte = input[pos];
            if byte > 0x7f {
                // Skip the offending byte so the caller resumes right after it.
                return (decoded, Some(CodecError {
                    remaining: input.slice(pos + 1, total),
                    problem: ~[byte],
                    cause: ~"invalid sequence",
                }));
            }
            decoded.push_char(byte as char);
            pos += 1;
        }
        (decoded, None)
    }

    /// Consumes the decoder; there is never anything left to emit.
    pub fn flush(~self) -> (~str,Option<CodecError<(),~[u8]>>) {
        (~"", None)
    }
}
/// Describes ASCII to the generic encoding machinery.
impl Encoding for ASCII {
/// Always `~"ascii"`.
pub fn name(&self) -> ~str { ~"ascii" }
/// Returns a new boxed `ASCIIEncoder`.
pub fn encoder(&self) -> ~Encoder { ~ASCIIEncoder as ~Encoder }
/// Returns a new boxed `ASCIIDecoder`.
pub fn decoder(&self) -> ~Decoder { ~ASCIIDecoder as ~Decoder }
/// Returns the single byte 0x3f, used in place of unencodable characters.
pub fn preferred_replacement_seq(&self) -> ~[u8] { ~[0x3f] /* "?" */ }
}
#[cfg(test)]
mod tests {
use super::ASCII;
use types::*;
fn strip_cause<T,Remaining,Problem>(result: (T,Option<CodecError<Remaining,Problem>>))
-> (T,Option<(Remaining,Problem)>) {
match result {
(processed, None) => (processed, None),
(processed, Some(CodecError { remaining, problem, cause: _cause })) =>
(processed, Some((remaining, problem)))
}
}
macro_rules! assert_result(
($lhs:expr, $rhs:expr) => (assert_eq!(strip_cause($lhs), $rhs))
)
#[test]
fn test_encoder() {
let mut e = ASCII.encoder();
assert_result!(e.feed("A"), (~[0x41], None));
assert_result!(e.feed("BC"), (~[0x42, 0x43], None));
assert_result!(e.feed(""), (~[], None));
assert_result!(e.feed("\xa0"), (~[], Some(("", ~"\xa0"))));
assert_result!(e.flush(), (~[], None));
}
#[test]
fn test_decoder() {
let mut d = ASCII.decoder();
assert_result!(d.feed(&[0x41]), (~"A", None));
assert_result!(d.feed(&[0x42, 0x43]), (~"BC", None));
assert_result!(d.feed(&[]), (~"", None));
assert_result!(d.feed(&[0xa0]), (~"", Some((&[], ~[0xa0]))));
assert_result!(d.flush(), (~"", None));
}
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment