Last active
August 29, 2015 14:08
-
-
Save japaric/6c5544455d3e8a44fe19 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#![feature(macro_rules)] | |
#![no_implicit_prelude] | |
use std::iter::Iterator; | |
use std::option::{None, Option, Some}; | |
use std::slice::ImmutableSlice; | |
use std::{iter, slice}; | |
/// Byte-slice newtype intended to hold UTF-8 text: a tuple struct whose only
/// field is an unsized `[u8]`.
///
/// Because the wrapped field is a bare `[u8]`, this type is only handled
/// behind references (`&Str`). `Str::from_utf8` hands out a `&Str` after
/// validating the bytes; `raw::from_utf8` does so without any validation.
pub struct Str([u8]); | |
/// External iterator for a string's bytes.
/// Use with the `std::iter` module.
///
/// Alias for an `iter::Map` adaptor layered over `slice::Items<'a, u8>`, with
/// `&'a u8` as the input item type and `u8` as the output item type.
/// NOTE(review): nothing in the visible part of this file constructs or uses
/// this alias.
pub type Bytes<'a> = iter::Map<'a, &'a u8, u8, slice::Items<'a, u8>>; | |
// Associated functions
impl Str { | |
/// Converts a byte slice to a `&Str` without performing any allocations.
///
/// The slice is first checked with `is_utf8`. If it passes, the same memory
/// is reinterpreted in place (through `raw::from_utf8`) and returned as a
/// `&'a Str` instead of a `&'a [u8]`.
///
/// Returns `None` if the slice is not valid UTF-8.
pub fn from_utf8<'a>(v: &'a [u8]) -> Option<&'a Str> { | |
if is_utf8(v) { | |
// `v` was validated on the line above, which is the precondition the
// unchecked conversion relies on.
Some(unsafe { raw::from_utf8(v) }) | |
} else { None } | |
} | |
} | |
/// Unchecked conversions into `Str`.
mod raw { | |
use super::Str; | |
use std::mem; | |
/// Converts a slice of bytes to a `&Str` without checking
/// that the bytes are valid UTF-8.
///
/// # Safety
///
/// No validation is performed here. The caller is responsible for passing
/// bytes that are valid UTF-8 (for example by calling `is_utf8` first, as
/// `Str::from_utf8` does).
pub unsafe fn from_utf8<'a>(v: &'a [u8]) -> &'a Str { | |
// Reinterprets the `&'a [u8]` as a `&'a Str`; no bytes are copied or
// inspected.
mem::transmute(v) | |
} | |
} | |
/// Reproduction driver: builds an `Option<&Str>` from the bytes of a string
/// literal and calls `is_some()` on it. The original author marks that call as
/// the trigger for the internal compiler error (ICE).
fn main() { | |
let msg = "Hello World!"; | |
let bytes = { | |
// The crate is built with `#![no_implicit_prelude]`, so `StrSlice` is
// imported here, scoped to this block only — presumably to make
// `as_bytes` callable on `msg`.
use std::str::StrSlice; | |
msg.as_bytes() | |
}; | |
let str = Str::from_utf8(bytes); | |
// ICE trigger
let _ = str.is_some(); | |
} | |
// Copied verbatim from `core::str`
/// Mask of the value bits of a continuation byte (the low six bits).
const CONT_MASK: u8 = 0b0011_1111u8; | |
/// Value of the tag bits (tag mask is !CONT_MASK) of a continuation byte.
/// A byte `b` is treated as a continuation byte when
/// `b & !CONT_MASK == TAG_CONT_U8`, i.e. its top two bits are `10`.
const TAG_CONT_U8: u8 = 0b1000_0000u8; | |
/// Lookup table from a byte value to the length, in bytes, of the UTF-8
/// sequence that starts with that byte.
///
/// Entries are 1 for 0x00-0x7F, 2 for 0xC2-0xDF, 3 for 0xE0-0xEF and 4 for
/// 0xF0-0xF4. Every other byte (0x80-0xC1 and 0xF5-0xFF) maps to 0, which the
/// validator in this file treats as "cannot start a sequence". The trailing
/// comment on a row gives the index of the last entry written so far.
// https://tools.ietf.org/html/rfc3629
static UTF8_CHAR_WIDTH: [u8, ..256] = [ | |
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, | |
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, | |
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, | |
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, | |
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | |
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | |
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF
0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2, | |
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF
4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
]; | |
/// Given a first byte, determine how many bytes are in this UTF-8 character.
///
/// This is a direct lookup of `b` in `UTF8_CHAR_WIDTH`, widened to `uint`.
/// The result is whatever the table holds for `b`, including 0 for bytes the
/// table marks as unable to start a sequence.
#[inline] | |
pub fn utf8_char_width(b: u8) -> uint { | |
return UTF8_CHAR_WIDTH[b as uint] as uint; | |
} | |
/// Determines if a slice of bytes contains valid UTF-8.
///
/// Returns the result of `run_utf8_validation_iterator` run over a fresh
/// iterator on `v`; the iterator's final position is discarded.
pub fn is_utf8(v: &[u8]) -> bool { | |
run_utf8_validation_iterator(&mut v.iter()) | |
} | |
/// Walk through `iter` checking that it's a valid UTF-8 sequence,
/// returning `true` in that case, or, if it is invalid, `false` with
/// `iter` reset such that it is pointing at the first byte in the
/// invalid sequence.
///
/// Each pass of the loop consumes one encoded character: one byte when the
/// first byte is below 128, otherwise as many bytes as `utf8_char_width`
/// reports for the first byte.
#[inline(always)] | |
fn run_utf8_validation_iterator(iter: &mut slice::Items<u8>) -> bool { | |
loop { | |
// save the current thing we're pointing at.
let old = *iter; | |
// restore the iterator we had at the start of this codepoint,
// then report failure to the caller.
macro_rules! err ( () => { {*iter = old; return false} }); | |
// yield the next byte, or bail out through `err!` if the input ends.
macro_rules! next ( () => { | |
match iter.next() { | |
Some(a) => *a, | |
// we needed data, but there was none: error!
None => err!() | |
} | |
}); | |
let first = match iter.next() { | |
Some(&b) => b, | |
// we're at the end of the iterator and a codepoint
// boundary at the same time, so this string is valid.
None => return true | |
}; | |
// ASCII characters are always valid, so only large
// bytes need more examination.
if first >= 128 { | |
let w = utf8_char_width(first); | |
// the second byte is read before `w` is inspected, so a non-ASCII
// byte at the very end of the input fails here via `next!`.
let second = next!(); | |
// 2-byte encoding is for codepoints \u0080 to \u07ff
// first C2 80 last DF BF
// 3-byte encoding is for codepoints \u0800 to \uffff
// first E0 A0 80 last EF BF BF
// excluding surrogates codepoints \ud800 to \udfff
// ED A0 80 to ED BF BF
// 4-byte encoding is for codepoints \u10000 to \u10ffff
// first F0 90 80 80 last F4 8F BF BF
//
// Use the UTF-8 syntax from the RFC
//
// https://tools.ietf.org/html/rfc3629
// UTF8-1 = %x00-7F
// UTF8-2 = %xC2-DF UTF8-tail
// UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
// %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
// %xF4 %x80-8F 2( UTF8-tail )
match w { | |
// the second byte must carry the continuation tag bits.
2 => if second & !CONT_MASK != TAG_CONT_U8 {err!()}, | |
3 => { | |
// the second byte is range-checked per first byte; the third byte
// is reduced to its tag bits and must equal TAG_CONT_U8.
match (first, second, next!() & !CONT_MASK) { | |
(0xE0 , 0xA0 ... 0xBF, TAG_CONT_U8) | | |
(0xE1 ... 0xEC, 0x80 ... 0xBF, TAG_CONT_U8) | | |
(0xED , 0x80 ... 0x9F, TAG_CONT_U8) | | |
(0xEE ... 0xEF, 0x80 ... 0xBF, TAG_CONT_U8) => {} | |
_ => err!() | |
} | |
} | |
4 => { | |
// same scheme as the 3-byte case, with two trailing bytes reduced
// to their tag bits.
match (first, second, next!() & !CONT_MASK, next!() & !CONT_MASK) { | |
(0xF0 , 0x90 ... 0xBF, TAG_CONT_U8, TAG_CONT_U8) | | |
(0xF1 ... 0xF3, 0x80 ... 0xBF, TAG_CONT_U8, TAG_CONT_U8) | | |
(0xF4 , 0x80 ... 0x8F, TAG_CONT_U8, TAG_CONT_U8) => {} | |
_ => err!() | |
} | |
} | |
// any other width (the table yields 0 for bytes that cannot start
// a sequence) is an error.
_ => err!() | |
} | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment