Created
February 27, 2014 08:59
-
-
Save pzol/9246690 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// NOTE: `macro_rules!` has been stable since Rust 1.0, so the historical
// `#[feature(macro_rules)]` gate that used to open this file is not needed.

/// A borrowed run of [`CodePoint`] records.
///
/// NOTE(review): presumably one grapheme cluster (user-perceived character);
/// nothing in this file constructs a `Grapheme` yet, so confirm the intent.
struct Grapheme<'r> {
    codepoint: &'r [CodePoint],
}

/// Iterator state for walking a string grapheme by grapheme.
///
/// Wraps the `char` iterator of the string it was created from.
struct Graphemes<'r> {
    chars: std::str::Chars<'r>,
}

impl<'r> Iterator for Graphemes<'r> {
    type Item = Grapheme<'r>;

    /// Not implemented yet: always returns `None`, so the iterator is empty
    /// and the wrapped `chars` iterator is never advanced.
    fn next(&mut self) -> Option<Grapheme<'r>> {
        None
    }
}

/// Unicode-aware extension methods for string-like types.
trait Unicode {
    /// Returns an iterator over the graphemes of `self`.
    fn graphemes<'r>(&'r self) -> Graphemes<'r>;
}

// The impl lifetime is named `'s` so that it does not clash with the method's
// own `'r` parameter (shadowing a lifetime name is a compile error).
impl<'s> Unicode for &'s str {
    /// Builds a [`Graphemes`] over the characters of this string slice.
    fn graphemes<'r>(&'r self) -> Graphemes<'r> {
        Graphemes { chars: self.chars() }
    }
}

/// One record of the code point table [`unicode_data`].
///
/// The record consists only of `char`s, `&'static str`s and a field-less enum,
/// so it is `Copy` and lookups can hand out records by value.
#[derive(Debug, Clone, Copy)]
pub struct CodePoint {
    /// The character this record describes; the table is searched by this key.
    code: char,
    /// These names match exactly the names published in the code charts of the Unicode Standard
    name: &'static str,
    /// This is a useful breakdown into various character types
    /// which can be used as a default categorization in implementations.
    general_category: GeneralCategory,
    /// http://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values
    canonical_combining_class: &'static str,
    /// http://www.unicode.org/reports/tr44/#Bidi_Class_Values
    bidi_class: &'static str,
    /// http://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings
    decomposition_type: &'static str,
    decomposition_mapping: &'static str,
    numeric_type: &'static str,
    numeric_value: &'static str,
    bidi_mirrored: &'static str,
    // Not modelled yet (both would be strings): old_name, comment.
    upper: char,
    lower: char,
    title: char,
}

/// Builds a [`CodePoint`] from thirteen `;`-separated expressions.
///
/// Argument order: code; name; general category; canonical combining class;
/// bidi class; decomposition type; decomposition mapping; numeric type;
/// numeric value; bidi mirrored; upper; lower; title.
macro_rules! cp {
    ($code:expr;
     $name:expr;
     $general_category:expr;
     $canonical_combining_class:expr;
     $bidi_class:expr;
     $decomposition_type:expr;
     $decomposition_mapping:expr;
     $numeric_type:expr;
     $numeric_value:expr;
     $bidi_mirrored:expr;
     $upper:expr;
     $lower:expr;
     $title:expr
    ) => {
        CodePoint {
            code: $code,
            name: $name,
            general_category: $general_category,
            canonical_combining_class: $canonical_combining_class,
            bidi_class: $bidi_class,
            decomposition_type: $decomposition_type,
            decomposition_mapping: $decomposition_mapping,
            numeric_type: $numeric_type,
            numeric_value: $numeric_value,
            bidi_mirrored: $bidi_mirrored,
            upper: $upper,
            lower: $lower,
            title: $title,
        }
    };
}

impl CodePoint {
    /// Returns `true` when the general category is `Lu`.
    fn is_upper(&self) -> bool {
        self.general_category == GeneralCategory::Lu
    }

    /// Returns `true` when the general category is `Ll`.
    fn is_lower(&self) -> bool {
        self.general_category == GeneralCategory::Ll
    }

    // Planned but not implemented yet (signatures as originally sketched):
    // fn is_alphabetic(&self) -> bool;
    // fn is_XID_start(&self) -> bool;
    // fn is_XID_continue(&self) -> bool;
    // fn is_whitespace(&self) -> bool;
    // fn is_alphanumeric(&self) -> bool;
    // fn is_control(&self) -> bool;
    // fn is_digit(&self) -> bool;
    // fn to_lower(&self) -> char;
    // fn to_upper(&self) -> char;
    // fn escape_unicode(&self, f: |char|);
    // fn escape_default(&self, f: |char|);
}

/// Binary-searches `table` for the record whose `code` equals `c`.
///
/// `table` must be sorted by ascending `code`. The comparator reports how the
/// probed *entry* orders relative to the *target*, which is what
/// `binary_search_by` requires; comparing the other way round sends the
/// search into the wrong half of any table with more than one entry.
fn search_table(table: &[CodePoint], c: char) -> Option<&CodePoint> {
    table
        .binary_search_by(|entry| entry.code.cmp(&c))
        .ok()
        .and_then(|index| table.get(index))
}

/// Looks up `c` in [`unicode_data`].
///
/// Returns a copy of the matching record, or `None` when the table has no
/// entry for `c`.
fn codepoint(c: char) -> Option<CodePoint> {
    search_table(unicode_data, c).copied()
}

// Row fragment kept from the original author; it looks like the trailing
// columns of a UnicodeData.txt-style record (TODO confirm):
// ;"Lu";"0";"L";"";"";"";"";"N";"";"";"";"0061";

/// The code point table searched by [`codepoint`].
///
/// Entries must stay sorted by ascending `code`, because the lookup is a
/// binary search.
// The lower-case name is kept so existing references to `unicode_data` still resolve.
#[allow(non_upper_case_globals)]
pub static unicode_data: &[CodePoint] = &[
    cp!('\u{0041}'; "LATIN CAPITAL LETTER A"; GeneralCategory::Lu; "0"; "L"; ""; ""; ""; ""; "N"; '\u{0041}'; '\u{0061}'; '\u{0041}'),
];

/// This is a useful breakdown into various character types
/// which can be used as a default categorization in implementations
///
/// Only the letter categories are listed so far.
///
/// Reference: http://www.unicode.org/reports/tr44/#General_Category_Values
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GeneralCategory {
    /// Uppercase_Letter: an uppercase letter
    Lu,
    /// Lowercase_Letter: a lowercase letter
    Ll,
    /// Titlecase_Letter: a digraphic character, with first part uppercase
    Lt,
    /// Cased_Letter: Lu | Ll | Lt (per the reference above)
    LC,
    /// Modifier_Letter: a modifier letter (per the reference above)
    Lm,
    /// Other_Letter: other letters, including syllables and ideographs (per the reference above)
    Lo,
    /// Letter: Lu | Ll | Lt | Lm | Lo (per the reference above)
    L,
}

#[cfg(test)]
mod tests {
    use super::codepoint;

    #[test]
    fn test_codepoint() {
        let capital_a = codepoint('A').unwrap();
        // Only shown when the test fails or is run with `--nocapture`.
        println!("{:?}", capital_a);
        assert_eq!(capital_a.name, "LATIN CAPITAL LETTER A");
        assert_eq!(capital_a.lower, 'a');
        assert_eq!(capital_a.title, 'A');
        assert!(capital_a.is_upper());
        assert!(!capital_a.is_lower());
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment