Skip to content

Instantly share code, notes, and snippets.

@pzol
Created February 27, 2014 08:59
Show Gist options
  • Save pzol/9246690 to your computer and use it in GitHub Desktop.
Save pzol/9246690 to your computer and use it in GitHub Desktop.
#[feature(macro_rules)];
/// A single grapheme, stored as a borrowed slice of `CodePoint` records.
///
/// NOTE(review): presumably the slice is meant to hold every code point that
/// makes up one user-perceived character -- confirm once `Graphemes::next`
/// actually produces values.
struct Grapheme<'r> {
/// The code point records of this grapheme, borrowed for the lifetime `'r`.
codepoint: &'r [CodePoint]
}
/// Iterator state for walking a string slice as a sequence of `Grapheme`s.
///
/// Built from the `char` iterator of the string being walked.
struct Graphemes<'r> {
/// Underlying iterator over the `char`s of the source string.
chars: std::str::Chars<'r>
}
// Iterator over the graphemes of the wrapped string.
impl<'r> Iterator<Grapheme<'r>> for Graphemes<'r> {
    /// Returns the next grapheme, or `None` when there is nothing left.
    ///
    /// Segmentation is not implemented yet: this stub never consumes
    /// `self.chars` and always reports exhaustion.
    fn next(&mut self) -> Option<Grapheme<'r>> {
        // The return type spells out `'r` so that it is exactly the item type
        // named in the `Iterator<Grapheme<'r>>` header, rather than a
        // `Grapheme` with an unspecified lifetime.
        None
    }
}
/// Extension trait that adds Unicode-aware helpers to string-like types.
trait Unicode {
/// Returns an iterator over the graphemes of `self`; the iterator borrows
/// from `self` for the lifetime `'r`.
fn graphemes<'r>(&'r self) -> Graphemes<'r>;
}
// Makes `graphemes()` available on string slices.
//
// The impl-level lifetime is called `'a` so that it does not collide with the
// `'r` parameter that the method itself declares.
impl<'a> Unicode for &'a str {
    /// Returns an iterator over the graphemes of this string slice, built on
    /// top of the slice's `char` iterator.
    fn graphemes<'r>(&'r self) -> Graphemes<'r> {
        Graphemes { chars: self.chars() }
    }
}
/// One record of per-character Unicode properties.
///
/// Most properties are kept as raw `&'static str` values; only the character
/// itself, its general category and its case mappings are typed.
pub struct CodePoint {
/// The character this record describes; `codepoint` uses it as the lookup key.
code: char,
/// These names match exactly the names published in the code charts of the Unicode Standard
name: &'static str,
/// This is a useful breakdown into various character types
/// which can be used as a default categorization in implementations.
general_category: GeneralCategory,
/// http://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values
canonical_combining_class: &'static str,
/// http://www.unicode.org/reports/tr44/#Bidi_Class_Values
bidi_class: &'static str,
/// http://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings
decomposition_type: &'static str,
/// NOTE(review): presumably the mapping that goes with `decomposition_type`
/// (same reference as above); empty in the only table entry -- confirm format.
decomposition_mapping: &'static str,
/// NOTE(review): numeric type as raw text; empty in the only table entry --
/// confirm the expected values.
numeric_type: &'static str,
/// NOTE(review): numeric value as raw text; empty in the only table entry --
/// confirm the expected format.
numeric_value: &'static str,
/// Raw text flag; the only table entry uses "N".
/// NOTE(review): presumably "Y"/"N" for mirrored in bidirectional text -- confirm.
bidi_mirrored: &'static str,
// old_name: ~str,
// comment: ~str,
/// Uppercase mapping of `code`.
upper: char,
/// Lowercase mapping of `code`.
lower: char,
/// Titlecase mapping of `code`.
title: char
}
// Builds a `CodePoint` literal from thirteen `;`-separated expressions.
//
// The arguments are positional and map one-to-one, in order, onto the fields
// of `CodePoint`: code, name, general category, canonical combining class,
// bidi class, decomposition type, decomposition mapping, numeric type,
// numeric value, bidi mirrored, upper, lower, title.
macro_rules! cp(
($code:expr;
$name:expr;
$general_category:expr;
$canonical_combining_class:expr;
$bidi_class:expr;
$decomposition_type:expr;
$decomposition_mapping:expr;
$numeric_type:expr;
$numeric_value:expr;
$bidi_mirrored:expr;
$upper: expr;
$lower: expr;
$title: expr
) => (
CodePoint {
code: $code,
name: $name,
general_category: $general_category,
canonical_combining_class: $canonical_combining_class,
bidi_class: $bidi_class,
decomposition_type: $decomposition_type,
decomposition_mapping: $decomposition_mapping,
numeric_type: $numeric_type,
numeric_value: $numeric_value,
bidi_mirrored: $bidi_mirrored,
upper: $upper,
lower: $lower,
title: $title
}
);
)
impl CodePoint {
fn is_upper(&self) -> bool {
self.general_category == Lu
}
fn is_lower(&self) -> bool {
self.general_category == Ll
}
// fn is_alphabetic(&self) -> bool;
// fn is_XID_start(&self) -> bool;
// fn is_XID_continue(&self) -> bool;
// fn is_whitespace(&self) -> bool;
// fn is_alphanumeric(&self) -> bool;
// fn is_control(&self) -> bool;
// fn is_digit(&self) -> bool;
// fn to_lower(&self) -> char;
// fn to_upper(&self) -> char;
// fn escape_unicode(&self, f: |char|);
// fn escape_default(&self, f: |char|);
}
/// Looks up the `CodePoint` record for `c` in `unicode_data`.
///
/// Returns `Some(record)` when the table contains an entry whose `code`
/// equals `c`, and `None` otherwise. The lookup is a binary search, so
/// `unicode_data` must be sorted by ascending `code`.
fn codepoint(c: char) -> Option<CodePoint> {
    use std::cmp::{Equal, Less, Greater};
    unicode_data.bsearch(|entry| {
        // The comparator must report how the *probed entry* orders relative
        // to the target: `Less` means the entry lies before the target, which
        // sends the search towards higher indices. Comparing the other way
        // round would walk away from the match on any multi-entry table.
        if entry.code == c { Equal }
        else if entry.code < c { Less }
        else { Greater }
    }).map(|index| unicode_data[index])
}
// Table of known code points, one `cp!` record per character.
//
// NOTE(review): the records appear to mirror the semicolon-separated rows of
// the Unicode character database, e.g. this row fragment:
//   ;"Lu";"0";"L";"";"";"";"";"N";"";"";"";"0061";
//
// `codepoint` binary-searches this table, so entries must stay sorted by
// ascending `code`.
pub static unicode_data : &'static [CodePoint] = &[
    cp!('\u0041';
        "LATIN CAPITAL LETTER A";
        Lu;
        "0";
        "L";
        "";
        "";
        "";
        "";
        "N";
        '\u0041';
        '\u0061';
        '\u0041')
];
/// This is a useful breakdown into various character types
/// which can be used as a default categorization in implementations
///
/// Only the letter categories are listed so far.
///
/// Reference: http://www.unicode.org/reports/tr44/#General_Category_Values
#[deriving(Eq)]
pub enum GeneralCategory {
Lu, // Uppercase_Letter: an uppercase letter
Ll, // Lowercase_Letter: a lowercase letter
Lt, // Titlecase_Letter: a digraphic character, with first part uppercase
LC, // Cased_Letter: Lu | Ll | Lt
Lm, // Modifier_Letter: a modifier letter
Lo, // Other_Letter: other letters, including syllables and ideographs
L // Letter: Lu | Ll | Lt | Lm | Lo
}
#[cfg(test)]
mod tests {
    use super::codepoint;

    // Looks up 'A' and checks its name, case mappings and case predicates.
    #[test]
    fn test_codepoint(){
        let record = codepoint('A').unwrap();
        debug!("{:?}", record);

        assert_eq!(record.name, "LATIN CAPITAL LETTER A");
        assert_eq!(record.lower, 'a');
        assert_eq!(record.title, 'A');

        assert!(record.is_upper());
        assert!(!record.is_lower());
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment