pzol · February 27, 2014 08:59
diff --git a/unicode_graphemes.rs b/unicode_graphemes.rs
 #[feature(macro_rules)];

 /// A single grapheme, held as a borrowed slice of `CodePoint` records.
 ///
 /// The slice borrows for `'r`; nothing in this file constructs a `Grapheme` yet.
 struct Grapheme<'r> {
  /// The code point records this grapheme refers to.
  codepoint: &'r [CodePoint]
 }

 /// Iterator state returned by `Unicode::graphemes`.
 ///
 /// Wraps the `Chars` iterator of the string being traversed.
 struct Graphemes<'r> {
  /// Character iterator over the source string.
  chars: std::str::Chars<'r>
 }

 // Iterator implementation for `Graphemes`.
 // Not implemented yet: `next` unconditionally returns `None`, so iterating
 // yields nothing and the wrapped `chars` iterator is never advanced.
 // NOTE(review): the return type is spelled `Option<Grapheme>` without the `'r`
 // lifetime named in `Iterator<Grapheme<'r>>` -- confirm this is intended.
 impl<'r> Iterator<Grapheme<'r>> for Graphemes<'r> {
  /// Placeholder: always reports the end of iteration.
  fn next(&mut self) -> Option<Grapheme> {
    None
  }
 }

 /// Extension trait giving a value a `graphemes()` iterator.
 trait Unicode {
  /// Returns a `Graphemes` iterator that borrows `self` for `'r`.
  fn graphemes<'r>(&'r self) -> Graphemes<'r>;
 }

 // `Unicode` for string slices: the grapheme iterator is seeded with the
 // slice's own `chars()` iterator.
 // NOTE(review): the method-level `'r` shadows the impl-level `'r` -- confirm
 // that is intended.
 impl<'r> Unicode for &'r str {
  /// Wraps `self.chars()` in a `Graphemes` value.
  fn graphemes<'r>(&'r self) -> Graphemes<'r> {
    let char_iter = self.chars();
    Graphemes { chars: char_iter }
  }
 }

 /// One record of Unicode character data, as stored in `unicode_data`.
 ///
 /// Most properties are kept as raw `&'static str` values; only the general
 /// category and the case mappings have dedicated types.
 pub struct CodePoint {
  /// The character this record describes; `codepoint` searches the table on this field.
  code: char,
  /// These names match exactly the names published in the code charts of the Unicode Standard
  name: &'static str,
  /// This is a useful breakdown into various character types
  /// which can be used as a default categorization in implementations.
  general_category: GeneralCategory,
  /// http://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values
  canonical_combining_class: &'static str,
  /// http://www.unicode.org/reports/tr44/#Bidi_Class_Values
  bidi_class: &'static str,
  /// http://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings
  decomposition_type: &'static str,
  /// Companion of `decomposition_type`.
  /// NOTE(review): the string format is not shown in this file -- confirm.
  decomposition_mapping: &'static str,

  /// NOTE(review): presumably the numeric type column of the Unicode data, kept as a raw string -- confirm.
  numeric_type: &'static str,
  /// NOTE(review): presumably the numeric value column of the Unicode data, kept as a raw string -- confirm.
  numeric_value: &'static str,

  /// Raw string flag; the sample table entry stores "N".
  bidi_mirrored: &'static str,
  // old_name: ~str,
  // comment: ~str,
  /// Uppercase mapping of `code` (the entry for 'A' stores 'A').
  upper: char,
  /// Lowercase mapping of `code` (the entry for 'A' stores 'a').
  lower: char,
  /// Titlecase mapping of `code` (the entry for 'A' stores 'A').
  title: char
 }


 // Builds a `CodePoint` struct literal from thirteen `;`-separated expressions.
 // The arguments are positional and follow the field order of `CodePoint`:
 // code; name; general_category; canonical_combining_class; bidi_class;
 // decomposition_type; decomposition_mapping; numeric_type; numeric_value;
 // bidi_mirrored; upper; lower; title.
 macro_rules! cp(
    ($code:expr;
     $name:expr;
     $general_category:expr;
     $canonical_combining_class:expr;
     $bidi_class:expr;
     $decomposition_type:expr;
     $decomposition_mapping:expr;
     $numeric_type:expr;
     $numeric_value:expr;
     $bidi_mirrored:expr;
     $upper: expr;
     $lower: expr;
     $title: expr
     ) => (
      // Each captured expression is assigned to the field of the same name.
      CodePoint {
        code: $code,
        name: $name,
        general_category: $general_category,
        canonical_combining_class: $canonical_combining_class,
        bidi_class: $bidi_class,
        decomposition_type: $decomposition_type,
        decomposition_mapping: $decomposition_mapping,
        numeric_type: $numeric_type,
        numeric_value: $numeric_value,
        bidi_mirrored: $bidi_mirrored,
        upper: $upper,
        lower: $lower,
        title: $title
      }
    );
 )

 impl CodePoint {
  fn is_upper(&self) -> bool {
    self.general_category == Lu
  }

  fn is_lower(&self) -> bool {
    self.general_category == Ll
  }

  // fn is_alphabetic(&self) -> bool;
  // fn is_XID_start(&self) -> bool;
  // fn is_XID_continue(&self) -> bool;
  // fn is_whitespace(&self) -> bool;
  // fn is_alphanumeric(&self) -> bool;
  // fn is_control(&self) -> bool;
  // fn is_digit(&self) -> bool;
  // fn to_lower(&self) -> char;
  // fn to_upper(&self) -> char;
  // fn escape_unicode(&self, f: |char|);
  // fn escape_default(&self, f: |char|);
 }

 /// Looks up the `CodePoint` record for `c` in `unicode_data`.
 ///
 /// Returns a copy of the matching record, or `None` when the table has no
 /// entry whose `code` equals `c`. The lookup is a binary search, so it relies
 /// on `unicode_data` being sorted by ascending `code`.
 fn codepoint(c: char) -> Option<CodePoint> {
  use std::cmp::{Equal, Less, Greater};
  unicode_data.bsearch(|&cp| {
    // The comparator reports how the *table entry* orders relative to the
    // target: `Less` when the entry sorts before `c` (search continues to the
    // right), `Greater` when it sorts after. The previous version compared the
    // target against the entry, which sent the search the wrong way.
    if cp.code == c { Equal }
    else if cp.code < c { Less }
    else { Greater }
  }).map(|index| {
    unicode_data[index]
  })
 }

 // ;"Lu";"0";"L";"";"";"";"";"N";"";"";"";"0061";
 /// Table of code point records that `codepoint` searches.
 ///
 /// `codepoint` uses a binary search on `code`, so entries must be kept in
 /// ascending `code` order. The table currently holds a single entry, for
 /// U+0041 LATIN CAPITAL LETTER A.
 pub static unicode_data : &'static [CodePoint] = &[
  cp!('\u0041';"LATIN CAPITAL LETTER A";Lu;"0";"L";"";"";"";"";"N";'\u0041';'\u0061';'\u0041')
 ];

 ///  This is a useful breakdown into various character types
 ///  which can be used as a default categorization in implementations
 ///
 ///  Reference: http://www.unicode.org/reports/tr44/#General_Category_Values
 ///
 ///  Only the letter categories are listed so far. `Eq` is derived so that
 ///  categories can be compared with `==`, as `CodePoint::is_upper` and
 ///  `CodePoint::is_lower` do.
 #[deriving(Eq)]
 pub enum GeneralCategory {
  Lu, // Uppercase_Letter: an uppercase letter
  Ll, // Lowercase_Letter: a lowercase letter
  Lt, // Titlecase_Letter: a digraphic character, with first part uppercase
  LC, // Cased_Letter: Lu | Ll | Lt
  Lm, // Modifier_Letter: a modifier letter
  Lo, // Other_Letter: other letters, including syllables and ideographs
  L // Letter: Lu | Ll | Lt | Lm | Lo
 }

 // Unit tests for the code point lookup.
 #[cfg(test)]
 mod tests {
  use super::codepoint;

  /// Looks up 'A' and checks its name, its lower/title mappings and the
  /// upper/lower category predicates.
  #[test]
  fn test_codepoint() {
    let record = codepoint('A').unwrap();
    debug!("{:?}", record);

    // Data fields of the record.
    assert_eq!(record.name, "LATIN CAPITAL LETTER A");
    assert_eq!(record.lower, 'a');
    assert_eq!(record.title, 'A');

    // Category predicates.
    assert!(record.is_upper());
    assert!(!record.is_lower());
  }

 }
	#[feature(macro_rules)];

	/// A single grapheme, held as a borrowed slice of `CodePoint` records.
	///
	/// The slice borrows for `'r`; nothing in this file constructs a `Grapheme` yet.
	struct Grapheme<'r> {
	/// The code point records this grapheme refers to.
	codepoint: &'r [CodePoint]
	}

	/// Iterator state returned by `Unicode::graphemes`.
	///
	/// Wraps the `Chars` iterator of the string being traversed.
	struct Graphemes<'r> {
	/// Character iterator over the source string.
	chars: std::str::Chars<'r>
	}

	// Iterator implementation for `Graphemes`.
	// Not implemented yet: `next` unconditionally returns `None`, so iterating
	// yields nothing and the wrapped `chars` iterator is never advanced.
	// NOTE(review): the return type is spelled `Option<Grapheme>` without the `'r`
	// lifetime named in `Iterator<Grapheme<'r>>` -- confirm this is intended.
	impl<'r> Iterator<Grapheme<'r>> for Graphemes<'r> {
	/// Placeholder: always reports the end of iteration.
	fn next(&mut self) -> Option<Grapheme> {
	None
	}
	}

	/// Extension trait giving a value a `graphemes()` iterator.
	trait Unicode {
	/// Returns a `Graphemes` iterator that borrows `self` for `'r`.
	fn graphemes<'r>(&'r self) -> Graphemes<'r>;
	}

	// `Unicode` for string slices: the grapheme iterator is seeded with the
	// slice's own `chars()` iterator.
	// NOTE(review): the method-level `'r` shadows the impl-level `'r` -- confirm
	// that is intended.
	impl<'r> Unicode for &'r str {
		/// Wraps `self.chars()` in a `Graphemes` value.
		fn graphemes<'r>(&'r self) -> Graphemes<'r> {
			let char_iter = self.chars();
			Graphemes { chars: char_iter }
		}
	}

	/// One record of Unicode character data, as stored in `unicode_data`.
	///
	/// Most properties are kept as raw `&'static str` values; only the general
	/// category and the case mappings have dedicated types.
	pub struct CodePoint {
	/// The character this record describes; `codepoint` searches the table on this field.
	code: char,
	/// These names match exactly the names published in the code charts of the Unicode Standard
	name: &'static str,
	/// This is a useful breakdown into various character types
	/// which can be used as a default categorization in implementations.
	general_category: GeneralCategory,
	/// http://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values
	canonical_combining_class: &'static str,
	/// http://www.unicode.org/reports/tr44/#Bidi_Class_Values
	bidi_class: &'static str,
	/// http://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings
	decomposition_type: &'static str,
	/// Companion of `decomposition_type`.
	/// NOTE(review): the string format is not shown in this file -- confirm.
	decomposition_mapping: &'static str,

	/// NOTE(review): presumably the numeric type column of the Unicode data, kept as a raw string -- confirm.
	numeric_type: &'static str,
	/// NOTE(review): presumably the numeric value column of the Unicode data, kept as a raw string -- confirm.
	numeric_value: &'static str,

	/// Raw string flag; the sample table entry stores "N".
	bidi_mirrored: &'static str,
	// old_name: ~str,
	// comment: ~str,
	/// Uppercase mapping of `code` (the entry for 'A' stores 'A').
	upper: char,
	/// Lowercase mapping of `code` (the entry for 'A' stores 'a').
	lower: char,
	/// Titlecase mapping of `code` (the entry for 'A' stores 'A').
	title: char
	}


	// Builds a `CodePoint` struct literal from thirteen `;`-separated expressions.
	// The arguments are positional and follow the field order of `CodePoint`:
	// code; name; general_category; canonical_combining_class; bidi_class;
	// decomposition_type; decomposition_mapping; numeric_type; numeric_value;
	// bidi_mirrored; upper; lower; title.
	macro_rules! cp(
	($code:expr;
	$name:expr;
	$general_category:expr;
	$canonical_combining_class:expr;
	$bidi_class:expr;
	$decomposition_type:expr;
	$decomposition_mapping:expr;
	$numeric_type:expr;
	$numeric_value:expr;
	$bidi_mirrored:expr;
	$upper: expr;
	$lower: expr;
	$title: expr
	) => (
	// Each captured expression is assigned to the field of the same name.
	CodePoint {
	code: $code,
	name: $name,
	general_category: $general_category,
	canonical_combining_class: $canonical_combining_class,
	bidi_class: $bidi_class,
	decomposition_type: $decomposition_type,
	decomposition_mapping: $decomposition_mapping,
	numeric_type: $numeric_type,
	numeric_value: $numeric_value,
	bidi_mirrored: $bidi_mirrored,
	upper: $upper,
	lower: $lower,
	title: $title
	}
	);
	)

	impl CodePoint {
	fn is_upper(&self) -> bool {
	self.general_category == Lu
	}

	fn is_lower(&self) -> bool {
	self.general_category == Ll
	}

	// fn is_alphabetic(&self) -> bool;
	// fn is_XID_start(&self) -> bool;
	// fn is_XID_continue(&self) -> bool;
	// fn is_whitespace(&self) -> bool;
	// fn is_alphanumeric(&self) -> bool;
	// fn is_control(&self) -> bool;
	// fn is_digit(&self) -> bool;
	// fn to_lower(&self) -> char;
	// fn to_upper(&self) -> char;
	// fn escape_unicode(&self, f: \|char\|);
	// fn escape_default(&self, f: \|char\|);
	}

	/// Looks up the `CodePoint` record for `c` in `unicode_data`.
	///
	/// Returns a copy of the matching record, or `None` when the table has no
	/// entry whose `code` equals `c`. The lookup is a binary search, so it relies
	/// on `unicode_data` being sorted by ascending `code`.
	fn codepoint(c: char) -> Option<CodePoint> {
		use std::cmp::{Equal, Less, Greater};
		unicode_data.bsearch(|&cp| {
			// The comparator reports how the *table entry* orders relative to
			// the target: `Less` when the entry sorts before `c` (search
			// continues to the right), `Greater` when it sorts after. The
			// previous version compared the target against the entry, which
			// sent the search the wrong way.
			if cp.code == c { Equal }
			else if cp.code < c { Less }
			else { Greater }
		}).map(|index| {
			unicode_data[index]
		})
	}

	// ;"Lu";"0";"L";"";"";"";"";"N";"";"";"";"0061";
	/// Table of code point records that `codepoint` searches.
	///
	/// `codepoint` uses a binary search on `code`, so entries must be kept in
	/// ascending `code` order. The table currently holds a single entry, for
	/// U+0041 LATIN CAPITAL LETTER A.
	pub static unicode_data : &'static [CodePoint] = &[
	cp!('\u0041';"LATIN CAPITAL LETTER A";Lu;"0";"L";"";"";"";"";"N";'\u0041';'\u0061';'\u0041')
	];

	/// This is a useful breakdown into various character types
	/// which can be used as a default categorization in implementations
	///
	/// Reference: http://www.unicode.org/reports/tr44/#General_Category_Values
	///
	/// Only the letter categories are listed so far. `Eq` is derived so that
	/// categories can be compared with `==`, as `CodePoint::is_upper` and
	/// `CodePoint::is_lower` do.
	#[deriving(Eq)]
	pub enum GeneralCategory {
	Lu, // Uppercase_Letter: an uppercase letter
	Ll, // Lowercase_Letter: a lowercase letter
	Lt, // Titlecase_Letter: a digraphic character, with first part uppercase
	LC, // Cased_Letter: Lu | Ll | Lt
	Lm, // Modifier_Letter: a modifier letter
	Lo, // Other_Letter: other letters, including syllables and ideographs
	L // Letter: Lu | Ll | Lt | Lm | Lo
	}

	// Unit tests for the code point lookup.
	#[cfg(test)]
	mod tests {
		use super::codepoint;

		/// Looks up 'A' and checks its name, its lower/title mappings and the
		/// upper/lower category predicates.
		#[test]
		fn test_codepoint() {
			let record = codepoint('A').unwrap();
			debug!("{:?}", record);

			// Data fields of the record.
			assert_eq!(record.name, "LATIN CAPITAL LETTER A");
			assert_eq!(record.lower, 'a');
			assert_eq!(record.title, 'A');

			// Category predicates.
			assert!(record.is_upper());
			assert!(!record.is_lower());
		}

	}