Skip to content

Instantly share code, notes, and snippets.

@QuietMisdreavus
Last active September 15, 2016 22:08
Show Gist options
  • Save QuietMisdreavus/5b1644443b9a658203a19efb15a2b5e3 to your computer and use it in GitHub Desktop.
Save QuietMisdreavus/5b1644443b9a658203a19efb15a2b5e3 to your computer and use it in GitHub Desktop.
rust's regex crate versus twitter-text
//Rust code to compare the regex crate's version of these unicode categories
//to Ruby/Java/Objective-C's versions as collected by Twitter
//https://github.com/twitter/twitter-text/tree/master/unicode_regex
extern crate regex;
fn main() {
    // Compile both Unicode-category patterns once, before the scan.
    let digit_re = regex::Regex::new(r"\p{Nd}").unwrap();
    let letter_mark_re = regex::Regex::new(r"[\p{L}\p{M}]").unwrap();

    let mut digit_hits: Vec<u32> = Vec::new();
    let mut letter_mark_hits: Vec<u32> = Vec::new();

    // Reusable scratch string that holds the single character under test.
    let mut scratch = String::with_capacity(4);

    // Walk 0..0x110000 while skipping 0xD800..0xE000 (the surrogate gap),
    // so the `from_u32` conversion below never sees a value it would reject.
    let below_surrogates = 0u32..0xD800;
    let above_surrogates = 0xE000u32..0x110000;
    for code_point in below_surrogates.chain(above_surrogates) {
        let ch = std::char::from_u32(code_point).unwrap();
        scratch.clear();
        scratch.push(ch);

        if digit_re.is_match(&scratch) {
            digit_hits.push(code_point);
        }
        if letter_mark_re.is_match(&scratch) {
            letter_mark_hits.push(code_point);
        }
    }

    // Decimal numbers: this run reports 550 code points, against 540 in the objc list.
    // Rendered output: https://shiva.icesoldier.me/rust_twitter_text/decimal_numbers_rust.txt
    print_matches(&digit_hits, digit_re.as_str());

    // Letters and marks: this run reports 111554 code points, against 104623 in the objc list.
    // Rendered output: https://shiva.icesoldier.me/rust_twitter_text/letters_and_marks_rust.txt
    print_matches(&letter_mark_hits, letter_mark_re.as_str());
}
/// Prints a report of matched code points to standard output.
///
/// The first line is a `#`-prefixed summary giving the number of entries in
/// `input` and the `pattern` text; each following line is one entry of
/// `input`, formatted as a decimal integer, in slice order.
fn print_matches(input: &[u32], pattern: &str) {
    let total = input.len();
    println!("# {} code points matched for {}", total, pattern);
    input
        .iter()
        .for_each(|code_point| println!("{}", code_point));
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment