Last active
September 15, 2016 22:08
-
-
Save QuietMisdreavus/5b1644443b9a658203a19efb15a2b5e3 to your computer and use it in GitHub Desktop.
rust's regex crate versus twitter-text
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Rust code to compare the regex crate's version of these Unicode categories
// to the Ruby/Java/Objective-C versions as collected by Twitter:
// https://github.com/twitter/twitter-text/tree/master/unicode_regex
extern crate regex; | |
fn main() { | |
let mut decimal = Vec::new(); | |
let mut marks_letters = Vec::new(); | |
let decimal_pat = regex::Regex::new(r"\p{Nd}").unwrap(); | |
let marks_letters_pat = regex::Regex::new(r"[\p{L}\p{M}]").unwrap(); | |
let mut temp = String::with_capacity(4); | |
for x in (0u32..0xD800).chain(0xE000u32..0x110000) { | |
temp.push(std::char::from_u32(x).unwrap()); | |
if decimal_pat.is_match(&temp) { | |
decimal.push(x); | |
} | |
if marks_letters_pat.is_match(&temp) { | |
marks_letters.push(x); | |
} | |
temp.clear(); | |
} | |
//prints 550 codepoints versus objc's 540 | |
//rendered version available at https://shiva.icesoldier.me/rust_twitter_text/decimal_numbers_rust.txt | |
print_matches(&decimal, decimal_pat.as_str()); | |
//prints 111554 codepoints versus objc's 104623 | |
//rendered version available at https://shiva.icesoldier.me/rust_twitter_text/letters_and_marks_rust.txt | |
print_matches(&marks_letters, marks_letters_pat.as_str()); | |
} | |
/// Prints a summary header followed by every matched code point, one per line.
///
/// The header has the form `# <count> code points matched for <pattern>`;
/// each following line is a code point from `input` in decimal.
///
/// `input` may hold well over a hundred thousand entries, so stdout is locked
/// once and written through a buffer instead of calling `println!` per line
/// (which re-acquires the lock on every call).
///
/// # Panics
///
/// Panics if writing to or flushing stdout fails, as `println!` would.
fn print_matches(input: &[u32], pattern: &str) {
    use std::io::Write;

    // Bind the handle first so the lock guard borrows from a live value;
    // this keeps the code valid on older compilers as well.
    let stdout = std::io::stdout();
    let mut out = std::io::BufWriter::new(stdout.lock());

    writeln!(out, "# {} code points matched for {}", input.len(), pattern)
        .expect("failed to write to stdout");
    for x in input {
        writeln!(out, "{}", x).expect("failed to write to stdout");
    }

    // Flush explicitly: dropping a `BufWriter` swallows write errors.
    out.flush().expect("failed to flush stdout");
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment