Last active
February 7, 2024 01:26
-
-
Save zbraniecki/9efa7e4d03d1bfb531e8b0af2010f6c8 to your computer and use it in GitHub Desktop.
Hermes ICU4X
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use widestring::U16CString; | |
use icu_locid::LanguageIdentifier; | |
fn canonicalize_utf16_locale(input: &[u16]) -> Vec<u16> { | |
let utf8_result = String::from_utf16(&input).unwrap(); | |
let loc: LanguageIdentifier = utf8_result.parse().unwrap(); | |
let utf16_data: Vec<u16> = loc.to_string().encode_utf16().collect(); | |
utf16_data | |
} | |
#[no_mangle] | |
pub extern "C" fn canonicalize_locale(input: *const u16, len: usize) -> *mut u16 { | |
let input_slice = unsafe { std::slice::from_raw_parts(input, len) }; | |
let output = canonicalize_utf16_locale(input_slice); | |
let c_string = unsafe { U16CString::from_vec_unchecked(output).into_raw() }; | |
c_string | |
} | |
#[no_mangle] | |
pub extern "C" fn free_canonicalized_locale(ptr: *mut u16) { | |
if !ptr.is_null() { | |
let _ = unsafe { U16CString::from_raw(ptr) }; // This will drop the U16CString, freeing the memory. | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// https://github.com/facebook/hermes/blob/main/lib/Platform/Intl/PlatformIntlApple.mm#L319-L353 | |
vm::CallResult<std::vector<std::u16string>> canonicalizeLocaleList( | |
vm::Runtime &runtime, | |
const std::vector<std::u16string> &locales) { | |
// 1. If locales is undefined, then | |
// a. Return a new empty List | |
// Not needed, this validation occurs closer to VM in 'normalizeLocales'. | |
// 2. Let seen be a new empty List. | |
std::vector<std::u16string> seen; | |
// 3. If Type(locales) is String or Type(locales) is Object and locales has an | |
// [[InitializedLocale]] internal slot, then | |
// 4. Else | |
// We don't yet support Locale object - | |
// https://402.ecma-international.org/8.0/#locale-objects As of now, 'locales' | |
// can only be a string list/array. Validation occurs in normalizeLocaleList, | |
// so this function just takes a vector of strings. | |
// 5. Let len be ? ToLength(? Get(O, "length")). | |
// 6. Let k be 0. | |
// 7. Repeat, while k < len | |
for (const auto &locale : locales) { | |
// 7.c.vi. Let canonicalizedTag be CanonicalizeUnicodeLocaleId(tag). | |
auto parsedOpt = ParsedLocaleIdentifier::parse(locale); | |
if (!parsedOpt) | |
return runtime.raiseRangeError( | |
vm::TwineChar16("Invalid language tag: ") + | |
vm::TwineChar16(locale.c_str())); | |
auto canonicalizedTag = parsedOpt->canonicalize(); | |
// 7.c.vii. If canonicalizedTag is not an element of seen, append | |
// canonicalizedTag as the last element of seen. | |
if (std::find(seen.begin(), seen.end(), canonicalizedTag) == seen.end()) { | |
seen.push_back(std::move(canonicalizedTag)); | |
} | |
} | |
return seen; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Declarations of the Rust functions exported with #[no_mangle].
extern "C" {
// Rust side: fn canonicalize_locale(input: *const u16, len: usize) -> *mut u16.
// `len` is a pointer-sized unsigned integer there, so it must be std::size_t
// here (declaring it as `int` mismatches the ABI on 64-bit targets). The
// returned buffer is mutable and owned by the caller until it is passed to
// free_canonicalized_locale.
char16_t* canonicalize_locale(const char16_t* input, std::size_t len);
// Releases a buffer returned by canonicalize_locale; null is ignored.
void free_canonicalized_locale(char16_t* ptr);
}
// Canonicalizes every tag in `locales` through the Rust `canonicalize_locale`
// function and returns the distinct results in first-seen order.
// Raises a RangeError if the Rust side returns no result for a tag.
vm::CallResult<std::vector<std::u16string>> getCanonicalLocales(
    vm::Runtime &runtime,
    const std::vector<std::u16string> &locales) {
  std::vector<std::u16string> seen;
  for (const auto &loc : locales) {
    auto *rawTag = canonicalize_locale(loc.c_str(), loc.length());
    // Constructing a std::u16string from a null pointer is undefined
    // behavior, so report the tag as invalid instead.
    if (!rawTag)
      return runtime.raiseRangeError(
          vm::TwineChar16("Invalid language tag: ") +
          vm::TwineChar16(loc.c_str()));
    // Copy the tag, then release the Rust-owned buffer right away so it is
    // not kept alive across the container operations below.
    std::u16string canonicalizedTag(rawTag);
    free_canonicalized_locale(const_cast<char16_t *>(rawTag));
    // Append only tags that have not been produced before.
    if (std::find(seen.begin(), seen.end(), canonicalizedTag) == seen.end()) {
      seen.push_back(std::move(canonicalizedTag));
    }
  }
  return seen;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Apple: | |
~/projects/hermes〉time ./build_release/bin/hermes ./test.js | |
1.85 real 1.82 user 0.00 sys | |
ICU4X: | |
~/projects/hermes〉time ./build_release/bin/hermes ./test.js | |
2.33 real 2.29 user 0.00 sys |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Benchmark driver: canonicalize the same ten locale tags one million times.
// The array literal stays inside the loop so each iteration does the same
// work as before (fresh array + one Intl.getCanonicalLocales call).
const ITERATIONS = 1000000;
for (let run = 0; run < ITERATIONS; run++) {
  let canonical = Intl.getCanonicalLocales([
    "pl-pl",
    "de-de",
    "it-IT",
    "sr-Cyrl",
    "ja",
    "en-Latn-us",
    "de-at",
    "es-419",
    "und",
    "zh-CN",
  ]);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Apple: 1.82
Split of ICU4X (cumulative time after each step is added):
- no-op - 0.63
- Just iterating over `locales` and writing to `seen` if not seen - 0.80
- Allocating `Vec<u16>` in Rust from the passed `&[u16]` - 0.94
- Executing `U16CString::from_raw_unchecked().into_raw(Vec<u16>) -> *mut u16` - 1.43
- Assembling `std::u16string` from `*mut u16` - 1.47
- Adding `free_locale` - 1.58
- Adding `String::from_utf16(&[u16])` in `canonicalize_utf16_locale` - 1.85
- Adding `LanguageIdentifier` parsing from `String` in `canonicalize_utf16_locale` - 2.07
- Adding `U16CString::from_vec_unchecked(Vec<u16>).into_raw() -> *mut u16` in `canonicalize_utf16_locale` - 2.30