Last active
July 13, 2021 18:28
-
-
Save rygorous/1d3c1614f9fab50149502e3339ebde83 to your computer and use it in GitHub Desktop.
BiDi processing excerpt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Rule N0: | |
// "Any number of characters that had original bidirectional character type NSM prior to | |
// the application of W1 that immediately follow a paired bracket which changed to L or R | |
// under N0 should change to match the type of their preceding bracket." | |
// | |
// We don't store the character types as they were before W1 persistently. However, we do | |
// store the initial character types before any of the rules get applied (since they're | |
// required for processing of rules L1/L2). Now the only relevant way that a NSM type | |
// can get modified prior to rule W1 is due to directional overrides. But this can't | |
// affect us: if there's directional overrides active around the bracket pair, e.g. | |
// | |
// RLO ( a ) NSM | |
// | |
// then the character types for the brackets themselves would get overwritten and the | |
// PBA wouldn't trigger. This also goes for any positioning of an explicit override that | |
// affects one bracket but not the other. The only remaining possible position for an | |
// override that might cause an initial NSM to get overridden is something like this: | |
// | |
// ( a ) RLO NSM | |
// | |
// But an override in this position breaks the active level run; thus, the paired brackets | |
// and the following NSM wouldn't be part of the same isolating run sequence, so again | |
// we shouldn't trigger in that case. (To be even more specific: an overflow override | |
// wouldn't break the level run, but it also wouldn't enable directional override | |
// behavior in the first place, so we're still good.) | |
// | |
// Long story short: it's OK to use the flags derived from initial types since whenever | |
// an initial NSM got overwritten prior to W1, this path won't run. | |
static void ihud_bidi_resolve_paired_bracket(ihud_bidi_info *b, ihud_bidi_index start, U8 resolved_type, ihud_bidi_index irs_end) | |
{ | |
ihud_bidi_index i; | |
b[start].type = resolved_type; | |
// Resolve a run of (NSM|BN)+ following the bracket | |
for (i = ihud_bidi_next(b, start); i < irs_end && (b[i].flags & (FLG_NSM | FLG_X9R)); i = ihud_bidi_next(b, i)) | |
b[i].type = resolved_type; | |
} | |
static void ihud_bidi_resolve_weak_types_and_paired_brackets(ihud_bidi_info *b, const U32 *text, ihud_bidi_irs *irs) | |
{ | |
// Our implementation of the Paired Bracket Algorithm needs to keep track of two things | |
// per-character: | |
// - Whether the most recent strong type output by the weak types phase was | |
// classified L or R (for the purposes of the PBA, EN and AN count as R). | |
// - Whether the current run (since the last opening/closing bracket) contains | |
// a strong type matching the embedding direction, a strong type opposite the | |
// embedding direction, or neither (neutral). | |
// We turn this into a single state machine. Since this depends on the current | |
// embedding direction, we make it part of the state too. | |
#define STATE_ID(prev_strong,embed_dir,run_type) ((prev_strong) + (embed_dir)*2 + (run_type)*4) | |
#define GET_PREV_STRONG(state_id) ((state_id)&1) | |
#define GET_EMBED_DIR(state_id) (((state_id)>>1)&1) | |
#define GET_RUN_TYPE(state_id) ((state_id)>>2) | |
static const U8 pba_state_table[2*2*4][8] = { // [cur_state][type] | |
// There's only three possible canonical inputs (L, R, N); EN and AN count as R. | |
// Also, AL and ES/ET are never produced as output of the weak types phase. | |
// We only including types 6 and 7 to make table rows contain a pow2 number | |
// of elements. | |
// Inputs (in order): L, R, (unused), NI, EN, AN, (unused), (unused) | |
#define ROW(next_L,next_R,next_N) { (next_L), (next_R), (next_N), (next_N), (next_R), (next_R), (next_N), (next_N) } | |
#define STATE(cur_strong,embed_dir,cur_run_type) ROW( \ | |
STATE_ID(BIDI_L, embed_dir, (embed_dir==BIDI_L || cur_run_type==BIDI_NI) ? BIDI_L : cur_run_type), \ | |
STATE_ID(BIDI_R, embed_dir, (embed_dir==BIDI_R || cur_run_type==BIDI_NI) ? BIDI_R : cur_run_type), \ | |
STATE_ID(cur_strong, embed_dir, cur_run_type) \ | |
) | |
// prev_strong embed_dir run_type | |
STATE(BIDI_L, BIDI_L, BIDI_L), | |
STATE(BIDI_R, BIDI_L, BIDI_L), | |
STATE(BIDI_L, BIDI_R, BIDI_L), | |
STATE(BIDI_R, BIDI_R, BIDI_L), | |
STATE(BIDI_L, BIDI_L, BIDI_R), | |
STATE(BIDI_R, BIDI_L, BIDI_R), | |
STATE(BIDI_L, BIDI_R, BIDI_R), | |
STATE(BIDI_R, BIDI_R, BIDI_R), | |
STATE(BIDI_L, BIDI_L, BIDI_NI), // (unused) | |
STATE(BIDI_R, BIDI_L, BIDI_NI), // (unused) | |
STATE(BIDI_L, BIDI_R, BIDI_NI), // (unused) | |
STATE(BIDI_R, BIDI_R, BIDI_NI), // (unused) | |
STATE(BIDI_L, BIDI_L, BIDI_NI), | |
STATE(BIDI_R, BIDI_L, BIDI_NI), | |
STATE(BIDI_L, BIDI_R, BIDI_NI), | |
STATE(BIDI_R, BIDI_R, BIDI_NI), | |
#undef STATE | |
#undef ROW | |
}; | |
ihud_bidi_pb pb_stack[BIDI_PBA_MAX_DEPTH + 1]; | |
ihud_bidi_index pb_stack_ind = 0; | |
ihud_bidi_index deferred_resolve_start = MAX_INDEX; | |
U8 deferred_resolve_type = 0; // just so it's initialized | |
ihud_bidi_index i, mark_i; | |
U8 st, o; | |
U8 embed_strong_type, pba_state; | |
ihud_bidi_index weak_tos; | |
RR_ASSERT(irs->sos == BIDI_L || irs->sos == BIDI_R); | |
pba_state = STATE_ID(irs->sos, BIDI_L + (irs->level & 1), BIDI_NI); | |
embed_strong_type = pba_state; | |
st = ihud_bidi_weak_state_machine[0].next[irs->sos]; | |
mark_i = irs->first_ind; // just make sure it's initialized | |
weak_tos = MAX_INDEX; // start with clear weak bracket stack | |
for (i = irs->first_ind; i < irs->end_ind; i = ihud_bidi_next(b, i)) { | |
U8 orig_t = b[i].type; // initial type (used by Paired Bracket Algorithm) | |
U8 t = orig_t; | |
// Characters removed by rule X9 should not affect the weak state machine or the | |
// paired bracket algorithm, and are treated as neutrals by later stages. | |
if (t < BIDI_BN) { | |
t = (t < BIDI_LRI) ? t : BIDI_NI; // Pre-canonicalization: fold all Neutral/Isolating chars to BIDI_NI | |
// The entire "weak types" phase can be performed by a finite-state transducer | |
// (more precisely, a Moore machine) described by the table above (which is | |
// generated from an algorithmic description of the various weak type rules | |
// by tblgen.cpp). | |
// | |
// The transducer is basically a finite-state automaton that first performs | |
// a state transition based on the current input symbol, then outputs a result | |
// type based on the current state (after the transition). | |
// | |
// For most of the weak types, it should be straightforward to see how the | |
// rules map into this framework. The tricky part are rules W4 and W5, which | |
// (as described in the spec) require lookahead to decide which rule matches. | |
// | |
// Luckily, all cases where the lookahead matters result in a run of identical | |
// types. So we do something slightly different: in all look-ahead cases, the | |
// primary state machine generates an output symbol assuming that the rule | |
// will not match, but remembers the current position (this is "mark_i"). | |
// If the following characters then mean that the look-ahead rule for W4/W5 | |
// would have matched after all, we update all the output types between | |
// mark_i and the current cursor to their new value (which is the output of | |
// that state). It turns out that this formalism is sufficiently expressive | |
// for the weak types phase. | |
// Perform transition and handle output | |
st = ihud_bidi_weak_state_machine[st].next[t]; | |
o = ihud_bidi_weak_state_machine[st].output; | |
t = o & 0x3f; | |
pba_state = pba_state_table[pba_state][t]; | |
if (o & 0xc0) { // is there a mark action queued? | |
if ((o & 0xc0) == 0x40) // set mark | |
mark_i = i; | |
else { // resolve run from mark to current cursor | |
for (; mark_i < i; mark_i = ihud_bidi_next(b, mark_i)) | |
b[mark_i].type = t; | |
} | |
} | |
} else | |
t = BIDI_XX; // tag deleted chars | |
b[i].type = t; | |
RR_ASSERT(t <= BIDI_AN); | |
// Paired bracket handling (rule N0) | |
// | |
// To understand how this works, let's first consider a simplified version. The actual paired | |
// bracket algorithm resolves brackets outside-in (ordered by their opening bracket position), | |
// but let's pretend for a minute that it resolves the brackets inside-out (ordered by their | |
// closing bracket position). In that setting, it is very natural to process characters in | |
// logical (memory) order, keep a stack of open brackets, and decide whether bracket pairs | |
// are L, R, or truly neutral at the time the closing bracket is encountered (i.e. when it's | |
// popped off the stack). | |
// | |
// Now between any pair of brackets, we want to know which (if any) strong types occur anywhere | |
// between them. The possible answers are, ordered by increasing priority: | |
// 1. There is no strong type anywhere between the bracket pair. | |
// 2. There are strong types opposite the current embedding direction ("o" in the spec), but none | |
// matching the embedding direction ("e" in the spec). | |
// 3. There are strong types matching the embedding direction. | |
// | |
// This is what the code calls the "run type". The run type can be one of L, R, or NI (standing | |
// for neutral). Whether L "beats" R or vice versa depends on the embedding direction. | |
// | |
// To know the final run type for all characters inside a bracket pair, we track two separate | |
// things: | |
// 1. The run type of all characters we've processed since the last bracket character. Any | |
// bracket character, open or closed brackets both. This is the run type inside "pba_state" | |
// and we reset it to NI (neutral) after every bracket character we encounter. | |
// 2. The run type for *all* characters we've seen since the opening bracket at the current top | |
// of the bracket stack. This is in "embed_strong_type". This includes all runs we've | |
// processed since then, including runs in nested brackets that we've since popped off the | |
// open bracket stack. | |
// | |
// pba_state is updated on a run-by-run basis; embed_strong_type accumulates information both from | |
// runs at the current level, and "inherits" information from deeper-nested levels. This takes care | |
// of rule N0.a: "Inspect the bidirectional types of the characters enclosed within the bracket pair". | |
// | |
// What happens next depends on the resulting embed_strong_type. If it's neutral, we're in case | |
// N0.d: the brackets stay neutral, nothing to do. If it's a strong type matching the embedding | |
// direction, we're in case N0.b: resolve the brackets to match the embedding direction. Also | |
// straightforward. So far, we're making these decisions purely based on what character types were | |
// within the bracket pair after the weak types phase, and nothing depends on the order we resolve | |
// brackets in. So even though we're processing inside-out instead of outside-in, for the cases | |
// discussed so far, we're doing the right thing. | |
// | |
// Rule N0.c is what makes things tricky; in particular, the "test for an established context with | |
// a preceding strong type" bit. Now, the "preceding strong type" can, for the most part, be | |
// answered by keeping track of the last strong type we've seen - which we also do in "pba_state" | |
// (the "prev strong" part). But there's a catch. Suppose we're looking at a case like this, inside | |
// a run with R embedding direction, and we're at the marked position (the *): | |
// | |
// L ( ( L )* R ) | |
// | |
// We have a bracket with contents opposite the embedding direction, and the preceding strong type | |
// (outside both bracket pairs) matches - check and check. So we want to resolve the inner bracket | |
// pair to R, and that's what we do. | |
// | |
// But it turns out that, if we were to (correctly!) resolve the brackets from the outside in, we'd | |
// notice that the *outer* bracket pair contains a R-type character, and will thus cause both outer | |
// brackets to get resolved to R - which occurs after the L that we used to establish preceding | |
// context for the inner bracket. So once we close the outer (containing) bracket, we realize our | |
// mistake: the inner bracket pair should have been R after all. | |
// | |
// Luckily, resolving a bracket incorrect to the opposite of the embedding direction when it should | |
// have been equal to the embedding direction is the only such mistake we can make. If a bracket | |
// pair was deemed to have neutral contents (N0.d), it will always stay neutral. If we resolved it | |
// to the embedding direction, it will never get updated again either. | |
// | |
// So we can salvage this: if we're in a case where the resolved direction of a containing bracket | |
// (that we haven't resolved yet) would cause our resolved-to-opposite-the-embedding-direction | |
// bracket to flip sense (because such a bracket would establish a different preceding strong type), | |
// we need to be careful. This is the case when | |
// a) we're about to resolve a bracket to opposite the embedding direction, | |
// b) that bracket is nested inside at least one currently unresolved bracket, | |
// c) there were no strong-type characters between the containing and the nested openings brackets | |
// (which we can infer from pba_state at the time), meaning that whatever strong type the outer | |
// bracket gets resolved to will "cascade through" and establish a preceding strong type for the | |
// inner bracket. | |
// | |
// We refer to such bracket pairs as "weak brackets", and we keep a stack of them (using the existing | |
// link[] fields also used to connect pieces of IRSes; we only link ON-type characters here whereas | |
// the IRS links are on isolate initiators and BNs, so there is no overlap). | |
// | |
// Whenever an outer containing bracket gets resolved to a strong type matching the embedding dir, | |
// we pop all contained weak brackets and re-resolve them to match the embedding dir as well. If | |
// we pop all the way down to have no remaining open brackets, we can clear the stack: with no more | |
// prior positions (namely, positions of yet-unmatched open brackets) that might change their type | |
// and establish a different preceding strong type than before, all decisions made when the weak | |
// brackets were initially pushed on their stack are now final. | |
if (orig_t >= BIDI_ONpba_first && orig_t <= BIDI_ONpba_last && pb_stack_ind <= BIDI_PBA_MAX_DEPTH) { | |
U32 cp = text[i]; // current code point | |
// Normalize: as of Unicode 9.0.0, there is exactly one canonically equivalent bracket pair. | |
// (This special case is hardcoded in both ICU and the C BiDi ref impl brrule.c, so I don't feel | |
// guilty about handling it as a hardcoded special case here.) | |
if (cp >= 0x3008 && cp <= 0x3009) | |
cp = (cp - 0x3008) + 0x2329; | |
// if we have a deferred resolve, now is a good time to do it | |
if (deferred_resolve_start != MAX_INDEX) { | |
ihud_bidi_resolve_paired_bracket(b, deferred_resolve_start, deferred_resolve_type, irs->end_ind); | |
deferred_resolve_start = MAX_INDEX; | |
} | |
if (orig_t == BIDI_ONcpb) { | |
// Close bracket. Try to find matching one on stack. | |
ihud_bidi_index j = pb_stack_ind; | |
while (j-- > 0) { | |
ihud_bidi_index k; | |
ihud_bidi_pb *pbs = &pb_stack[j]; | |
if (cp != pbs->close_cp) | |
continue; | |
// Fold run type since last bracket into embed_strong_type | |
embed_strong_type = pba_state_table[embed_strong_type][GET_RUN_TYPE(pba_state)]; | |
// Pop back down to this level. To do this, we need to fold the previosly saved | |
// embedded strong type resolutions for the inbetween paired brackets we're | |
// closing into the current embed_strong_this_run (since they all count as within | |
// the bracket pair we've identified.) | |
for (k = pb_stack_ind; k > j + 1; --k) | |
embed_strong_type = pba_state_table[embed_strong_type][pb_stack[k - 1].str]; | |
// Rule N0.b/N0.c: if we saw a strong type, resolve bracket type. If the strong type matches | |
// embedding dir, use it. If it doesn't, check preceding context. If both preceding | |
// context and text inside bracket agree, may use strong type disagreeing with | |
// embedding direction. | |
if (GET_RUN_TYPE(embed_strong_type) != BIDI_NI) { | |
// The embed strong type right now can either match the embedding direction, or be | |
// opposite it. If it matches, this computation simply returns the embedding | |
// direction (since it "wins" in case of ties). If it is opposite, the result | |
// depends on the saved pbs->pba_state, which contains the preceding strong type | |
// at the time the of the matched opening bracket. | |
// | |
// If the preceding strong type at the time of the matched opening bracket was | |
// equal to the embedding direction, that decides it: the resolved type is always | |
// going to be the embedding direction. If it was opposite, resolve to the | |
// opposite of the embedding direction, but see notes on weak brackets below. | |
U8 resolved_type = GET_RUN_TYPE(pba_state_table[embed_strong_type][GET_PREV_STRONG(pbs->pba_state)]); | |
// Rule N0: handling of NSMs following paired brackets | |
ihud_bidi_resolve_paired_bracket(b, pbs->open_ind, resolved_type, irs->end_ind); | |
// Defer resolving of close bracket: we haven't run the weak state machine for | |
// the following types yet. N0 runs after the weak types phase, so we wait to | |
// finish weak types processing for the characters that follow the close bracket | |
// before we perform the resolve (which might override the result types for | |
// characters that were NSMs). | |
deferred_resolve_start = i; | |
deferred_resolve_type = resolved_type; | |
TRACE("(%u,%u) paired bracket! resolved=%d embedding_dir=%d\n", pbs->open_ind, i, resolved_type, irs->level & 1); | |
// We resolved to a strong type; this establishes context for later brackets. | |
pba_state = STATE_ID(resolved_type, GET_EMBED_DIR(pba_state), BIDI_NI); | |
// Is the resolved type equal or opposite to the current embedding direction? | |
if (resolved_type == GET_EMBED_DIR(pba_state)) { // equal | |
// A containing resolved bracket matching the current embedding direction establishes | |
// context for all contained weak brackets; time to RE-RESOLVE THEM ALL! | |
// Once we've resolved them to the embedding direction, these brackets can never change | |
// type again, so we can discard their weak bracket records. So just keep popping down | |
// the stack. | |
while (weak_tos != pbs->open_weak_tos) { | |
TRACE(" re-resolve weak bracket at %u\n", weak_tos); | |
ihud_bidi_resolve_paired_bracket(b, weak_tos, resolved_type, irs->end_ind); | |
weak_tos = b[weak_tos].link; | |
} | |
} else { // opposite | |
// If the resolved type is opposite the current embedding direction and there are other | |
// brackets deeper on the stack, this is a "weak" decision: we can later find out that | |
// one of the containing brackets is strongly in the embedding direction, which in turn | |
// will retroactively establish context (as per Rule N0c) for the current bracket and make | |
// it flip direction. So if this is a weak decision, record the position of the opening and | |
// closing brackets so we can fix it up later in case we have to. | |
if (j > 0) { // other brackets are deeper on the stack; might have to revise. | |
// If there was no strong type between the containing opening bracket and this one, | |
// we have a weak bracket. | |
if (GET_RUN_TYPE(pbs->pba_state) == BIDI_NI) { | |
TRACE(" push weak bracket pair (%u,%u)\n", pbs->open_ind, i); | |
// Push onto weak bracket stack. | |
// NOTE: safe to use "link" for this since weak brackets only occur on ONs, whereas | |
// skips only occur on isolate initiators or chars removed by X9. | |
b[i].link = pbs->open_ind; | |
b[pbs->open_ind].link = weak_tos; | |
weak_tos = i; | |
} | |
} else { | |
// If the bracket we're closing is the last one on the stack, we can discard all pending | |
// weak bracket records; they cannot change anymore (since there are no more un-closed | |
// open brackets left that could establish prior context post-resolution) | |
weak_tos = MAX_INDEX; | |
TRACE(" discard weak brackets\n"); | |
} | |
} | |
} | |
// Finally, merge strong type status from before this bracket pair into | |
// the current state, now that we actually pop. | |
embed_strong_type = pba_state_table[embed_strong_type][pb_stack[j].str]; | |
TRACE(" post-pop estc=%d\n", GET_RUN_TYPE(embed_strong_type)); | |
pb_stack_ind = j; // pop! | |
break; | |
} | |
} else { | |
// open bracket | |
static const signed char close_offs[4] = { 1,2,3,-1 }; // matches ON_opb* | |
ihud_bidi_pb *pbs = &pb_stack[pb_stack_ind++]; | |
// Fold run type since last bracket into embed_strong_type | |
embed_strong_type = pba_state_table[embed_strong_type][GET_RUN_TYPE(pba_state)]; | |
TRACE("open bracket pos=%u estc=%d\n", i, GET_RUN_TYPE(embed_strong_type)); | |
RR_ASSERT(orig_t >= BIDI_ONopb1 && orig_t <= BIDI_ONopbn1); | |
pbs->open_ind = i; | |
pbs->open_weak_tos = weak_tos; | |
pbs->close_cp = cp + close_offs[orig_t - BIDI_ONopb1]; | |
pbs->pba_state = pba_state; | |
pbs->str = GET_RUN_TYPE(embed_strong_type); // saved type from containing bracket | |
pba_state = STATE_ID(GET_PREV_STRONG(pba_state), GET_EMBED_DIR(pba_state), BIDI_NI); // reset run type to neutral | |
embed_strong_type = pba_state; | |
} | |
} | |
} | |
// NOTE: Rules W4/W5 might be in a lookahead state waiting for a following EN/AN, | |
// which gets resolved once we feed in eos. | |
// | |
// However, the current state machine implementation of these rules assumes there is | |
// no following EN/AN (and outputs an ON type); in case that turns out to be wrong, we | |
// mark the position where we need to start fixing up and override the whole run. | |
// But eos is always L or R, neither of which matches EN/AN, so at this point there's | |
// simply no chance that the tentative decision for ON can get revised. | |
// If we have a pending deferred bracket resolve, handle it | |
if (deferred_resolve_start != MAX_INDEX) | |
ihud_bidi_resolve_paired_bracket(b, deferred_resolve_start, deferred_resolve_type, irs->end_ind); | |
#undef STATE_ID | |
#undef GET_PREV_STRONG | |
#undef GET_EMBED_DIR | |
#undef GET_RUN_TYPE | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment