rygorous · July 13, 2021 18:28
diff --git a/gistfile1.c b/gistfile1.c
 // Rule N0:
 // "Any number of characters that had original bidirectional character type NSM prior to
 // the application of W1 that immediately follow a paired bracket which changed to L or R
 // under N0 should change to match the type of their preceding bracket."
 //
 // We don't store the character types as they were before W1 persistently. However, we do
 // store the initial character types before any of the rules get applied (since they're
 // required for processing of rules L1/L2). Now the only relevant way that a NSM type
 // can get modified prior to rule W1 is due to directional overrides. But this can't
 // affect us: if there's directional overrides active around the bracket pair, e.g.
 //
 //   RLO ( a ) NSM
 //
 // then the character types for the brackets themselves would get overwritten and the
 // PBA wouldn't trigger. This also goes for any positioning of an explicit override that
 // affects one bracket but not the other. The only remaining possible position for an
 // override that might cause an initial NSM to get overridden is something like this:
 //
 //   ( a ) RLO NSM
 //
 // But an override in this position breaks the active level run; thus, the paired brackets
 // and the following NSM wouldn't be part of the same isolating run sequence, so again
 // we shouldn't trigger in that case. (To be even more specific: an overflow override
 // wouldn't break the level run, but it also wouldn't enable directional override
 // behavior in the first place, so we're still good.)
 //
 // Long story short: it's OK to use the flags derived from initial types since whenever
 // an initial NSM got overwritten prior to W1, this path won't run.
 static void ihud_bidi_resolve_paired_bracket(ihud_bidi_info *b, ihud_bidi_index start, U8 resolved_type, ihud_bidi_index irs_end)
 {
   ihud_bidi_index i;
   b[start].type = resolved_type;

   // Resolve a run of (NSM|BN)+ following the bracket
   for (i = ihud_bidi_next(b, start); i < irs_end && (b[i].flags & (FLG_NSM | FLG_X9R)); i = ihud_bidi_next(b, i))
      b[i].type = resolved_type;
 }

 static void ihud_bidi_resolve_weak_types_and_paired_brackets(ihud_bidi_info *b, const U32 *text, ihud_bidi_irs *irs)
 {
   // Our implementation of the Paired Bracket Algorithm needs to keep track of two things
   // per-character:
   // - Whether the most recent strong type output by the weak types phase was
   //   classified L or R (for the purposes of the PBA, EN and AN count as R).
   // - Whether the current run (since the last opening/closing bracket) contains
   //   a strong type matching the embedding direction, a strong type opposite the
   //   embedding direction, or neither (neutral).
   // We turn this into a single state machine. Since this depends on the current
   // embedding direction, we make it part of the state too.
 #define STATE_ID(prev_strong,embed_dir,run_type) ((prev_strong) + (embed_dir)*2 + (run_type)*4)
 #define GET_PREV_STRONG(state_id) ((state_id)&1)
 #define GET_EMBED_DIR(state_id) (((state_id)>>1)&1)
 #define GET_RUN_TYPE(state_id) ((state_id)>>2)
   static const U8 pba_state_table[2*2*4][8] = { // [cur_state][type]
      // There's only three possible canonical inputs (L, R, N); EN and AN count as R.
      // Also, AL and ES/ET are never produced as output of the weak types phase.
      // We only including types 6 and 7 to make table rows contain a pow2 number
      // of elements.
      // Inputs (in order): L, R, (unused), NI, EN, AN, (unused), (unused)
 #define ROW(next_L,next_R,next_N) { (next_L), (next_R), (next_N), (next_N), (next_R), (next_R), (next_N), (next_N) }
 #define STATE(cur_strong,embed_dir,cur_run_type) ROW( \
           STATE_ID(BIDI_L,     embed_dir, (embed_dir==BIDI_L || cur_run_type==BIDI_NI) ? BIDI_L : cur_run_type), \
           STATE_ID(BIDI_R,     embed_dir, (embed_dir==BIDI_R || cur_run_type==BIDI_NI) ? BIDI_R : cur_run_type), \
           STATE_ID(cur_strong, embed_dir, cur_run_type) \
        )
      //    prev_strong  embed_dir  run_type
      STATE(BIDI_L,      BIDI_L,    BIDI_L),
      STATE(BIDI_R,      BIDI_L,    BIDI_L),
      STATE(BIDI_L,      BIDI_R,    BIDI_L),
      STATE(BIDI_R,      BIDI_R,    BIDI_L),
      STATE(BIDI_L,      BIDI_L,    BIDI_R),
      STATE(BIDI_R,      BIDI_L,    BIDI_R),
      STATE(BIDI_L,      BIDI_R,    BIDI_R),
      STATE(BIDI_R,      BIDI_R,    BIDI_R),
      STATE(BIDI_L,      BIDI_L,    BIDI_NI), // (unused)
      STATE(BIDI_R,      BIDI_L,    BIDI_NI), // (unused)
      STATE(BIDI_L,      BIDI_R,    BIDI_NI), // (unused)
      STATE(BIDI_R,      BIDI_R,    BIDI_NI), // (unused)
      STATE(BIDI_L,      BIDI_L,    BIDI_NI),
      STATE(BIDI_R,      BIDI_L,    BIDI_NI),
      STATE(BIDI_L,      BIDI_R,    BIDI_NI),
      STATE(BIDI_R,      BIDI_R,    BIDI_NI),
 #undef STATE
 #undef ROW
   };

   ihud_bidi_pb pb_stack[BIDI_PBA_MAX_DEPTH + 1];
   ihud_bidi_index pb_stack_ind = 0;
   ihud_bidi_index deferred_resolve_start = MAX_INDEX;
   U8 deferred_resolve_type = 0; // just so it's initialized
   ihud_bidi_index i, mark_i;
   U8 st, o;
   U8 embed_strong_type, pba_state;
   ihud_bidi_index weak_tos;

   RR_ASSERT(irs->sos == BIDI_L || irs->sos == BIDI_R);
   pba_state = STATE_ID(irs->sos, BIDI_L + (irs->level & 1), BIDI_NI);
   embed_strong_type = pba_state;
   st = ihud_bidi_weak_state_machine[0].next[irs->sos];
   mark_i = irs->first_ind; // just make sure it's initialized
   weak_tos = MAX_INDEX; // start with clear weak bracket stack

   for (i = irs->first_ind; i < irs->end_ind; i = ihud_bidi_next(b, i)) {
      U8 orig_t = b[i].type; // initial type (used by Paired Bracket Algorithm)
      U8 t = orig_t;

      // Characters removed by rule X9 should not affect the weak state machine or the
      // paired bracket algorithm, and are treated as neutrals by later stages.
      if (t < BIDI_BN) {
         t = (t < BIDI_LRI) ? t : BIDI_NI; // Pre-canonicalization: fold all Neutral/Isolating chars to BIDI_NI

         // The entire "weak types" phase can be performed by a finite-state transducer
         // (more precisely, a Moore machine) described by the table above (which is
         // generated from an algorithmic description of the various weak type rules
         // by tblgen.cpp).
         //
         // The transducer is basically a finite-state automaton that first performs
         // a state transition based on the current input symbol, then outputs a result
         // type based on the current state (after the transition).
         //
         // For most of the weak types, it should be straightforward to see how the
         // rules map into this framework. The tricky part are rules W4 and W5, which
         // (as described in the spec) require lookahead to decide which rule matches.
         //
         // Luckily, all cases where the lookahead matters result in a run of identical
         // types. So we do something slightly different: in all look-ahead cases, the
         // primary state machine generates an output symbol assuming that the rule
         // will not match, but remembers the current position (this is "mark_i").
         // If the following characters then mean that the look-ahead rule for W4/W5
         // would have matched after all, we update all the output types between
         // mark_i and the current cursor to their new value (which is the output of
         // that state). It turns out that this formalism is sufficiently expressive
         // for the weak types phase.

         // Perform transition and handle output
         st = ihud_bidi_weak_state_machine[st].next[t];
         o = ihud_bidi_weak_state_machine[st].output;

         t = o & 0x3f;
         pba_state = pba_state_table[pba_state][t];

         if (o & 0xc0) { // is there a mark action queued?
            if ((o & 0xc0) == 0x40) // set mark
               mark_i = i;
            else { // resolve run from mark to current cursor
               for (; mark_i < i; mark_i = ihud_bidi_next(b, mark_i))
                  b[mark_i].type = t;
            }
         }
      } else
         t = BIDI_XX; // tag deleted chars

      b[i].type = t;
      RR_ASSERT(t <= BIDI_AN);

      // Paired bracket handling (rule N0)
      //
      // To understand how this works, let's first consider a simplified version. The actual paired
      // bracket algorithm resolves brackets outside-in (ordered by their opening bracket position),
      // but let's pretend for a minute that it resolves the brackets inside-out (ordered by their
      // closing bracket position). In that setting, it is very natural to process characters in
      // logical (memory) order, keep a stack of open brackets, and decide whether bracket pairs
      // are L, R, or truly neutral at the time the closing bracket is encountered (i.e. when it's
      // popped off the stack).
      //
      // Now between any pair of brackets, we want to know which (if any) strong types occur anywhere
      // between them. The possible answers are, ordered by increasing priority:
      // 1. There is no strong type anywhere between the bracket pair.
      // 2. There are strong types opposite the current embedding direction ("o" in the spec), but none
      //    matching the embedding direction ("e" in the spec).
      // 3. There are strong types matching the embedding direction.
      //
      // This is what the code calls the "run type". The run type can be one of L, R, or NI (standing
      // for neutral). Whether L "beats" R or vice versa depends on the embedding direction.
      //
      // To know the final run type for all characters inside a bracket pair, we track two separate
      // things:
      // 1. The run type of all characters we've processed since the last bracket character. Any
      //    bracket character, open or closed brackets both. This is the run type inside "pba_state"
      //    and we reset it to NI (neutral) after every bracket character we encounter.
      // 2. The run type for *all* characters we've seen since the opening bracket at the current top
      //    of the bracket stack. This is in "embed_strong_type". This includes all runs we've
      //    processed since then, including runs in nested brackets that we've since popped off the
      //    open bracket stack.
      //
      // pba_state is updated on a run-by-run basis; embed_strong_type accumulates information both from
      // runs at the current level, and "inherits" information from deeper-nested levels. This takes care
      // of rule N0.a: "Inspect the bidirectional types of the characters enclosed within the bracket pair".
      //
      // What happens next depends on the resulting embed_strong_type. If it's neutral, we're in case
      // N0.d: the brackets stay neutral, nothing to do. If it's a strong type matching the embedding
      // direction, we're in case N0.b: resolve the brackets to match the embedding direction. Also
      // straightforward. So far, we're making these decisions purely based on what character types were
      // within the bracket pair after the weak types phase, and nothing depends on the order we resolve
      // brackets in. So even though we're processing inside-out instead of outside-in, for the cases
      // discussed so far, we're doing the right thing.
      //
      // Rule N0.c is what makes things tricky; in particular, the "test for an established context with
      // a preceding strong type" bit. Now, the "preceding strong type" can, for the most part, be
      // answered by keeping track of the last strong type we've seen - which we also do in "pba_state"
      // (the "prev strong" part). But there's a catch. Suppose we're looking at a case like this, inside
      // a run with R embedding direction, and we're at the marked position (the *):
      //
      //   L ( ( L )* R )
      //
      // We have a bracket with contents opposite the embedding direction, and the preceding strong type
      // (outside both bracket pairs) matches - check and check. So we want to resolve the inner bracket
      // pair to R, and that's what we do.
      //
      // But it turns out that, if we were to (correctly!) resolve the brackets from the outside in, we'd
      // notice that the *outer* bracket pair contains a R-type character, and will thus cause both outer
      // brackets to get resolved to R - which occurs after the L that we used to establish preceding
      // context for the inner bracket. So once we close the outer (containing) bracket, we realize our
      // mistake: the inner bracket pair should have been R after all.
      //
      // Luckily, resolving a bracket incorrect to the opposite of the embedding direction when it should
      // have been equal to the embedding direction is the only such mistake we can make. If a bracket
      // pair was deemed to have neutral contents (N0.d), it will always stay neutral. If we resolved it
      // to the embedding direction, it will never get updated again either.
      //
      // So we can salvage this: if we're in a case where the resolved direction of a containing bracket
      // (that we haven't resolved yet) would cause our resolved-to-opposite-the-embedding-direction
      // bracket to flip sense (because such a bracket would establish a different preceding strong type),
      // we need to be careful. This is the case when
      // a) we're about to resolve a bracket to opposite the embedding direction,
      // b) that bracket is nested inside at least one currently unresolved bracket,
      // c) there were no strong-type characters between the containing and the nested openings brackets
      //    (which we can infer from pba_state at the time), meaning that whatever strong type the outer
      //    bracket gets resolved to will "cascade through" and establish a preceding strong type for the
      //    inner bracket.
      //
      // We refer to such bracket pairs as "weak brackets", and we keep a stack of them (using the existing
      // link[] fields also used to connect pieces of IRSes; we only link ON-type characters here whereas
      // the IRS links are on isolate initiators and BNs, so there is no overlap).
      //
      // Whenever an outer containing bracket gets resolved to a strong type matching the embedding dir,
      // we pop all contained weak brackets and re-resolve them to match the embedding dir as well. If
      // we pop all the way down to have no remaining open brackets, we can clear the stack: with no more
      // prior positions (namely, positions of yet-unmatched open brackets) that might change their type
      // and establish a different preceding strong type than before, all decisions made when the weak
      // brackets were initially pushed on their stack are now final.
      if (orig_t >= BIDI_ONpba_first && orig_t <= BIDI_ONpba_last && pb_stack_ind <= BIDI_PBA_MAX_DEPTH) {
         U32 cp = text[i]; // current code point

         // Normalize: as of Unicode 9.0.0, there is exactly one canonically equivalent bracket pair.
         // (This special case is hardcoded in both ICU and the C BiDi ref impl brrule.c, so I don't feel
         // guilty about handling it as a hardcoded special case here.)
         if (cp >= 0x3008 && cp <= 0x3009)
            cp = (cp - 0x3008) + 0x2329;

         // if we have a deferred resolve, now is a good time to do it
         if (deferred_resolve_start != MAX_INDEX) {
            ihud_bidi_resolve_paired_bracket(b, deferred_resolve_start, deferred_resolve_type, irs->end_ind);
            deferred_resolve_start = MAX_INDEX;
         }

         if (orig_t == BIDI_ONcpb) {
            // Close bracket. Try to find matching one on stack.
            ihud_bidi_index j = pb_stack_ind;

            while (j-- > 0) {
               ihud_bidi_index k;
               ihud_bidi_pb *pbs = &pb_stack[j];
               if (cp != pbs->close_cp)
                  continue;

               // Fold run type since last bracket into embed_strong_type
               embed_strong_type = pba_state_table[embed_strong_type][GET_RUN_TYPE(pba_state)];

               // Pop back down to this level. To do this, we need to fold the previosly saved
               // embedded strong type resolutions for the inbetween paired brackets we're
               // closing into the current embed_strong_this_run (since they all count as within
               // the bracket pair we've identified.)
               for (k = pb_stack_ind; k > j + 1; --k)
                  embed_strong_type = pba_state_table[embed_strong_type][pb_stack[k - 1].str];

               // Rule N0.b/N0.c: if we saw a strong type, resolve bracket type. If the strong type matches
               // embedding dir, use it. If it doesn't, check preceding context. If both preceding
               // context and text inside bracket agree, may use strong type disagreeing with
               // embedding direction.
               if (GET_RUN_TYPE(embed_strong_type) != BIDI_NI) {
                  // The embed strong type right now can either match the embedding direction, or be
                  // opposite it. If it matches, this computation simply returns the embedding
                  // direction (since it "wins" in case of ties). If it is opposite, the result
                  // depends on the saved pbs->pba_state, which contains the preceding strong type
                  // at the time the of the matched opening bracket.
                  //
                  // If the preceding strong type at the time of the matched opening bracket was
                  // equal to the embedding direction, that decides it: the resolved type is always
                  // going to be the embedding direction. If it was opposite, resolve to the
                  // opposite of the embedding direction, but see notes on weak brackets below.
                  U8 resolved_type = GET_RUN_TYPE(pba_state_table[embed_strong_type][GET_PREV_STRONG(pbs->pba_state)]);

                  // Rule N0: handling of NSMs following paired brackets
                  ihud_bidi_resolve_paired_bracket(b, pbs->open_ind, resolved_type, irs->end_ind);

                  // Defer resolving of close bracket: we haven't run the weak state machine for
                  // the following types yet. N0 runs after the weak types phase, so we wait to
                  // finish weak types processing for the characters that follow the close bracket
                  // before we perform the resolve (which might override the result types for
                  // characters that were NSMs).
                  deferred_resolve_start = i;
                  deferred_resolve_type = resolved_type;

                  TRACE("(%u,%u) paired bracket! resolved=%d embedding_dir=%d\n", pbs->open_ind, i, resolved_type, irs->level & 1);

                  // We resolved to a strong type; this establishes context for later brackets.
                  pba_state = STATE_ID(resolved_type, GET_EMBED_DIR(pba_state), BIDI_NI);

                  // Is the resolved type equal or opposite to the current embedding direction?
                  if (resolved_type == GET_EMBED_DIR(pba_state)) { // equal
                     // A containing resolved bracket matching the current embedding direction establishes
                     // context for all contained weak brackets; time to RE-RESOLVE THEM ALL!
                     // Once we've resolved them to the embedding direction, these brackets can never change
                     // type again, so we can discard their weak bracket records. So just keep popping down
                     // the stack.
                     while (weak_tos != pbs->open_weak_tos) {
                        TRACE("  re-resolve weak bracket at %u\n", weak_tos);
                        ihud_bidi_resolve_paired_bracket(b, weak_tos, resolved_type, irs->end_ind);
                        weak_tos = b[weak_tos].link;
                     }
                  } else { // opposite
                     // If the resolved type is opposite the current embedding direction and there are other
                     // brackets deeper on the stack, this is a "weak" decision: we can later find out that
                     // one of the containing brackets is strongly in the embedding direction, which in turn
                     // will retroactively establish context (as per Rule N0c) for the current bracket and make
                     // it flip direction. So if this is a weak decision, record the position of the opening and
                     // closing brackets so we can fix it up later in case we have to.
                     if (j > 0) { // other brackets are deeper on the stack; might have to revise.
                        // If there was no strong type between the containing opening bracket and this one,
                        // we have a weak bracket.
                        if (GET_RUN_TYPE(pbs->pba_state) == BIDI_NI) {
                           TRACE("  push weak bracket pair (%u,%u)\n", pbs->open_ind, i);
                           // Push onto weak bracket stack.
                           // NOTE: safe to use "link" for this since weak brackets only occur on ONs, whereas
                           // skips only occur on isolate initiators or chars removed by X9.
                           b[i].link = pbs->open_ind;
                           b[pbs->open_ind].link = weak_tos;
                           weak_tos = i;
                        }
                     } else {
                        // If the bracket we're closing is the last one on the stack, we can discard all pending
                        // weak bracket records; they cannot change anymore (since there are no more un-closed
                        // open brackets left that could establish prior context post-resolution)
                        weak_tos = MAX_INDEX;
                        TRACE("  discard weak brackets\n");
                     }
                  }
               }

               // Finally, merge strong type status from before this bracket pair into
               // the current state, now that we actually pop.
               embed_strong_type = pba_state_table[embed_strong_type][pb_stack[j].str];
               TRACE("  post-pop estc=%d\n", GET_RUN_TYPE(embed_strong_type));
               pb_stack_ind = j; // pop!
               break;
            }
         } else {
            // open bracket
            static const signed char close_offs[4] = { 1,2,3,-1 }; // matches ON_opb*
            ihud_bidi_pb *pbs = &pb_stack[pb_stack_ind++];

            // Fold run type since last bracket into embed_strong_type
            embed_strong_type = pba_state_table[embed_strong_type][GET_RUN_TYPE(pba_state)];
            TRACE("open bracket pos=%u estc=%d\n", i, GET_RUN_TYPE(embed_strong_type));

            RR_ASSERT(orig_t >= BIDI_ONopb1 && orig_t <= BIDI_ONopbn1);
            pbs->open_ind = i;
            pbs->open_weak_tos = weak_tos;
            pbs->close_cp = cp + close_offs[orig_t - BIDI_ONopb1];
            pbs->pba_state = pba_state;
            pbs->str = GET_RUN_TYPE(embed_strong_type); // saved type from containing bracket
            pba_state = STATE_ID(GET_PREV_STRONG(pba_state), GET_EMBED_DIR(pba_state), BIDI_NI); // reset run type to neutral
            embed_strong_type = pba_state;
         }
      }
   }

   // NOTE: Rules W4/W5 might be in a lookahead state waiting for a following EN/AN,
   // which gets resolved once we feed in eos.
   //
   // However, the current state machine implementation of these rules assumes there is
   // no following EN/AN (and outputs an ON type); in case that turns out to be wrong, we
   // mark the position where we need to start fixing up and override the whole run.
   // But eos is always L or R, neither of which matches EN/AN, so at this point there's
   // simply no chance that the tentative decision for ON can get revised.

   // If we have a pending deferred bracket resolve, handle it
   if (deferred_resolve_start != MAX_INDEX)
      ihud_bidi_resolve_paired_bracket(b, deferred_resolve_start, deferred_resolve_type, irs->end_ind);

 #undef STATE_ID
 #undef GET_PREV_STRONG
 #undef GET_EMBED_DIR
 #undef GET_RUN_TYPE
 }