Last active
August 7, 2022 10:49
-
-
Save snippins/2d9a66e6708bc258f662babe0a1c1009 to your computer and use it in GitHub Desktop.
Emacs 29 patch for regex lookaround - rev b9c65203d0f419306ac062e59a59643db9a1a541 - 2022-08-06 master
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --git a/src/regex-emacs.c b/src/regex-emacs.c | |
index 9b2c14c..015451d 100644 | |
--- a/src/regex-emacs.c | |
+++ b/src/regex-emacs.c | |
@@ -43,6 +43,8 @@ | |
# undef RE_DUP_MAX | |
#endif | |
#define RE_DUP_MAX (0xffff) | |
+#define PREV_CHAR_BOUNDARY(p, limit) ((p)--) | |
+#define STRING_CHAR_AND_LENGTH(p, actual_len) ((actual_len) = 1, *(p)) | |
/* Make syntax table lookup grant data in gl_state. */ | |
#define SYNTAX(c) syntax_property (c, 1) | |
@@ -321,7 +323,16 @@ #define BYTEWIDTH 8 /* In bits. */ | |
/* Matches any character whose syntax is not that specified. */ | |
notsyntaxspec, | |
+ lookahead, | |
+ lookahead_not, | |
+ lookbehind, | |
+ lookbehind_not, | |
+ lookaround_succeed, | |
+ lookaround_fail, | |
+ | |
+ before_dot, /* Succeeds if before point. */ | |
at_dot, /* Succeeds if at point. */ | |
+ after_dot, /* Succeeds if after point. */ | |
/* Matches any character whose category-set contains the specified | |
category. The operator is followed by a byte which contains a | |
@@ -542,6 +553,32 @@ print_partial_compiled_pattern (re_char *start, re_char *end) | |
fprintf (stderr, "/stop_memory/%d", *p++); | |
break; | |
+ case lookahead: | |
+ extract_number_and_incr (&mcnt, &p); | |
+ fprintf (stderr, "/lookahead/%d", mcnt); | |
+ break; | |
+ case lookahead_not: | |
+ extract_number_and_incr (&mcnt, &p); | |
+ fprintf (stderr, "/lookahead_not/%d", mcnt); | |
+ break; | |
+ case lookbehind: | |
+ extract_number_and_incr (&mcnt, &p); | |
+ extract_number_and_incr (&mcnt2, &p); | |
+ fprintf (stderr, "/lookbehind/%d/%d", mcnt, mcnt2); | |
+ break; | |
+ case lookbehind_not: | |
+ extract_number_and_incr (&mcnt, &p); | |
+ extract_number_and_incr (&mcnt2, &p); | |
+ fprintf (stderr, "/lookbehind_not/%d/%d", mcnt, mcnt2); | |
+ break; | |
+ case lookaround_succeed: | |
+ fprintf (stderr, "/lookaround_succeed"); | |
+ break; | |
+ case lookaround_fail: | |
+ fprintf (stderr, "/lookaround_fail"); | |
+ break; | |
+ | |
+ | |
case duplicate: | |
fprintf (stderr, "/duplicate/%d", *p++); | |
break; | |
@@ -706,10 +743,18 @@ print_partial_compiled_pattern (re_char *start, re_char *end) | |
fprintf (stderr, "/%d", mcnt); | |
break; | |
+ case before_dot: | |
+ fprintf (stderr, "/before_dot"); | |
+ break; | |
+ | |
case at_dot: | |
fputs ("/at_dot", stderr); | |
break; | |
+ case after_dot: | |
+ fprintf (stderr, "/after_dot"); | |
+ break; | |
+ | |
case categoryspec: | |
fputs ("/categoryspec", stderr); | |
mcnt = *p++; | |
@@ -1146,6 +1191,30 @@ #define POP_FAILURE_POINT(str, pat) \ | |
DEBUG_STATEMENT (nfailure_points_popped++); \ | |
} while (false) /* POP_FAILURE_POINT */ | |
+#define FINISH_LOOKAROUND() \ | |
+ do { \ | |
+ re_char *str, *pat; \ | |
+ re_opcode_t op; \ | |
+ discard_saved_regs = 1; \ | |
+ while (!FAIL_STACK_EMPTY ()) \ | |
+ { \ | |
+ POP_FAILURE_POINT (str, pat); \ | |
+ op = (re_opcode_t) *pat; \ | |
+ if (op == lookahead \ | |
+ || op == lookahead_not \ | |
+ || op == lookbehind \ | |
+ || op == lookbehind_not) \ | |
+ { \ | |
+ d = str; \ | |
+ dend = ((d >= string1 && d <= end1) \ | |
+ ? end_match_1 : end_match_2); \ | |
+ break; \ | |
+ } \ | |
+ } \ | |
+ discard_saved_regs = 0; \ | |
+ } while (0); | |
+ | |
+ | |
/* Registers are set to a sentinel when they haven't yet matched. */ | |
@@ -1285,6 +1354,7 @@ verify (LONG_MIN <= -(MAX_BUF_SIZE - 1) && MAX_BUF_SIZE - 1 <= LONG_MAX); | |
pattern_offset_t fixup_alt_jump; | |
pattern_offset_t laststart_offset; | |
regnum_t regnum; | |
+ int lookaround; | |
} compile_stack_elt_t; | |
@@ -1656,6 +1726,7 @@ extend_range_table_work_area (struct range_table_work_area *work_area) | |
/* regex_compile and helpers. */ | |
static bool group_in_compile_stack (compile_stack_type, regnum_t); | |
+static int exact_chars_in_pattern_buffer (struct re_pattern_buffer *bufp, re_char *p, re_char *pend); | |
/* Insert the 'jump' from the end of last alternative to "here". | |
The space for the jump has already been allocated. */ | |
@@ -2188,6 +2259,7 @@ regex_compile (re_char *pattern, ptrdiff_t size, | |
case '(': | |
{ | |
bool shy = false; | |
+ int lookaround = 0; | |
regnum_t regnum = 0; | |
if (p+1 < pend) | |
{ | |
@@ -2214,6 +2286,28 @@ regex_compile (re_char *pattern, ptrdiff_t size, | |
®num)) | |
FREE_STACK_RETURN (REG_ESIZE); | |
break; | |
+ case '=': | |
+ /* Positive lookahead assertion. */ | |
+ shy = lookaround = 1; | |
+ break; | |
+ case '!': | |
+ /* Negative lookahead assertion. */ | |
+ shy = lookaround = 2; | |
+ break; | |
+ case '<': | |
+ { | |
+ PATFETCH (c); | |
+ if (c == '=') | |
+ /* Positive lookbehind assertion. */ | |
+ shy = lookaround = -1; | |
+ else if (c == '!') | |
+ /* Negative lookbehind assertion. */ | |
+ shy = lookaround = -2; | |
+ else | |
+ FREE_STACK_RETURN (REG_BADPAT); | |
+ } | |
+ break; | |
+ | |
default: | |
/* Only (?:...) is supported right now. */ | |
FREE_STACK_RETURN (REG_BADPAT); | |
@@ -2256,6 +2350,7 @@ regex_compile (re_char *pattern, ptrdiff_t size, | |
= fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0; | |
COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer; | |
COMPILE_STACK_TOP.regnum = regnum; | |
+ COMPILE_STACK_TOP.lookaround = lookaround; | |
/* Do not push a start_memory for groups beyond the last one | |
we can represent in the compiled pattern. */ | |
@@ -2292,6 +2387,7 @@ regex_compile (re_char *pattern, ptrdiff_t size, | |
later groups should continue to be numbered higher, | |
as in '(ab)c(de)' -- the second group is #2. */ | |
regnum_t regnum; | |
+ int lookaround; | |
compile_stack.avail--; | |
begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset; | |
@@ -2304,13 +2400,40 @@ regex_compile (re_char *pattern, ptrdiff_t size, | |
/* If we've reached MAX_REGNUM groups, then this open | |
won't actually generate any code, so we'll have to | |
clear pending_exact explicitly. */ | |
+ lookaround = COMPILE_STACK_TOP.lookaround; | |
pending_exact = 0; | |
/* We're at the end of the group, so now we know how many | |
groups were inside this one. */ | |
if (regnum <= MAX_REGNUM && regnum > 0) | |
BUF_PUSH_2 (stop_memory, regnum); | |
- } | |
+ else if (lookaround) | |
+ { | |
+ if (lookaround > 0) | |
+ { | |
+ /* Positive/negative lookahead assertion. */ | |
+ GET_BUFFER_SPACE (3); | |
+ INSERT_JUMP (lookaround == 1 ? lookahead : lookahead_not, laststart, b + 4); | |
+ b += 3; | |
+ } | |
+ else | |
+ { | |
+ /* Positive/negative lookbehind assertion. */ | |
+ int count = exact_chars_in_pattern_buffer (bufp, laststart, b); | |
+ if (count == -1) /* variable length */ | |
+ FREE_STACK_RETURN (REG_BADPAT); | |
+ | |
+ GET_BUFFER_SPACE (5); | |
+ INSERT_JUMP2 (lookaround == -1 ? lookbehind : lookbehind_not, laststart, b + 6, count); | |
+ b += 5; | |
+ } | |
+ | |
+ /* Negative form. */ | |
+ if (lookaround > 1 || lookaround < -1) | |
+ BUF_PUSH (lookaround_fail); | |
+ BUF_PUSH (lookaround_succeed); | |
+ } | |
+ } | |
break; | |
@@ -2995,11 +3118,19 @@ analyze_first (re_char *p, re_char *pend, char *fastmap, bool multibyte) | |
match_any_multibyte_characters = true; | |
} | |
break; | |
+ case lookahead: | |
+ case lookahead_not: | |
+ case lookbehind: | |
+ case lookbehind_not: | |
+ if (!fastmap) break; | |
+ return -1; | |
/* All cases after this match the empty string. These end with | |
'continue'. */ | |
+ case before_dot: | |
case at_dot: | |
+ case after_dot: | |
case no_op: | |
case begline: | |
case endline: | |
@@ -3573,6 +3704,96 @@ skip_noops (re_char *p, re_char *pend) | |
return p; | |
} | |
+ | |
+static int | |
+exact_chars_in_pattern_buffer (bufp, p, pend) | |
+ struct re_pattern_buffer *bufp; | |
+ re_char *p, *pend; | |
+{ | |
+ int count = 0; | |
+ while (p < pend) | |
+ { | |
+ switch (*p++) | |
+ { | |
+ case exactn: | |
+ { | |
+ int mcnt = *p++; | |
+ int buf_charlen; | |
+ while (mcnt > 0) { | |
+ STRING_CHAR_AND_LENGTH (p, buf_charlen); | |
+ p += buf_charlen; | |
+ mcnt -= buf_charlen; | |
+ count++; | |
+ } | |
+ } | |
+ break; | |
+ case start_memory: | |
+ case stop_memory: | |
+ p++; | |
+ break; | |
+#ifdef emacs | |
+ case categoryspec: | |
+ case notcategoryspec: | |
+#endif /* emacs */ | |
+ case syntaxspec: | |
+ case notsyntaxspec: | |
+ p++; | |
+ case anychar: | |
+ count++; | |
+ break; | |
+ | |
+ case charset: | |
+ case charset_not: | |
+ if (CHARSET_RANGE_TABLE_EXISTS_P (p - 1)) | |
+ { | |
+ int mcnt; | |
+ p = CHARSET_RANGE_TABLE (p - 1); | |
+ EXTRACT_NUMBER_AND_INCR (mcnt, p); | |
+ p = CHARSET_RANGE_TABLE_END (p, mcnt); | |
+ } | |
+ else | |
+ p += 1 + CHARSET_BITMAP_SIZE (p - 1); | |
+ count++; | |
+ break; | |
+ | |
+#ifdef emacs | |
+ case before_dot: | |
+ case at_dot: | |
+ case after_dot: | |
+#endif /* emacs */ | |
+ case no_op: | |
+ case begline: | |
+ case endline: | |
+ case begbuf: | |
+ case endbuf: | |
+ case wordbound: | |
+ case notwordbound: | |
+ case wordbeg: | |
+ case wordend: | |
+ case symbeg: | |
+ case symend: | |
+ /* Zero width. */ | |
+ continue; | |
+ case lookahead: | |
+ case lookahead_not: | |
+ case lookbehind: | |
+ case lookbehind_not: | |
+ /* Skip to lookaround_success. */ | |
+ while (p < pend) | |
+ { | |
+ if ((re_opcode_t) *p++ == lookaround_succeed) | |
+ break; | |
+ } | |
+ break; | |
+ default: | |
+ return -1; | |
+ } | |
+ } | |
+ return count; | |
+} | |
+ | |
+ | |
+ | |
/* Test if C matches charset op. *PP points to the charset or charset_not | |
opcode. When the function finishes, *PP will be advanced past that opcode. | |
C is character to test (possibly after translations) and CORIG is original | |
@@ -3944,6 +4165,9 @@ re_match_2_internal (struct re_pattern_buffer *bufp, | |
bool best_regs_set = false; | |
re_char **best_regstart UNINIT, **best_regend UNINIT; | |
+ /* Discard a saved register from the stack. */ | |
+ bool discard_saved_regs = 0; | |
+ | |
/* Logically, this is 'best_regend[0]'. But we don't want to have to | |
allocate space for that if we're not allocating space for anything | |
else (see below). Also, we never need info about register 0 for | |
@@ -4438,6 +4662,77 @@ re_match_2_internal (struct re_pattern_buffer *bufp, | |
p += 1; | |
break; | |
+ case lookahead: | |
+ case lookahead_not: | |
+ DEBUG_PRINT ((re_opcode_t) *(p - 1) == lookahead ? "EXECUTING lookahead.\n" : "EXECUTING lookahead_not.\n"); | |
+ | |
+ p += 2; | |
+ PUSH_FAILURE_POINT (p - 3, d); | |
+ break; | |
+ | |
+ case lookbehind: | |
+ case lookbehind_not: | |
+ { | |
+ int mcnt, count_local; | |
+ bool not = (re_opcode_t) *(p - 1) != lookbehind; | |
+ | |
+ EXTRACT_NUMBER_AND_INCR (mcnt, p); | |
+ EXTRACT_NUMBER_AND_INCR (count_local, p); | |
+ | |
+ DEBUG_PRINT (not | |
+ ? "EXECUTING lookbehind_not %d.\n" | |
+ : "EXECUTING lookbehind %d.\n", count); | |
+ | |
+ dfail = d; | |
+ while (d != string1 && count_local > 0) | |
+ { | |
+ if (d == string2) | |
+ { | |
+ if (!string1) | |
+ break; | |
+ d = end1; | |
+ dend = end_match_1; | |
+ } | |
+ | |
+ if (target_multibyte) | |
+ { | |
+ re_char *dhead = (d >= string1 && d <= end1) ? string1 : string2; | |
+ PREV_CHAR_BOUNDARY (d, dhead); | |
+ } | |
+ else | |
+ d--; | |
+ count_local--; | |
+ } | |
+ | |
+ if (count_local > 0) | |
+ { | |
+ if (not) | |
+ { | |
+ /* There is no enough string to match. | |
+ So just make it succeeded here. */ | |
+ d = dfail; | |
+ p = p - 2 + mcnt; | |
+ break; | |
+ } | |
+ else | |
+ goto fail; | |
+ } | |
+ | |
+ PUSH_FAILURE_POINT (p - 5, dfail); | |
+ } | |
+ break; | |
+ | |
+ case lookaround_succeed: | |
+ DEBUG_PRINT ("EXECUTING lookaround_succeed.\n"); | |
+ | |
+ FINISH_LOOKAROUND(); | |
+ break; | |
+ | |
+ case lookaround_fail: | |
+ DEBUG_PRINT ("EXECUTING lookaround_fail.\n"); | |
+ | |
+ FINISH_LOOKAROUND(); | |
+ goto fail; | |
/* \<digit> has been turned into a 'duplicate' command which is | |
followed by the numeric value of <digit> as the register number. */ | |
@@ -5003,12 +5298,24 @@ re_match_2_internal (struct re_pattern_buffer *bufp, | |
} | |
break; | |
+ case before_dot: | |
+ DEBUG_PRINT ("EXECUTING before_dot.\n"); | |
+ if (PTR_BYTE_POS (d) >= PT_BYTE) | |
+ goto fail; | |
+ break; | |
+ | |
case at_dot: | |
DEBUG_PRINT ("EXECUTING at_dot.\n"); | |
if (PTR_BYTE_POS (d) != PT_BYTE) | |
goto fail; | |
break; | |
+ case after_dot: | |
+ DEBUG_PRINT ("EXECUTING after_dot.\n"); | |
+ if (PTR_BYTE_POS (d) <= PT_BYTE) | |
+ goto fail; | |
+ break; | |
+ | |
case categoryspec: | |
case notcategoryspec: | |
{ | |
@@ -5058,12 +5365,16 @@ re_match_2_internal (struct re_pattern_buffer *bufp, | |
case on_failure_jump_loop: | |
case on_failure_jump: | |
case succeed_n: | |
+ case lookahead_not: | |
+ case lookbehind_not: | |
d = str; | |
continue_failure_jump: | |
EXTRACT_NUMBER_AND_INCR (mcnt, pat); | |
p = pat + mcnt; | |
break; | |
+ case lookahead: | |
+ case lookbehind: | |
case no_op: | |
/* A special frame used for nastyloops. */ | |
goto fail; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Adapted for Emacs 29 from this:
https://lists.gnu.org/archive/html/emacs-devel/2009-06/msg00094.html
Spec:
[x] (?<=[0-9]+)MB
[o] (?<=[0-9][0-9][0-9][0-9])MB
(re-search-backward "(?<=a)b") for buffer "abca_|_b"
will seek to first "ab".