Created
June 20, 2018 21:05
-
-
Save vtjnash/12729675f3143a8336d5339ae072c739 to your computer and use it in GitHub Desktop.
Patch to PCRE2 to handle ill-formed UTF-8 data by substituting U+FFFD in the UTF-8 decoding macros (assuming it is safe to skip the length check)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff -prU12 pcre2-10.30-release/src/pcre2_internal.h pcre2-10.30/src/pcre2_internal.h | |
--- pcre2-10.30-release/src/pcre2_internal.h 2017-07-19 12:00:20.000000000 -0400 | |
+++ pcre2-10.30/src/pcre2_internal.h 2018-06-20 17:03:09.000000000 -0400 | |
@@ -271,103 +271,157 @@ is not supported. */ | |
/* The following macros were originally written in the form of loops that used | |
data from the tables whose names start with PRIV(utf8_table). They were | |
rewritten by a user so as not to use loops, because in some environments this | |
gives a significant performance advantage, and it seems never to do any harm. | |
*/ | |
/* Base macro to pick up the remaining bytes of a UTF-8 character, not | |
advancing the pointer. */ | |
#define GETUTF8(c, eptr) \ | |
{ \ | |
- if ((c & 0x20u) == 0) \ | |
+ if ((eptr[1] & 0xc0) != 0x80) \ | |
+ c = 0xFFFD; \ | |
+ else if ((c & 0x20u) == 0) \ | |
c = ((c & 0x1fu) << 6) | (eptr[1] & 0x3fu); \ | |
+ else if ((eptr[2] & 0xc0) != 0x80) \ | |
+ c = 0xFFFD; \ | |
else if ((c & 0x10u) == 0) \ | |
c = ((c & 0x0fu) << 12) | ((eptr[1] & 0x3fu) << 6) | (eptr[2] & 0x3fu); \ | |
+ else if ((eptr[3] & 0xc0) != 0x80) \ | |
+ c = 0xFFFD; \ | |
else if ((c & 0x08u) == 0) \ | |
c = ((c & 0x07u) << 18) | ((eptr[1] & 0x3fu) << 12) | \ | |
((eptr[2] & 0x3fu) << 6) | (eptr[3] & 0x3fu); \ | |
+ else if ((eptr[4] & 0xc0) != 0x80) \ | |
+ c = 0xFFFD; \ | |
else if ((c & 0x04u) == 0) \ | |
c = ((c & 0x03u) << 24) | ((eptr[1] & 0x3fu) << 18) | \ | |
((eptr[2] & 0x3fu) << 12) | ((eptr[3] & 0x3fu) << 6) | \ | |
(eptr[4] & 0x3fu); \ | |
+ else if ((eptr[5] & 0xc0) != 0x80) \ | |
+ c = 0xFFFD; \ | |
else \ | |
c = ((c & 0x01u) << 30) | ((eptr[1] & 0x3fu) << 24) | \ | |
((eptr[2] & 0x3fu) << 18) | ((eptr[3] & 0x3fu) << 12) | \ | |
((eptr[4] & 0x3fu) << 6) | (eptr[5] & 0x3fu); \ | |
} | |
/* Base macro to pick up the remaining bytes of a UTF-8 character, advancing | |
the pointer. */ | |
#define GETUTF8INC(c, eptr) \ | |
{ \ | |
- if ((c & 0x20u) == 0) \ | |
+ if ((*eptr & 0xc0) != 0x80) \ | |
+ c = 0xFFFD; \ | |
+ else if ((c & 0x20u) == 0) \ | |
c = ((c & 0x1fu) << 6) | (*eptr++ & 0x3fu); \ | |
+ else if ((eptr[1] & 0xc0) != 0x80) \ | |
+ { \ | |
+ c = 0xFFFD; \ | |
+ eptr += 1; \ | |
+ } \ | |
else if ((c & 0x10u) == 0) \ | |
{ \ | |
c = ((c & 0x0fu) << 12) | ((*eptr & 0x3fu) << 6) | (eptr[1] & 0x3fu); \ | |
eptr += 2; \ | |
} \ | |
+ else if ((eptr[2] & 0xc0) != 0x80) \ | |
+ { \ | |
+ c = 0xFFFD; \ | |
+ eptr += 2; \ | |
+ } \ | |
else if ((c & 0x08u) == 0) \ | |
{ \ | |
c = ((c & 0x07u) << 18) | ((*eptr & 0x3fu) << 12) | \ | |
((eptr[1] & 0x3fu) << 6) | (eptr[2] & 0x3fu); \ | |
eptr += 3; \ | |
} \ | |
+ else if ((eptr[3] & 0xc0) != 0x80) \ | |
+ { \ | |
+ c = 0xFFFD; \ | |
+ eptr += 3; \ | |
+ } \ | |
else if ((c & 0x04u) == 0) \ | |
{ \ | |
c = ((c & 0x03u) << 24) | ((*eptr & 0x3fu) << 18) | \ | |
((eptr[1] & 0x3fu) << 12) | ((eptr[2] & 0x3fu) << 6) | \ | |
(eptr[3] & 0x3fu); \ | |
eptr += 4; \ | |
} \ | |
+ else if ((eptr[4] & 0xc0) != 0x80) \ | |
+ { \ | |
+ c = 0xFFFD; \ | |
+ eptr += 4; \ | |
+ } \ | |
else \ | |
{ \ | |
c = ((c & 0x01u) << 30) | ((*eptr & 0x3fu) << 24) | \ | |
((eptr[1] & 0x3fu) << 18) | ((eptr[2] & 0x3fu) << 12) | \ | |
((eptr[3] & 0x3fu) << 6) | (eptr[4] & 0x3fu); \ | |
eptr += 5; \ | |
} \ | |
} | |
/* Base macro to pick up the remaining bytes of a UTF-8 character, not | |
advancing the pointer, incrementing the length. */ | |
#define GETUTF8LEN(c, eptr, len) \ | |
{ \ | |
- if ((c & 0x20u) == 0) \ | |
+ if ((eptr[1] & 0xc0) != 0x80) \ | |
+ c = 0xFFFD; \ | |
+ else if ((c & 0x20u) == 0) \ | |
{ \ | |
c = ((c & 0x1fu) << 6) | (eptr[1] & 0x3fu); \ | |
len++; \ | |
} \ | |
+ else if ((eptr[2] & 0xc0) != 0x80) \ | |
+ { \ | |
+ c = 0xFFFD; \ | |
+ len++; \ | |
+ } \ | |
else if ((c & 0x10u) == 0) \ | |
{ \ | |
c = ((c & 0x0fu) << 12) | ((eptr[1] & 0x3fu) << 6) | (eptr[2] & 0x3fu); \ | |
len += 2; \ | |
} \ | |
+ else if ((eptr[3] & 0xc0) != 0x80) \ | |
+ { \ | |
+ c = 0xFFFD; \ | |
+ len += 2; \ | |
+ } \ | |
else if ((c & 0x08u) == 0) \ | |
{\ | |
c = ((c & 0x07u) << 18) | ((eptr[1] & 0x3fu) << 12) | \ | |
((eptr[2] & 0x3fu) << 6) | (eptr[3] & 0x3fu); \ | |
len += 3; \ | |
} \ | |
+ else if ((eptr[4] & 0xc0) != 0x80) \ | |
+ { \ | |
+ c = 0xFFFD; \ | |
+ len += 3; \ | |
+ } \ | |
else if ((c & 0x04u) == 0) \ | |
{ \ | |
c = ((c & 0x03u) << 24) | ((eptr[1] & 0x3fu) << 18) | \ | |
((eptr[2] & 0x3fu) << 12) | ((eptr[3] & 0x3fu) << 6) | \ | |
(eptr[4] & 0x3fu); \ | |
len += 4; \ | |
} \ | |
+ else if ((eptr[5] & 0xc0) != 0x80) \ | |
+ { \ | |
+ c = 0xFFFD; \ | |
+ len += 4; \ | |
+ } \ | |
else \ | |
{\ | |
c = ((c & 0x01u) << 30) | ((eptr[1] & 0x3fu) << 24) | \ | |
((eptr[2] & 0x3fu) << 18) | ((eptr[3] & 0x3fu) << 12) | \ | |
((eptr[4] & 0x3fu) << 6) | (eptr[5] & 0x3fu); \ | |
len += 5; \ | |
} \ | |
} | |
/* --------------- Whitespace macros ---------------- */ | |
/* Tests for Unicode horizontal and vertical whitespace characters must check a |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment