Skip to content

Instantly share code, notes, and snippets.

@vtjnash
Created June 20, 2018 21:05
Show Gist options
  • Save vtjnash/12729675f3143a8336d5339ae072c739 to your computer and use it in GitHub Desktop.
Save vtjnash/12729675f3143a8336d5339ae072c739 to your computer and use it in GitHub Desktop.
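Patch to PCRE2 10.30 to handle ill-formed UTF-8 data: a multi-byte sequence whose next expected continuation byte is missing decodes to U+FFFD instead of being combined blindly (assuming it is safe to skip the length check, i.e. to read the bytes that follow the lead byte without a bounds check)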
diff -prU12 pcre2-10.30-release/src/pcre2_internal.h pcre2-10.30/src/pcre2_internal.h
--- pcre2-10.30-release/src/pcre2_internal.h 2017-07-19 12:00:20.000000000 -0400
+++ pcre2-10.30/src/pcre2_internal.h 2018-06-20 17:03:09.000000000 -0400
@@ -271,103 +271,157 @@ is not supported. */
/* The following macros were originally written in the form of loops that used
data from the tables whose names start with PRIV(utf8_table). They were
rewritten by a user so as not to use loops, because in some environments this
gives a significant performance advantage, and it seems never to do any harm.
*/
/* Base macro to pick up the remaining bytes of a UTF-8 character, not
advancing the pointer. */
#define GETUTF8(c, eptr) \
{ \
- if ((c & 0x20u) == 0) \
+ if ((eptr[1] & 0xc0) != 0x80) \
+ c = 0xFFFD; \
+ else if ((c & 0x20u) == 0) \
c = ((c & 0x1fu) << 6) | (eptr[1] & 0x3fu); \
+ else if ((eptr[2] & 0xc0) != 0x80) \
+ c = 0xFFFD; \
else if ((c & 0x10u) == 0) \
c = ((c & 0x0fu) << 12) | ((eptr[1] & 0x3fu) << 6) | (eptr[2] & 0x3fu); \
+ else if ((eptr[3] & 0xc0) != 0x80) \
+ c = 0xFFFD; \
else if ((c & 0x08u) == 0) \
c = ((c & 0x07u) << 18) | ((eptr[1] & 0x3fu) << 12) | \
((eptr[2] & 0x3fu) << 6) | (eptr[3] & 0x3fu); \
+ else if ((eptr[4] & 0xc0) != 0x80) \
+ c = 0xFFFD; \
else if ((c & 0x04u) == 0) \
c = ((c & 0x03u) << 24) | ((eptr[1] & 0x3fu) << 18) | \
((eptr[2] & 0x3fu) << 12) | ((eptr[3] & 0x3fu) << 6) | \
(eptr[4] & 0x3fu); \
+ else if ((eptr[5] & 0xc0) != 0x80) \
+ c = 0xFFFD; \
else \
c = ((c & 0x01u) << 30) | ((eptr[1] & 0x3fu) << 24) | \
((eptr[2] & 0x3fu) << 18) | ((eptr[3] & 0x3fu) << 12) | \
((eptr[4] & 0x3fu) << 6) | (eptr[5] & 0x3fu); \
}
/* Base macro to pick up the remaining bytes of a UTF-8 character, advancing
the pointer. */
#define GETUTF8INC(c, eptr) \
{ \
- if ((c & 0x20u) == 0) \
+ if ((*eptr & 0xc0) != 0x80) \
+ c = 0xFFFD; \
+ else if ((c & 0x20u) == 0) \
c = ((c & 0x1fu) << 6) | (*eptr++ & 0x3fu); \
+ else if ((eptr[1] & 0xc0) != 0x80) \
+ { \
+ c = 0xFFFD; \
+ eptr += 1; \
+ } \
else if ((c & 0x10u) == 0) \
{ \
c = ((c & 0x0fu) << 12) | ((*eptr & 0x3fu) << 6) | (eptr[1] & 0x3fu); \
eptr += 2; \
} \
+ else if ((eptr[2] & 0xc0) != 0x80) \
+ { \
+ c = 0xFFFD; \
+ eptr += 2; \
+ } \
else if ((c & 0x08u) == 0) \
{ \
c = ((c & 0x07u) << 18) | ((*eptr & 0x3fu) << 12) | \
((eptr[1] & 0x3fu) << 6) | (eptr[2] & 0x3fu); \
eptr += 3; \
} \
+ else if ((eptr[3] & 0xc0) != 0x80) \
+ { \
+ c = 0xFFFD; \
+ eptr += 3; \
+ } \
else if ((c & 0x04u) == 0) \
{ \
c = ((c & 0x03u) << 24) | ((*eptr & 0x3fu) << 18) | \
((eptr[1] & 0x3fu) << 12) | ((eptr[2] & 0x3fu) << 6) | \
(eptr[3] & 0x3fu); \
eptr += 4; \
} \
+ else if ((eptr[4] & 0xc0) != 0x80) \
+ { \
+ c = 0xFFFD; \
+ eptr += 4; \
+ } \
else \
{ \
c = ((c & 0x01u) << 30) | ((*eptr & 0x3fu) << 24) | \
((eptr[1] & 0x3fu) << 18) | ((eptr[2] & 0x3fu) << 12) | \
((eptr[3] & 0x3fu) << 6) | (eptr[4] & 0x3fu); \
eptr += 5; \
} \
}
/* Base macro to pick up the remaining bytes of a UTF-8 character, not
advancing the pointer, incrementing the length. */
#define GETUTF8LEN(c, eptr, len) \
{ \
- if ((c & 0x20u) == 0) \
+ if ((eptr[1] & 0xc0) != 0x80) \
+ c = 0xFFFD; \
+ else if ((c & 0x20u) == 0) \
{ \
c = ((c & 0x1fu) << 6) | (eptr[1] & 0x3fu); \
len++; \
} \
+ else if ((eptr[2] & 0xc0) != 0x80) \
+ { \
+ c = 0xFFFD; \
+ len++; \
+ } \
else if ((c & 0x10u) == 0) \
{ \
c = ((c & 0x0fu) << 12) | ((eptr[1] & 0x3fu) << 6) | (eptr[2] & 0x3fu); \
len += 2; \
} \
+ else if ((eptr[3] & 0xc0) != 0x80) \
+ { \
+ c = 0xFFFD; \
+ len += 2; \
+ } \
else if ((c & 0x08u) == 0) \
{\
c = ((c & 0x07u) << 18) | ((eptr[1] & 0x3fu) << 12) | \
((eptr[2] & 0x3fu) << 6) | (eptr[3] & 0x3fu); \
len += 3; \
} \
+ else if ((eptr[4] & 0xc0) != 0x80) \
+ { \
+ c = 0xFFFD; \
+ len += 3; \
+ } \
else if ((c & 0x04u) == 0) \
{ \
c = ((c & 0x03u) << 24) | ((eptr[1] & 0x3fu) << 18) | \
((eptr[2] & 0x3fu) << 12) | ((eptr[3] & 0x3fu) << 6) | \
(eptr[4] & 0x3fu); \
len += 4; \
} \
+ else if ((eptr[5] & 0xc0) != 0x80) \
+ { \
+ c = 0xFFFD; \
+ len += 4; \
+ } \
else \
{\
c = ((c & 0x01u) << 30) | ((eptr[1] & 0x3fu) << 24) | \
((eptr[2] & 0x3fu) << 18) | ((eptr[3] & 0x3fu) << 12) | \
((eptr[4] & 0x3fu) << 6) | (eptr[5] & 0x3fu); \
len += 5; \
} \
}
/* --------------- Whitespace macros ---------------- */
/* Tests for Unicode horizontal and vertical whitespace characters must check a
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment