Skip to content

Instantly share code, notes, and snippets.

@iwamot
Created April 21, 2012 08:22
Show Gist options
  • Save iwamot/2435590 to your computer and use it in GitHub Desktop.
Save iwamot/2435590 to your computer and use it in GitHub Desktop.
--- ext/standard/html.c.org 2007-12-31 16:22:52.000000000 +0900
+++ ext/standard/html.c 2012-03-07 23:33:19.000000000 +0900
@@ -439,19 +439,29 @@
} \
mbseq[mbpos++] = (mbchar); }
+#define CHECK_LEN(pos, chars_need) /* leave get_next_char early when fewer than chars_need bytes remain at pos */ \
+ if((str_len - (pos)) < (chars_need)) { \
+ *newpos = (pos); \
+ *status = FAILURE; return 0; /* a truncated sequence is invalid input, not a valid NUL character */ \
+ }
+
/* {{{ get_next_char
*/
inline static unsigned short get_next_char(enum entity_charset charset,
unsigned char * str,
+ int str_len,
int * newpos,
unsigned char * mbseq,
- int * mbseqlen)
+ int * mbseqlen,
+ int *status)
{
int pos = *newpos;
int mbpos = 0;
int mbspace = *mbseqlen;
unsigned short this_char = str[pos++];
+ *status = SUCCESS;
+
if (mbspace <= 0) {
*mbseqlen = 0;
return this_char;
@@ -463,83 +473,183 @@
case cs_utf_8:
{
unsigned long utf = 0;
- int stat = 0;
- int more = 1;
- /* unpack utf-8 encoding into a wide char.
- * Code stolen from the mbstring extension */
+ if (this_char >= 0xc2 && this_char <= 0xdf) {
+ utf = (this_char & 0x1f) << 6;
+ CHECK_LEN(pos, 1);
+
+ this_char = str[pos++];
+ MB_WRITE((unsigned char)this_char);
+ if (this_char >= 0x80 && this_char <= 0xbf) {
+ utf |= (this_char & 0x3f);
+ this_char = (unsigned short)utf;
+ } else {
+ *status = FAILURE;
+ }
- do {
- if (this_char < 0x80) {
- more = 0;
- break;
- } else if (this_char < 0xc0) {
- switch (stat) {
- case 0x10: /* 2, 2nd */
- case 0x21: /* 3, 3rd */
- case 0x32: /* 4, 4th */
- case 0x43: /* 5, 5th */
- case 0x54: /* 6, 6th */
- /* last byte in sequence */
- more = 0;
- utf |= (this_char & 0x3f);
- this_char = (unsigned short)utf;
- break;
- case 0x20: /* 3, 2nd */
- case 0x31: /* 4, 3rd */
- case 0x42: /* 5, 4th */
- case 0x53: /* 6, 5th */
- /* penultimate char */
- utf |= ((this_char & 0x3f) << 6);
- stat++;
- break;
- case 0x30: /* 4, 2nd */
- case 0x41: /* 5, 3rd */
- case 0x52: /* 6, 4th */
- utf |= ((this_char & 0x3f) << 12);
- stat++;
- break;
- case 0x40: /* 5, 2nd */
- case 0x51:
- utf |= ((this_char & 0x3f) << 18);
- stat++;
- break;
- case 0x50: /* 6, 2nd */
- utf |= ((this_char & 0x3f) << 24);
- stat++;
- break;
- default:
- /* invalid */
- more = 0;
- }
- }
- /* lead byte */
- else if (this_char < 0xe0) {
- stat = 0x10; /* 2 byte */
- utf = (this_char & 0x1f) << 6;
- } else if (this_char < 0xf0) {
- stat = 0x20; /* 3 byte */
- utf = (this_char & 0xf) << 12;
- } else if (this_char < 0xf8) {
- stat = 0x30; /* 4 byte */
- utf = (this_char & 0x7) << 18;
- } else if (this_char < 0xfc) {
- stat = 0x40; /* 5 byte */
- utf = (this_char & 0x3) << 24;
- } else if (this_char < 0xfe) {
- stat = 0x50; /* 6 byte */
- utf = (this_char & 0x1) << 30;
+ } else if (this_char == 0xe0) {
+ utf = (this_char & 0xf) << 12;
+ CHECK_LEN(pos, 2);
+
+ this_char = str[pos++];
+ MB_WRITE((unsigned char)this_char);
+ if (this_char >= 0xa0 && this_char <= 0xbf) {
+ utf |= ((this_char & 0x3f) << 6);
+ this_char = (unsigned short)utf;
} else {
- /* invalid; bail */
- more = 0;
- break;
+ *status = FAILURE;
+ }
+
+ this_char = str[pos++];
+ MB_WRITE((unsigned char)this_char);
+ if (this_char >= 0x80 && this_char <= 0xbf) {
+ utf |= (this_char & 0x3f);
+ this_char = (unsigned short)utf;
+ } else {
+ *status = FAILURE;
+ }
+
+ } else if ((this_char >= 0xe1 && this_char <= 0xec) ||
+ (this_char >= 0xee && this_char <= 0xef)) {
+ utf = (this_char & 0xf) << 12;
+ CHECK_LEN(pos, 2);
+
+ this_char = str[pos++];
+ MB_WRITE((unsigned char)this_char);
+ if (this_char >= 0x80 && this_char <= 0xbf) {
+ utf |= ((this_char & 0x3f) << 6);
+ this_char = (unsigned short)utf;
+ } else {
+ *status = FAILURE;
+ }
+
+ this_char = str[pos++];
+ MB_WRITE((unsigned char)this_char);
+ if (this_char >= 0x80 && this_char <= 0xbf) {
+ utf |= (this_char & 0x3f);
+ this_char = (unsigned short)utf;
+ } else {
+ *status = FAILURE;
+ }
+
+ } else if (this_char == 0xed) { /* lead byte 0xED: three-byte sequences for U+D000..U+D7FF */
+ utf = (this_char & 0xf) << 12;
+ CHECK_LEN(pos, 2);
+
+ this_char = str[pos++];
+ MB_WRITE((unsigned char)this_char);
+ if (this_char >= 0x80 && this_char <= 0x9f) { /* 0xA0..0xBF here would encode a surrogate (U+D800..U+DFFF) and must fail */
+ utf |= ((this_char & 0x3f) << 6);
+ this_char = (unsigned short)utf;
+ } else {
+ *status = FAILURE;
+ }
+
+ this_char = str[pos++];
+ MB_WRITE((unsigned char)this_char);
+ if (this_char >= 0x80 && this_char <= 0xbf) { /* ordinary continuation byte */
+ utf |= (this_char & 0x3f);
+ this_char = (unsigned short)utf;
+ } else {
+ *status = FAILURE;
+ }
+
+ } else if (this_char == 0xf0) {
+ utf = (this_char & 0x7) << 18;
+ CHECK_LEN(pos, 3);
+
+ this_char = str[pos++];
+ MB_WRITE((unsigned char)this_char);
+ if (this_char >= 0x90 && this_char <= 0xbf) {
+ utf |= ((this_char & 0x3f) << 12);
+ this_char = (unsigned short)utf;
+ } else {
+ *status = FAILURE;
+ }
+
+ this_char = str[pos++];
+ MB_WRITE((unsigned char)this_char);
+ if (this_char >= 0x80 && this_char <= 0xbf) {
+ utf |= ((this_char & 0x3f) << 6);
+ this_char = (unsigned short)utf;
+ } else {
+ *status = FAILURE;
+ }
+
+ this_char = str[pos++];
+ MB_WRITE((unsigned char)this_char);
+ if (this_char >= 0x80 && this_char <= 0xbf) {
+ utf |= (this_char & 0x3f);
+ this_char = (unsigned short)utf;
+ } else {
+ *status = FAILURE;
+ }
+
+ } else if (this_char >= 0xf1 && this_char <= 0xf3) {
+ utf = (this_char & 0x7) << 18;
+ CHECK_LEN(pos, 3);
+
+ this_char = str[pos++];
+ MB_WRITE((unsigned char)this_char);
+ if (this_char >= 0x80 && this_char <= 0xbf) {
+ utf |= ((this_char & 0x3f) << 12);
+ this_char = (unsigned short)utf;
+ } else {
+ *status = FAILURE;
+ }
+
+ this_char = str[pos++];
+ MB_WRITE((unsigned char)this_char);
+ if (this_char >= 0x80 && this_char <= 0xbf) {
+ utf |= ((this_char & 0x3f) << 6);
+ this_char = (unsigned short)utf;
+ } else {
+ *status = FAILURE;
+ }
+
+ this_char = str[pos++];
+ MB_WRITE((unsigned char)this_char);
+ if (this_char >= 0x80 && this_char <= 0xbf) {
+ utf |= (this_char & 0x3f);
+ this_char = (unsigned short)utf;
+ } else {
+ *status = FAILURE;
+ }
+
+ } else if (this_char == 0xf4) {
+ utf = (this_char & 0x7) << 18;
+ CHECK_LEN(pos, 3);
+
+ this_char = str[pos++];
+ MB_WRITE((unsigned char)this_char);
+ if (this_char >= 0x80 && this_char <= 0x8f) {
+ utf |= ((this_char & 0x3f) << 12);
+ this_char = (unsigned short)utf;
+ } else {
+ *status = FAILURE;
+ }
+
+ this_char = str[pos++];
+ MB_WRITE((unsigned char)this_char);
+ if (this_char >= 0x80 && this_char <= 0xbf) {
+ utf |= ((this_char & 0x3f) << 6);
+ this_char = (unsigned short)utf;
+ } else {
+ *status = FAILURE;
}
- if (more) {
- this_char = str[pos++];
- MB_WRITE((unsigned char)this_char);
+ this_char = str[pos++];
+ MB_WRITE((unsigned char)this_char);
+ if (this_char >= 0x80 && this_char <= 0xbf) {
+ utf |= (this_char & 0x3f);
+ this_char = (unsigned short)utf;
+ } else {
+ *status = FAILURE;
}
- } while (more);
+
+ } else if (this_char > 0x7f) {
+ *status = FAILURE;
+ }
}
break;
case cs_big5:
@@ -569,6 +679,7 @@
(this_char >= 0xe0 && this_char <= 0xef)
) {
/* peek at the next char */
+ CHECK_LEN(pos, 1);
unsigned char next_char = str[pos];
if ((next_char >= 0x40 && next_char <= 0x7e) ||
(next_char >= 0x80 && next_char <= 0xfc))
@@ -578,8 +689,13 @@
MB_WRITE(next_char);
this_char |= next_char;
pos++;
+ } else {
+ *status = FAILURE;
}
-
+ } else if (this_char > 0x7f &&
+ !(this_char >= 0xa1 && this_char <= 0xdf))
+ {
+ *status = FAILURE;
}
break;
}
@@ -588,6 +704,7 @@
/* check if this is the first of a multi-byte sequence */
if (this_char >= 0xa1 && this_char <= 0xfe) {
/* peek at the next char */
+ CHECK_LEN(pos, 1);
unsigned char next_char = str[pos];
if (next_char >= 0xa1 && next_char <= 0xfe) {
/* yes, this a jis kanji char */
@@ -595,10 +712,12 @@
MB_WRITE(next_char);
this_char |= next_char;
pos++;
+ } else {
+ *status = FAILURE;
}
-
} else if (this_char == 0x8e) {
/* peek at the next char */
+ CHECK_LEN(pos, 1);
unsigned char next_char = str[pos];
if (next_char >= 0xa1 && next_char <= 0xdf) {
/* JIS X 0201 kana */
@@ -606,10 +725,12 @@
MB_WRITE(next_char);
this_char |= next_char;
pos++;
+ } else {
+ *status = FAILURE;
}
-
} else if (this_char == 0x8f) {
/* peek at the next two char */
+ CHECK_LEN(pos, 2);
unsigned char next_char = str[pos];
unsigned char next2_char = str[pos+1];
if ((next_char >= 0xa1 && next_char <= 0xfe) &&
@@ -623,8 +744,11 @@
MB_WRITE(next2_char);
this_char |= next2_char;
pos++;
+ } else {
+ *status = FAILURE;
}
-
+ } else if (this_char > 0x7f) {
+ *status = FAILURE;
}
break;
}
@@ -855,6 +979,8 @@
+#define STR_EMPTY_ALLOC() estrndup("", sizeof("")-1) /* duplicates the empty literal with length sizeof("")-1 == 0; presumably yields a caller-owned empty string - confirm against estrndup */
+
/* {{{ php_escape_html_entities
*/
PHPAPI char *php_escape_html_entities(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset TSRMLS_DC)
@@ -874,8 +1000,18 @@
while (i < oldlen) {
unsigned char mbsequence[16]; /* allow up to 15 characters in a multibyte sequence */
int mbseqlen = sizeof(mbsequence);
- unsigned short this_char = get_next_char(charset, old, &i, mbsequence, &mbseqlen);
+ int status = SUCCESS;
+ unsigned short this_char = get_next_char(charset, old, oldlen, &i, mbsequence, &mbseqlen, &status);
+ if(status == FAILURE) {
+ /* invalid MB sequence */
+ efree(replaced);
+ if(!PG(display_errors)) {
+ php_error_docref(NULL TSRMLS_CC, E_WARNING, "Invalid multibyte sequence in argument");
+ }
+ *newlen = 0;
+ return STR_EMPTY_ALLOC();
+ }
matches_map = 0;
if (len + 16 > maxlen)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment