Created
April 21, 2012 08:22
-
-
Save iwamot/2435590 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- ext/standard/html.c.org 2007-12-31 16:22:52.000000000 +0900 | |
+++ ext/standard/html.c 2012-03-07 23:33:19.000000000 +0900 | |
@@ -439,19 +439,29 @@ | |
} \ | |
mbseq[mbpos++] = (mbchar); } | |
+#define CHECK_LEN(pos, chars_need) /* fail early when fewer than chars_need bytes remain at pos; expects str_len, newpos, status in scope */ \ | |
+ if((str_len - (pos)) < (chars_need)) { \ | |
+ *newpos = (pos); *status = FAILURE; /* caller only detects errors via *status */ \ | |
+ return 0; \ | |
+ } | |
+ | |
/* {{{ get_next_char | |
*/ | |
inline static unsigned short get_next_char(enum entity_charset charset, | |
unsigned char * str, | |
+ int str_len, | |
int * newpos, | |
unsigned char * mbseq, | |
- int * mbseqlen) | |
+ int * mbseqlen, | |
+ int *status) | |
{ | |
int pos = *newpos; | |
int mbpos = 0; | |
int mbspace = *mbseqlen; | |
unsigned short this_char = str[pos++]; | |
+ *status = SUCCESS; | |
+ | |
if (mbspace <= 0) { | |
*mbseqlen = 0; | |
return this_char; | |
@@ -463,83 +473,183 @@ | |
case cs_utf_8: | |
{ | |
unsigned long utf = 0; | |
- int stat = 0; | |
- int more = 1; | |
- /* unpack utf-8 encoding into a wide char. | |
- * Code stolen from the mbstring extension */ | |
+ if (this_char >= 0xc2 && this_char <= 0xdf) { | |
+ utf = (this_char & 0x1f) << 6; | |
+ CHECK_LEN(pos, 1); | |
+ | |
+ this_char = str[pos++]; | |
+ MB_WRITE((unsigned char)this_char); | |
+ if (this_char >= 0x80 && this_char <= 0xbf) { | |
+ utf |= (this_char & 0x3f); | |
+ this_char = (unsigned short)utf; | |
+ } else { | |
+ *status = FAILURE; | |
+ } | |
- do { | |
- if (this_char < 0x80) { | |
- more = 0; | |
- break; | |
- } else if (this_char < 0xc0) { | |
- switch (stat) { | |
- case 0x10: /* 2, 2nd */ | |
- case 0x21: /* 3, 3rd */ | |
- case 0x32: /* 4, 4th */ | |
- case 0x43: /* 5, 5th */ | |
- case 0x54: /* 6, 6th */ | |
- /* last byte in sequence */ | |
- more = 0; | |
- utf |= (this_char & 0x3f); | |
- this_char = (unsigned short)utf; | |
- break; | |
- case 0x20: /* 3, 2nd */ | |
- case 0x31: /* 4, 3rd */ | |
- case 0x42: /* 5, 4th */ | |
- case 0x53: /* 6, 5th */ | |
- /* penultimate char */ | |
- utf |= ((this_char & 0x3f) << 6); | |
- stat++; | |
- break; | |
- case 0x30: /* 4, 2nd */ | |
- case 0x41: /* 5, 3rd */ | |
- case 0x52: /* 6, 4th */ | |
- utf |= ((this_char & 0x3f) << 12); | |
- stat++; | |
- break; | |
- case 0x40: /* 5, 2nd */ | |
- case 0x51: | |
- utf |= ((this_char & 0x3f) << 18); | |
- stat++; | |
- break; | |
- case 0x50: /* 6, 2nd */ | |
- utf |= ((this_char & 0x3f) << 24); | |
- stat++; | |
- break; | |
- default: | |
- /* invalid */ | |
- more = 0; | |
- } | |
- } | |
- /* lead byte */ | |
- else if (this_char < 0xe0) { | |
- stat = 0x10; /* 2 byte */ | |
- utf = (this_char & 0x1f) << 6; | |
- } else if (this_char < 0xf0) { | |
- stat = 0x20; /* 3 byte */ | |
- utf = (this_char & 0xf) << 12; | |
- } else if (this_char < 0xf8) { | |
- stat = 0x30; /* 4 byte */ | |
- utf = (this_char & 0x7) << 18; | |
- } else if (this_char < 0xfc) { | |
- stat = 0x40; /* 5 byte */ | |
- utf = (this_char & 0x3) << 24; | |
- } else if (this_char < 0xfe) { | |
- stat = 0x50; /* 6 byte */ | |
- utf = (this_char & 0x1) << 30; | |
+ } else if (this_char == 0xe0) { | |
+ utf = (this_char & 0xf) << 12; | |
+ CHECK_LEN(pos, 2); | |
+ | |
+ this_char = str[pos++]; | |
+ MB_WRITE((unsigned char)this_char); | |
+ if (this_char >= 0xa0 && this_char <= 0xbf) { | |
+ utf |= ((this_char & 0x3f) << 6); | |
+ this_char = (unsigned short)utf; | |
} else { | |
- /* invalid; bail */ | |
- more = 0; | |
- break; | |
+ *status = FAILURE; | |
+ } | |
+ | |
+ this_char = str[pos++]; | |
+ MB_WRITE((unsigned char)this_char); | |
+ if (this_char >= 0x80 && this_char <= 0xbf) { | |
+ utf |= (this_char & 0x3f); | |
+ this_char = (unsigned short)utf; | |
+ } else { | |
+ *status = FAILURE; | |
+ } | |
+ | |
+ } else if ((this_char >= 0xe1 && this_char <= 0xec) || | |
+ (this_char >= 0xee && this_char <= 0xef)) { | |
+ utf = (this_char & 0xf) << 12; | |
+ CHECK_LEN(pos, 2); | |
+ | |
+ this_char = str[pos++]; | |
+ MB_WRITE((unsigned char)this_char); | |
+ if (this_char >= 0x80 && this_char <= 0xbf) { | |
+ utf |= ((this_char & 0x3f) << 6); | |
+ this_char = (unsigned short)utf; | |
+ } else { | |
+ *status = FAILURE; | |
+ } | |
+ | |
+ this_char = str[pos++]; | |
+ MB_WRITE((unsigned char)this_char); | |
+ if (this_char >= 0x80 && this_char <= 0xbf) { | |
+ utf |= (this_char & 0x3f); | |
+ this_char = (unsigned short)utf; | |
+ } else { | |
+ *status = FAILURE; | |
+ } | |
+ | |
+ } else if (this_char == 0xed) { /* 3-byte sequence with lead byte 0xED */ | |
+ utf = (this_char & 0xf) << 12; | |
+ CHECK_LEN(pos, 2); | |
+ | |
+ this_char = str[pos++]; | |
+ MB_WRITE((unsigned char)this_char); | |
+ if (this_char >= 0x80 && this_char <= 0x9f) { /* after 0xED only 0x80-0x9F is valid; 0xA0-0xBF would encode surrogates U+D800-U+DFFF */ | |
+ utf |= ((this_char & 0x3f) << 6); | |
+ this_char = (unsigned short)utf; | |
+ } else { | |
+ *status = FAILURE; | |
+ } | |
+ | |
+ this_char = str[pos++]; | |
+ MB_WRITE((unsigned char)this_char); | |
+ if (this_char >= 0x80 && this_char <= 0xbf) { /* ordinary continuation byte */ | |
+ utf |= (this_char & 0x3f); | |
+ this_char = (unsigned short)utf; | |
+ } else { | |
+ *status = FAILURE; | |
+ } | |
+ | |
+ } else if (this_char == 0xf0) { | |
+ utf = (this_char & 0x7) << 18; | |
+ CHECK_LEN(pos, 3); | |
+ | |
+ this_char = str[pos++]; | |
+ MB_WRITE((unsigned char)this_char); | |
+ if (this_char >= 0x90 && this_char <= 0xbf) { | |
+ utf |= ((this_char & 0x3f) << 12); | |
+ this_char = (unsigned short)utf; | |
+ } else { | |
+ *status = FAILURE; | |
+ } | |
+ | |
+ this_char = str[pos++]; | |
+ MB_WRITE((unsigned char)this_char); | |
+ if (this_char >= 0x80 && this_char <= 0xbf) { | |
+ utf |= ((this_char & 0x3f) << 6); | |
+ this_char = (unsigned short)utf; | |
+ } else { | |
+ *status = FAILURE; | |
+ } | |
+ | |
+ this_char = str[pos++]; | |
+ MB_WRITE((unsigned char)this_char); | |
+ if (this_char >= 0x80 && this_char <= 0xbf) { | |
+ utf |= (this_char & 0x3f); | |
+ this_char = (unsigned short)utf; | |
+ } else { | |
+ *status = FAILURE; | |
+ } | |
+ | |
+ } else if (this_char >= 0xf1 && this_char <= 0xf3) { | |
+ utf = (this_char & 0x7) << 18; | |
+ CHECK_LEN(pos, 3); | |
+ | |
+ this_char = str[pos++]; | |
+ MB_WRITE((unsigned char)this_char); | |
+ if (this_char >= 0x80 && this_char <= 0xbf) { | |
+ utf |= ((this_char & 0x3f) << 12); | |
+ this_char = (unsigned short)utf; | |
+ } else { | |
+ *status = FAILURE; | |
+ } | |
+ | |
+ this_char = str[pos++]; | |
+ MB_WRITE((unsigned char)this_char); | |
+ if (this_char >= 0x80 && this_char <= 0xbf) { | |
+ utf |= ((this_char & 0x3f) << 6); | |
+ this_char = (unsigned short)utf; | |
+ } else { | |
+ *status = FAILURE; | |
+ } | |
+ | |
+ this_char = str[pos++]; | |
+ MB_WRITE((unsigned char)this_char); | |
+ if (this_char >= 0x80 && this_char <= 0xbf) { | |
+ utf |= (this_char & 0x3f); | |
+ this_char = (unsigned short)utf; | |
+ } else { | |
+ *status = FAILURE; | |
+ } | |
+ | |
+ } else if (this_char == 0xf4) { | |
+ utf = (this_char & 0x7) << 18; | |
+ CHECK_LEN(pos, 3); | |
+ | |
+ this_char = str[pos++]; | |
+ MB_WRITE((unsigned char)this_char); | |
+ if (this_char >= 0x80 && this_char <= 0x8f) { | |
+ utf |= ((this_char & 0x3f) << 12); | |
+ this_char = (unsigned short)utf; | |
+ } else { | |
+ *status = FAILURE; | |
+ } | |
+ | |
+ this_char = str[pos++]; | |
+ MB_WRITE((unsigned char)this_char); | |
+ if (this_char >= 0x80 && this_char <= 0xbf) { | |
+ utf |= ((this_char & 0x3f) << 6); | |
+ this_char = (unsigned short)utf; | |
+ } else { | |
+ *status = FAILURE; | |
} | |
- if (more) { | |
- this_char = str[pos++]; | |
- MB_WRITE((unsigned char)this_char); | |
+ this_char = str[pos++]; | |
+ MB_WRITE((unsigned char)this_char); | |
+ if (this_char >= 0x80 && this_char <= 0xbf) { | |
+ utf |= (this_char & 0x3f); | |
+ this_char = (unsigned short)utf; | |
+ } else { | |
+ *status = FAILURE; | |
} | |
- } while (more); | |
+ | |
+ } else if (this_char > 0x7f) { | |
+ *status = FAILURE; | |
+ } | |
} | |
break; | |
case cs_big5: | |
@@ -569,6 +679,7 @@ | |
(this_char >= 0xe0 && this_char <= 0xef) | |
) { | |
/* peek at the next char */ | |
+ CHECK_LEN(pos, 1); | |
unsigned char next_char = str[pos]; | |
if ((next_char >= 0x40 && next_char <= 0x7e) || | |
(next_char >= 0x80 && next_char <= 0xfc)) | |
@@ -578,8 +689,13 @@ | |
MB_WRITE(next_char); | |
this_char |= next_char; | |
pos++; | |
+ } else { | |
+ *status = FAILURE; | |
} | |
- | |
+ } else if (this_char > 0x7f && | |
+ !(this_char >= 0xa1 && this_char <= 0xdf)) | |
+ { | |
+ *status = FAILURE; | |
} | |
break; | |
} | |
@@ -588,6 +704,7 @@ | |
/* check if this is the first of a multi-byte sequence */ | |
if (this_char >= 0xa1 && this_char <= 0xfe) { | |
/* peek at the next char */ | |
+ CHECK_LEN(pos, 1); | |
unsigned char next_char = str[pos]; | |
if (next_char >= 0xa1 && next_char <= 0xfe) { | |
/* yes, this a jis kanji char */ | |
@@ -595,10 +712,12 @@ | |
MB_WRITE(next_char); | |
this_char |= next_char; | |
pos++; | |
+ } else { | |
+ *status = FAILURE; | |
} | |
- | |
} else if (this_char == 0x8e) { | |
/* peek at the next char */ | |
+ CHECK_LEN(pos, 1); | |
unsigned char next_char = str[pos]; | |
if (next_char >= 0xa1 && next_char <= 0xdf) { | |
/* JIS X 0201 kana */ | |
@@ -606,10 +725,12 @@ | |
MB_WRITE(next_char); | |
this_char |= next_char; | |
pos++; | |
+ } else { | |
+ *status = FAILURE; | |
} | |
- | |
} else if (this_char == 0x8f) { | |
/* peek at the next two char */ | |
+ CHECK_LEN(pos, 2); | |
unsigned char next_char = str[pos]; | |
unsigned char next2_char = str[pos+1]; | |
if ((next_char >= 0xa1 && next_char <= 0xfe) && | |
@@ -623,8 +744,11 @@ | |
MB_WRITE(next2_char); | |
this_char |= next2_char; | |
pos++; | |
+ } else { | |
+ *status = FAILURE; | |
} | |
- | |
+ } else if (this_char > 0x7f) { | |
+ *status = FAILURE; | |
} | |
break; | |
} | |
@@ -855,6 +979,8 @@ | |
+#define STR_EMPTY_ALLOC() estrndup("", sizeof("")-1) /* estrndup of "" with length sizeof("")-1, i.e. 0 */ | |
+ | |
/* {{{ php_escape_html_entities | |
*/ | |
PHPAPI char *php_escape_html_entities(unsigned char *old, int oldlen, int *newlen, int all, int quote_style, char *hint_charset TSRMLS_DC) | |
@@ -874,8 +1000,18 @@ | |
while (i < oldlen) { | |
unsigned char mbsequence[16]; /* allow up to 15 characters in a multibyte sequence */ | |
int mbseqlen = sizeof(mbsequence); | |
- unsigned short this_char = get_next_char(charset, old, &i, mbsequence, &mbseqlen); | |
+ int status = SUCCESS; | |
+ unsigned short this_char = get_next_char(charset, old, oldlen, &i, mbsequence, &mbseqlen, &status); | |
+ if(status == FAILURE) { | |
+ /* invalid MB sequence */ | |
+ efree(replaced); | |
+ if(!PG(display_errors)) { | |
+ php_error_docref(NULL TSRMLS_CC, E_WARNING, "Invalid multibyte sequence in argument"); | |
+ } | |
+ *newlen = 0; | |
+ return STR_EMPTY_ALLOC(); | |
+ } | |
matches_map = 0; | |
if (len + 16 > maxlen) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment