Created
May 27, 2012 15:10
-
-
Save wondger/2814635 to your computer and use it in GitHub Desktop.
packet
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * @name:SplitURI.js
 * @description:Split a URL string into chunks
 *     The smallest allowed chunk size is 12 characters, because UTF-8 may
 *     contain characters encoded in 4 bytes (12 characters once percent-encoded).
 *     Byte value ranges in UTF-8:
 *         single-byte character:                        0-127
 *         trailing (continuation) byte of a character:  128-191
 *         lead byte of a two-byte character:            192-223
 *         lead byte of a three-byte character:          224-239
 *         lead byte of a four-byte character:           240-247
 *
 * @author:[email protected]
 * @date:2012-05-26
 * @param:
 *     source[String]:source String
 *     limitLength[Number]:maximum chunk length (must be at least 12)
 *     encode[Boolean]:whether to apply encodeURIComponent to the source first
 * @see:http://tools.ietf.org/html/rfc3629#section-3
 *     Char. number range  | UTF-8 octet sequence
 *     (hexadecimal)       | (binary)
 *     --------------------+---------------------------------------------
 *     0000 0000-0000 007F | 0xxxxxxx
 *     0000 0080-0000 07FF | 110xxxxx 10xxxxxx
 *     0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
 *     0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * @todo:
 * @changelog:
 */
/**
 * Helpers for cutting a percent-encoded string into chunks of at most
 * `limitLength` characters without cutting through a `%XX` octet or through
 * the octets of one multi-byte UTF-8 character.
 */
var SplitURI = {
    /**
     * Convert a hexadecimal string to a number.
     *
     * @param {String} s16 hex digits, optionally prefixed with "0x"/"0X"
     * @returns {Number} the parsed value (NaN when there are no leading hex digits)
     */
    c16to10: function (s16) {
        var digits = s16.replace(/^(0x)/i, '');
        return parseInt(digits, 16);
    },

    /**
     * Check octet integrity: detach a truncated `%` or `%X` from the end of
     * the string.
     *
     * @param {String} urlString chunk to inspect
     * @returns {{urlString: String, brokenOctet: String}} the chunk without
     *     the truncated octet, and the truncated octet itself ('' when the
     *     chunk does not end with one)
     */
    checkOctet: function (urlString) {
        var match = /%[0-9a-fA-F]?$/.exec(urlString);
        var broken = match ? match[0] : '';
        return {
            "urlString": broken ? urlString.slice(0, match.index) : urlString,
            "brokenOctet": broken
        };
    },

    /**
     * Check character integrity: detach the octets of a multi-byte character
     * from the end of the string.
     *
     * Trailing `%XX` octets are removed one by one while they are
     * continuation bytes (128-191); the first octet >= 192 (a lead byte) is
     * removed as well and ends the scan. An octet below 128 or a tail that is
     * not a `%XX` octet ends the scan without being removed.
     *
     * @param {String} urlString chunk to inspect
     * @param {String} [brokenCharacter] octets already detached; newly
     *     detached octets are prepended to it
     * @returns {{urlString: String, brokenCharacter: String}} the remaining
     *     chunk and the detached octets in their original order
     */
    checkCharacter: function (urlString, brokenCharacter) {
        var trailingOctet = /%([0-9a-fA-F]{2})$/;
        var kept = urlString;
        var detached = brokenCharacter || '';
        var match;
        var n;

        while ((match = trailingOctet.exec(kept))) {
            n = SplitURI.c16to10(match[1]);
            if (n < 128) {
                // Single-byte character: it is complete, keep it.
                break;
            }
            kept = kept.slice(0, match.index);
            detached = match[0] + detached;
            if (n > 191) {
                // Lead byte reached: the whole character has been detached.
                break;
            }
        }

        return {
            "urlString": kept,
            "brokenCharacter": detached
        };
    },

    /**
     * Split a string into chunks of at most `limitLength` characters, moving
     * a truncated octet or an incomplete multi-byte character at the end of a
     * chunk to the start of the next one.
     *
     * The final chunk is emitted verbatim, and a chunk from which nothing
     * could be kept (malformed input such as a long run of continuation
     * bytes) is emitted as the raw slice. Both rules guarantee that every
     * iteration consumes input, so the loop always terminates and the chunks
     * always concatenate back to the (encoded) input.
     *
     * @param {String} s source string
     * @param {Number} limitLength maximum chunk length; must be at least 12
     * @param {Boolean} encode whether to run encodeURIComponent on `s` first
     * @returns {String[]} the chunks in order; [] when `limitLength` is
     *     missing or below 12, or when the source is empty
     */
    split: function (s, limitLength, encode) {
        if (!limitLength || limitLength < 12) return [];

        var rest = encode ? encodeURIComponent(s) : s;
        var leadingOctet = /^%([0-9a-fA-F]{2})/;
        var ret = [];
        var raw;
        var tail;
        var piece;
        var carry;
        var o;
        var c;
        var match;
        var n;

        while (rest) {
            raw = rest.slice(0, limitLength);
            tail = rest.slice(limitLength);
            piece = raw;
            carry = '';

            // Only a chunk that is followed by more input has a boundary to
            // protect; the last chunk is pushed as is (a dangling "%" or "%X"
            // at the very end has no next chunk to be completed by).
            if (tail) {
                // Octet integrity: move a truncated "%"/"%X" to the next chunk.
                o = SplitURI.checkOctet(raw);
                piece = o["urlString"];
                carry = o["brokenOctet"];

                // If the next chunk starts with a continuation byte
                // (10xxxxxx), this chunk ends in the middle of a character.
                match = leadingOctet.exec(carry + tail);
                if (match) {
                    n = SplitURI.c16to10(match[1]);
                    if (n >= 128 && n <= 191) {
                        c = SplitURI.checkCharacter(piece);
                        piece = c["urlString"];
                        carry = c["brokenCharacter"] + carry;
                    }
                }

                // Malformed input: everything was carried over, which would
                // repeat forever. Emit the raw slice to force progress.
                if (!piece) {
                    piece = raw;
                    carry = '';
                }
            }

            ret.push(piece);
            rest = carry + tail;
        }

        return ret;
    }
};
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment