Created
November 12, 2009 07:58
-
-
Save mgng/232708 to your computer and use it in GitHub Desktop.
簡易形態素解析とマルコフ連鎖もどきで文字圧縮
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * DocCompRep: "compresses" the text currently selected in the page.
 *
 * The selection is split into rough tokens (a simplified morphological
 * analysis for Japanese text), a Markov-chain-like walk over those tokens
 * skips ahead through the text, and the shortened result replaces the
 * selection, wrapped in a bold <span>.
 */
var DocCompRep = {
  /**
   * True only in engines that honour JScript conditional compilation
   * (the comment then contributes a leading `!`, giving `!false`);
   * everywhere else the comment is ignored and the value is `false`.
   */
  isMSIE : /*@cc_on!@*/false,

  /**
   * Replaces the current selection with HTML through the legacy
   * `document.selection` API.
   * @param {string} str HTML markup to paste over the selection.
   */
  doIE : function(str){
    var range = document.selection.createRange();
    // Best effort: pasteHTML can throw for some selections; in that case
    // the page is simply left unchanged.
    try{range.pasteHTML(str);}catch(e){}
  },

  /**
   * Replaces the current selection with HTML through the W3C
   * Selection/Range API. Does nothing when there is no selection range.
   * @param {string} str HTML markup to insert in place of the selection.
   */
  doFF : function(str){
    var sel = window.getSelection();
    // getRangeAt(0) throws when the selection holds no range.
    if (!sel || sel.rangeCount === 0) {
      return;
    }

    // Parse the markup in a detached element, then move the resulting
    // nodes into a fragment so they can be inserted in one operation.
    var holder = document.createElement('div');
    holder.innerHTML = str;
    var frg = document.createDocumentFragment();
    while (holder.firstChild){
      frg.appendChild(holder.firstChild);
    }

    var range = sel.getRangeAt(0);
    range.deleteContents();
    var cont = range.startContainer;
    var offset = range.startOffset;
    switch (cont.nodeType) {
      case 1: // Element node: offset is a child index
        // `|| null` appends when the offset is past the last child.
        cont.insertBefore(frg, cont.childNodes[offset] || null);
        break;
      case 3: // Text node: offset is a character index
        var tail = cont.splitText(offset);
        tail.parentNode.insertBefore(frg, tail);
        break;
    }
  },

  /**
   * Token pattern used by morpheme(). It is assembled from regex literals
   * (not from a quoted string) so that backslash escapes reach the regex
   * engine intact. Alternatives are tried in the listed order.
   *
   * NOTE(review): the full-width alphanumeric class, the full-width
   * `!`/`?` and the full-width parentheses are written as \uFFxx escapes.
   * They replace ASCII duplicates that appear to be full-width characters
   * folded to ASCII - confirm against the original script.
   */
  reg1 : new RegExp([
    /[一-龠々〆ヵヶ]+/.source,                                   // kanji run
    /[ぁ-ん]+/.source,                                           // hiragana run
    /[ァ-ヴー]+/.source,                                         // katakana run
    /[\uFF65-\uFF9F]+/.source,                                   // half-width katakana run
    /[a-zA-Z0-9]+/.source,                                       // ASCII alphanumerics
    /[\uFF21-\uFF3A\uFF41-\uFF5A\uFF10-\uFF19]+/.source,         // full-width alphanumerics
    /[,.\-_~^=@'、。!\uFF01?\uFF1F・]+/.source,                   // punctuation run
    /\uFF08.*?\uFF09/.source,                                    // full-width ( ... )
    /「.*?」/.source,
    /『.*?』/.source,
    /【.*?】/.source,
    /〈.*?〉/.source,
    /\(.*?\)/.source,
    /\[.*?\]/.source,
    /\{.*?\}/.source,
    /<.*?>/.source,
    /[\s\u3000]+/.source                                         // whitespace (dropped later by trim)
  ].join('|'), 'g'),

  /**
   * Particles / endings after which morpheme() forces a token boundary.
   * Longer alternatives are listed before their shorter prefixes.
   */
  reg2 : /(でなければ|そういえば|そういや|どうにも|こうにも|について|そりゃ|ぐらい|くらい|ながら|ならば|までを|までの|なのか|として|です|ます|つつ|だに|まで|とは|とて|なら|から|して|だけ|より|にて|ほど|など|って|では)/g,

  /**
   * Strips leading and trailing whitespace (U+3000 listed explicitly).
   * @param {string} str
   * @returns {string}
   */
  trim : function(str){
    return str.replace(/^[\s\u3000]+|[\s\u3000]+$/g, '');
  },

  /**
   * Escapes the characters that are significant in HTML markup so that
   * plain text can be embedded in an HTML string safely.
   * @param {string} str Plain text.
   * @returns {string} Text with `&`, `<`, `>` and `"` replaced by entities.
   */
  escapeHTML : function(str){
    return String(str)
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;');
  },

  /**
   * Splits text into rough tokens.
   *
   * A `|` marker is inserted after every reg2 match and the text is cut
   * at those markers; each piece is then scanned with reg1. Tokens that
   * are empty after trimming are discarded, and characters matched by no
   * reg1 alternative (including a literal `|`) are dropped.
   *
   * @param {string} str Text to tokenise.
   * @returns {string[]} Tokens in document order.
   */
  morpheme : function(str){
    var pieces = str.replace(this.reg2, "$1|").split("|");
    var tokens = [];
    for (var i = 0, l = pieces.length; i < l; i++) {
      var found = pieces[i].match(this.reg1);
      if (!found) {
        continue;
      }
      for (var n = 0, fl = found.length; n < fl; n++) {
        var t = this.trim(found[n]);
        if (t !== '') {
          tokens.push(t);
        }
      }
    }
    return tokens;
  },

  /**
   * Shortens text with a deterministic Markov-chain-like walk.
   *
   * The first `c_len` tokens are emitted as they are. After that, each
   * step looks - from the start of the chunk emitted last - for every
   * later occurrence of the most recently emitted token that has a
   * successor, jumps to the position right after one of them (the last
   * one when there are at most three, otherwise the middle one) and emits
   * up to `c_len` tokens from there. The walk stops when no occurrence is
   * found or the end of the token list is reached.
   *
   * @param {string} s Source text.
   * @param {number} [c_len=2] Number of tokens emitted per step.
   * @returns {string} The emitted tokens joined without a separator; the
   *   whole token list joined when it has `c_len` tokens or fewer; `''`
   *   for blank input.
   */
  markov : function(s, c_len){
    s = this.trim(s);
    c_len = c_len || 2;
    if (s === '') {
      return '';
    }

    var tokens = this.morpheme(s);
    if (tokens.length <= c_len) {
      return tokens.join('');
    }

    // `false` is an end-of-list sentinel; real tokens are non-empty strings.
    tokens.push(false);
    var total = tokens.length;

    var out = [];
    var last = '';
    var cursor = 0;
    var i, j;

    for (i = 0; i < c_len; i++) {
      out.push(tokens[i]);
      last = tokens[i];
    }

    // `total` only bounds the number of steps; the walk normally ends
    // through one of the returns inside the loop.
    for (i = 0; i < total; i++) {
      var nexts = [];
      for (j = cursor; j < total; j++) {
        if (tokens[j] === last && tokens[j + 1]) {
          nexts.push(j + 1);
        }
      }

      var count = nexts.length;
      if (count === 0) {
        return out.join('');
      }

      cursor = (count <= 3) ? nexts[count - 1] : nexts[Math.ceil(count / 2) - 1];
      for (j = cursor; j < cursor + c_len; j++) {
        if (tokens[j] === false) {
          return out.join('');
        }
        out.push(tokens[j]);
        last = tokens[j];
      }
    }
    return out.join('');
  },

  /**
   * Adds a fixed-position button to the page that runs the compression
   * on click. Requires `document.body` to exist.
   */
  init : function(){
    var d = document;
    var b = d.createElement('input');
    b.type = 'button';
    b.value = '圧縮';
    b.style.zIndex = 9999;
    b.style.position = 'fixed';
    b.style.top = '2em';
    b.style.left = '50%';
    b.style.display = 'block';
    b.onclick = function(){DocCompRep.run();};
    d.body.appendChild(b);
  },

  /**
   * Reads the selected text, compresses it with markov() and replaces the
   * selection with the result in bold. Does nothing when the compressed
   * text is empty.
   */
  run : function(){
    var s = (window.getSelection ? window.getSelection() : document.selection.createRange().text) + '';
    s = this.markov(s);
    if (s === '') {
      return;
    }
    // The selection is arbitrary page text: escape it before it is
    // embedded in markup, otherwise tag-like text would be parsed as HTML.
    var html = '<span style="font-weight:bold">' + this.escapeHTML(s) + '</span>';
    if (this.isMSIE) {
      this.doIE(html);
    } else {
      this.doFF(html);
    }
  }
};
// Install the trigger button as soon as the script is evaluated.
DocCompRep.init();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment