Created
May 5, 2012 02:47
-
-
Save yapcheahshen/2599244 to your computer and use it in GitHub Desktop.
calculate bigram in a text file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 The input text file must not contain surrogate pairs (characters outside the
 Basic Multilingual Plane) because of bugs in node.js.
*/
// Node's file-system module; used below to read the input file and create the output file.
var fs = require('fs'); | |
// NOTE(review): total_size is never read or updated anywhere in the visible code — confirm it can be removed.
var total_size=0; | |
/**
 * Synchronously read a text file and return its lines as an array of strings.
 *
 * Lines are separated on either "\r\n" or "\n", so files with mixed line
 * endings are split correctly. A trailing newline yields a final empty string,
 * exactly as String.prototype.split does.
 *
 * @param {string} filename  Path of the file to read.
 * @param {string} [encoding='utf8']  Text encoding passed to fs.readFileSync.
 * @returns {string[]} The file content, one element per line (no line terminators).
 * @throws {Error} Whatever fs.readFileSync throws (e.g. the file does not exist).
 */
var filetoarraySync = function (filename, encoding) { //array
  encoding = encoding || 'utf8';
  var data = fs.readFileSync(filename, encoding);
  // One pattern handles both conventions; checking for "\r\n" first and then
  // splitting on only one separator leaves stray "\n" inside elements when a
  // file mixes the two.
  return data.split(/\r?\n/);
};
/**
 * Command-line entry point: build a table of frequent character bigrams from
 * `<base>_acc.xml` and write it to `<base>_bigram.js` as an AMD module.
 *
 * `<base>` is the first user-supplied command-line argument.
 *
 * Counting rules (applied character by character over every line):
 *  - everything from "<" up to and including the next ">" is skipped; skipping
 *    a tag does not break adjacency between the characters around it;
 *  - the character "的" is skipped without breaking adjacency;
 *  - any other character outside U+4E00..U+9FFF is not counted and breaks
 *    adjacency;
 *  - adjacency carries over from one line to the next.
 *
 * For each leading character, at most `maxchild` followers are kept, ordered
 * by descending pair count, and only those seen at least `minfreq` times.
 *
 * Output format (byte-for-byte):
 *   define([],function() {return {payload:{"<head>":"<followers>",\n ... },header:{version:20120419} }});
 *
 * @returns {undefined}
 * @throws {Error} If no base name was given, or if reading/writing a file fails.
 */
var main=function() {
  // process.argv is [executable, script, ...userArgs]. The executable entry is
  // not reliably the literal string "node", so index 2 is the only position
  // that is the first user argument in every case. process.argv is left intact.
  var fn=process.argv[2];
  if (!fn) {
    throw new Error("usage: node <script> <basename>   (reads <basename>_acc.xml, writes <basename>_bigram.js)");
  }

  console.log("loading "+fn);
  var arr=filetoarraySync(fn+'_acc.xml');
  console.log("loaded");

  var intag=false;   // true while between "<" and ">"
  var bigram={};     // bigram[first][second] = number of times "first second" was seen
  var prev="";       // previous counted character, "" when adjacency is broken
  var code=0;
  var tokencount=0,uniquetokencount=0;

  for (var i=0;i<arr.length;i++) {
    var s=arr[i];
    // Progress report every 1024 lines.
    if (i % 1024===0) console.log( tokencount,uniquetokencount);
    for (var j=0;j<s.length;j++) {
      var c=s.charAt(j);
      if (c==="<") intag=true;
      if (c===">") { intag=false; continue; }
      if (intag) continue;
      if (c==="的") continue;
      code=s.charCodeAt(j);
      if (code<0x4e00 || code>0x9fff) {
        prev="";
        continue;
      }
      if (!bigram[c]) {
        bigram[c]={};
        uniquetokencount++;
      }
      tokencount++;
      if (prev) {
        // bigram[prev] always exists: prev was registered when it was counted.
        bigram[prev][c]=(bigram[prev][c] || 0)+1;
      }
      prev=c;
    }
  }

  var maxchild=15;   // keep at most this many followers per leading character
  var minfreq=2;     // drop pairs seen fewer times than this
  var trimmed={};

  // Summary figures, logged for information only.
  var stat={};
  stat.headcount=0;  // number of distinct leading characters
  stat.paircount=0;  // number of distinct pairs
  stat.hitcount=0;   // total number of pair occurrences
  for (var head in bigram) {
    stat.headcount++;
    for (var follower in bigram[head]) {
      stat.paircount++;
      stat.hitcount+=bigram[head][follower];
    }
  }
  // Guard the divisions so an input without any counted character logs 0, not NaN.
  stat.averagepair=stat.headcount ? stat.paircount/stat.headcount : 0;
  stat.averagehit=stat.paircount ? stat.hitcount/stat.paircount : 0;
  console.log(stat);

  // TODO: divide each pair count by ln(character frequency) to filter out
  // characters that combine with almost anything. A candidate frequency is the
  // number of distinct leading characters a follower appears after, smoothed as
  // Math.log(count+2).

  for (var first in bigram) {
    var sortable=[];
    for (var second in bigram[first]) sortable.push([second, bigram[first][second]]);
    // Most frequent follower first.
    sortable.sort(function(a,b) { return b[1]-a[1]; });

    var child="";
    for (var k=0;k<maxchild && k<sortable.length;k++) {
      // Sorted descending, so everything after the first rare pair is rare too.
      if (sortable[k][1]<minfreq) break;
      child+=sortable[k][0];
    }
    if (child) trimmed[first]=child;
  }

  // Assemble the whole module text and write it in one synchronous call, so a
  // write failure throws here and the file is complete before "end" is logged.
  var parts=["define([],function() {return {payload:{"];
  for (var key in trimmed) {
    parts.push('"'+key+'":"'+trimmed[key]+'",\n');
  }
  parts.push("},header:{version:20120419} }});");
  fs.writeFileSync(fn+'_bigram.js', parts.join(""));

  console.log("end");
  return;
};
// Run the bigram extraction immediately when this script is executed.
main(); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
if (code<0x4e00 || code>0x9fff)
In addition to CJK Unified Ideographs [U+4E00 → U+9FFF], you may also want to include CJK Unified Ideographs Extension A [U+3400 → U+4DBF]. That said, this may not apply to CBETA, which seems to always "使用半型中括號括住組字式(組字法說明如下),用以代表缺字" and does not include any characters from the Extension blocks.
if (c=="的") continue;
//TODO 出現次數 除以 ln 字頻 ,去除構詞能力太強的字
In addition, you may also want to exclude certain 虛字或語氣 (function words and modal particles), e.g. 之、乎、也、焉、乃、而、矣。