Skip to content

Instantly share code, notes, and snippets.

@piroor
Created July 18, 2013 07:58
Show Gist options
  • Select an option

  • Save piroor/6027563 to your computer and use it in GitHub Desktop.

Select an option

Save piroor/6027563 to your computer and use it in GitHub Desktop.
GroongaStringUtils, for extraction of matched terms on the client side.
/**
* Usage:
* var matcher = GroongaStringUtils.getTermsMatcher('検索 OR けんさく');
* // => /(a pattern to match all search terms in the given query)/g
* document.body.innerHTML = document.body.innerHTML
* .replace(matcher, function(term) { return '<em>' + term + '</em>'; });
*
* License: The MIT License
* Copyright (c) 2013 ClearCode Inc.
*/
// Utilities to turn a Groonga query string into a RegExp that matches the
// searched terms, treating half-width (hankaku) and full-width (zenkaku)
// forms of alphanumerics and katakana as equivalent.
var GroongaStringUtils = {
  /**
   * Builds a global, case-insensitive RegExp matching every term of a query.
   *
   * @param {string} source - A query in Groonga's query syntax.
   * @returns {RegExp} A pattern with one capturing group around the
   *     alternatives. When the query contains no terms, a pattern that never
   *     matches is returned.
   */
  getTermsMatcher: function(source) {
    var terms = this.extractTerms(source);
    if (terms.length === 0) {
      // '()' would match the empty string at every position, so return a
      // pattern that cannot match anything instead.
      return new RegExp('(?!)', 'gi');
    }
    // Alternation picks the first alternative that matches, so put longer
    // terms first; otherwise "foo" would shadow "foobar".
    terms.sort(function(a, b) {
      return b.length - a.length;
    });
    var patterns = terms.map(this.expandSimilarCharacters, this);
    return new RegExp('(' + patterns.join('|') + ')', 'gi');
  },

  /**
   * Extracts the plain search terms from a query.
   * Syntax reference:
   * http://groonga.org/ja/docs/reference/grn_expr/query_syntax.html
   *
   * Handled here: backslash escapes, single/double quoted phrases, grouping
   * parentheses (ignored outside quotes), the standalone tokens "OR", "+"
   * and "-", and "column:" prefixes with an optional operator.
   *
   * @param {string} source - The query. null/undefined are treated as ''.
   * @returns {string[]} The non-empty terms in order of appearance.
   */
  extractTerms: function(source) {
    var terms = [];
    var escaped = false;
    var quote = null; // the quote character that opened the current phrase
    var lastTerm = '';
    var query = source == null ? '' : String(source);

    query.replace(/\s+/g, ' ').trim().split('').forEach(function(character) {
      if (escaped) {
        lastTerm += character;
        escaped = false;
        return;
      }
      switch (character) {
        case '\\':
          escaped = true;
          return;
        case '(':
        case ')':
          // Parentheses only group expressions; they are never part of a
          // term unless they appear inside a quoted phrase.
          if (quote) {
            lastTerm += character;
          }
          return;
        case '"':
        case "'":
          if (quote === character) {
            // closing quote: the phrase is complete
            terms.push(lastTerm);
            lastTerm = '';
            quote = null;
          } else if (quote) {
            // the other kind of quote inside a phrase is a literal
            lastTerm += character;
          } else {
            quote = character;
          }
          return;
        case ' ':
          if (quote) {
            lastTerm += character;
          } else {
            terms.push(lastTerm);
            lastTerm = '';
          }
          return;
        default:
          lastTerm += character;
          return;
      }
    });
    if (lastTerm) {
      terms.push(lastTerm);
    }

    return terms.filter(function(term) {
      return term && term !== 'OR' && term !== '+' && term !== '-';
    }).map(function(term) {
      // Strip "column:" and an optional operator. The two-character
      // operators must be tried before the single-character class,
      // otherwise "col:<=5" would leave "=5".
      return term.replace(/^[^:]+:(?:<=|>=|[@^$!<>])?/, '');
    }).filter(function(term) {
      return term;
    });
  },

  /**
   * Maps each half-width katakana sequence to its full-width equivalents.
   * The first element of every array is the canonical full-width form; the
   * voiced/semi-voiced entries also list "base + standalone mark".
   *
   * Half-width code points are derived from \u escapes rather than written
   * literally, so that Unicode compatibility normalization of this file
   * cannot fold them into their full-width counterparts.
   *
   * Key order matters: the two-character (voiced) keys are inserted last so
   * that the reversed key list used by the matchers tries them first.
   */
  hankakuZenkakuCharacters: (function() {
    var FIRST_HANKAKU = 0xFF66;
    // Full-width counterparts of U+FF66..U+FF9D, in code point order.
    var PLAIN_ZENKAKU =
      'ヲァィゥェォャュョッー' +
      'アイウエオカキクケコサシスセソタチツテト' +
      'ナニヌネノハヒフヘホマミムメモヤユヨラリルレロワン';
    var HANKAKU_DAKUTEN = '\uFF9E';
    var HANKAKU_HANDAKUTEN = '\uFF9F';
    var ZENKAKU_DAKUTEN = '\u309B';
    var ZENKAKU_HANDAKUTEN = '\u309C';

    var table = {};
    var toHankaku = function(zenkaku) {
      return String.fromCharCode(FIRST_HANKAKU + PLAIN_ZENKAKU.indexOf(zenkaku));
    };
    var addMarked = function(bases, composed, hankakuMark, zenkakuMark) {
      bases.split('').forEach(function(base, index) {
        table[toHankaku(base) + hankakuMark] =
          [composed.charAt(index), base + zenkakuMark];
      });
    };

    PLAIN_ZENKAKU.split('').forEach(function(zenkaku) {
      table[toHankaku(zenkaku)] = [zenkaku];
    });
    table[HANKAKU_DAKUTEN] = [ZENKAKU_DAKUTEN];
    table[HANKAKU_HANDAKUTEN] = [ZENKAKU_HANDAKUTEN];
    addMarked('ウカキクケコサシスセソタチツテトハヒフヘホ',
              'ヴガギグゲゴザジズゼゾダヂヅデドバビブベボ',
              HANKAKU_DAKUTEN, ZENKAKU_DAKUTEN);
    addMarked('ハヒフヘホ',
              'パピプペポ',
              HANKAKU_HANDAKUTEN, ZENKAKU_HANDAKUTEN);
    return table;
  })(),

  /**
   * Lazily built map from a half-width key to a regex fragment matching the
   * key itself and all of its full-width variants.
   */
  get hankakuZenkakuPatterns() {
    if (!this._hankakuZenkakuPatterns) {
      this._hankakuZenkakuPatterns = {};
      Object.keys(this.hankakuZenkakuCharacters).forEach(function(hankaku) {
        var zenkakus = this.hankakuZenkakuCharacters[hankaku];
        var hasSequence = hankaku.length > 1 ||
          zenkakus.some(function(zenkaku) { return zenkaku.length > 1; });
        // Multi-character variants need an alternation; single characters
        // fit in a character class.
        this._hankakuZenkakuPatterns[hankaku] = hasSequence ?
          '(?:' + hankaku + '|' + zenkakus.join('|') + ')' :
          '[' + hankaku + zenkakus.join('') + ']';
      }, this);
    }
    return this._hankakuZenkakuPatterns;
  },
  _hankakuZenkakuPatterns: null,

  /**
   * Lazily built reverse map: every full-width variant (not only the
   * canonical one) to its half-width form, so that anything matched by
   * zenkakuMatcher has a replacement.
   */
  get zenkakuHankakuCharacters() {
    if (!this._zenkakuHankakuCharacters) {
      this._zenkakuHankakuCharacters = {};
      Object.keys(this.hankakuZenkakuCharacters).forEach(function(hankaku) {
        this.hankakuZenkakuCharacters[hankaku].forEach(function(zenkaku) {
          this._zenkakuHankakuCharacters[zenkaku] = hankaku;
        }, this);
      }, this);
    }
    return this._zenkakuHankakuCharacters;
  },
  _zenkakuHankakuCharacters: null,

  /**
   * Lazily built global RegExp matching any half-width key. The key list is
   * reversed so that two-character keys are tried before their base
   * character.
   */
  get hankakuMatcher() {
    if (!this._hankakuMatcher) {
      var patterns = Object.keys(this.hankakuZenkakuCharacters).reverse();
      this._hankakuMatcher = new RegExp('(' + patterns.join('|') + ')', 'g');
    }
    return this._hankakuMatcher;
  },
  _hankakuMatcher: null,

  /**
   * Lazily built global RegExp matching any full-width variant, with
   * "base + mark" sequences tried before the bare base character.
   */
  get zenkakuMatcher() {
    if (!this._zenkakuMatcher) {
      var patterns = [];
      Object.keys(this.hankakuZenkakuCharacters).forEach(function(hankaku) {
        patterns = patterns.concat(this.hankakuZenkakuCharacters[hankaku]);
      }, this);
      this._zenkakuMatcher = new RegExp('(' + patterns.reverse().join('|') + ')', 'g');
    }
    return this._zenkakuMatcher;
  },
  _zenkakuMatcher: null,

  /**
   * Converts full-width alphanumerics and katakana to half-width.
   *
   * @param {string} string
   * @returns {string}
   */
  zenkaku2hankaku: function(string) {
    var me = this;
    // Full-width digits, upper-case and lower-case letters only; plain ASCII
    // must not be shifted.
    string = string.replace(/[\uFF10-\uFF19\uFF21-\uFF3A\uFF41-\uFF5A]/g,
                            this.zenkaku2hankakuAscii);
    string = string.replace(this.zenkakuMatcher, function(matched) {
      return me.zenkakuHankakuCharacters[matched];
    });
    return string;
  },

  /**
   * Shifts one full-width ASCII character down by 0xFEE0.
   *
   * @param {string} character - Only the first UTF-16 unit is used; the
   *     caller must make sure it is a full-width character.
   * @returns {string}
   */
  zenkaku2hankakuAscii: function(character) {
    return String.fromCharCode(character.charCodeAt(0) - 0xFEE0);
  },

  /**
   * Converts ASCII alphanumerics and half-width katakana to full-width.
   *
   * @param {string} string
   * @returns {string}
   */
  hankaku2zenkaku: function(string) {
    var me = this;
    string = string.replace(/[0-9A-Za-z]/g, this.hankaku2zenkakuAscii);
    string = string.replace(this.hankakuMatcher, function(matched) {
      return me.hankakuZenkakuCharacters[matched][0];
    });
    return string;
  },

  /**
   * Shifts one ASCII character up by 0xFEE0.
   *
   * @param {string} character - Only the first UTF-16 unit is used.
   * @returns {string}
   */
  hankaku2zenkakuAscii: function(character) {
    return String.fromCharCode(character.charCodeAt(0) + 0xFEE0);
  },

  /**
   * Converts a literal search term into regex source that matches the term
   * in both its half-width and full-width spellings.
   *
   * @param {string} term - Literal text (not a pattern).
   * @returns {string} Regex source.
   */
  expandSimilarCharacters: function(term) {
    var me = this;
    // step 1: normalize to hankaku
    term = this.zenkaku2hankaku(term);
    // step 2: the term is literal text, so neutralize regex metacharacters
    // before any pattern syntax of our own is inserted
    term = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    // step 3: expand to hankaku and zenkaku
    term = term.replace(/[A-Za-z0-9]/g, function(matched) {
      return '[' + matched + me.hankaku2zenkakuAscii(matched) + ']';
    });
    term = term.replace(this.hankakuMatcher, function(matched) {
      return me.hankakuZenkakuPatterns[matched];
    });
    return term;
  }
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment