Last active
August 29, 2015 14:15
-
-
Save Leko/6c98685bdb048b949392 to your computer and use it in GitHub Desktop.
FuelPHPでMecabを使用して分かち書きするオブザーバ
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * ORM model for a book.
 *
 * Besides the raw `content`, the model keeps `content_splited`: a copy of the
 * content that the Wakati observer tokenises with MeCab before every insert
 * and update, so that it can be searched with MATCH ... AGAINST.
 * NOTE(review): this presumably requires a FULLTEXT index on `content_splited`
 * - confirm against the schema.
 */
class Model_Book extends \Orm\Model
{
    /**
     * Minimum word length (in characters). Shorter words are dropped both when
     * the tokenised content is stored and when a search expression is built.
     */
    const DROP_WORD_LENGTH = 2;

    protected static $_properties = array(
        'id',
        'title',
        'content',
        'content_splited',
        'created_at',
        'updated_at',
    );

    protected static $_observers = array(
        'Orm\Observer_CreatedAt' => array(
            'events' => array('before_insert'),
            'mysql_timestamp' => true,
        ),
        'Orm\Observer_UpdatedAt' => array(
            'events' => array('before_insert', 'before_update'),
            'mysql_timestamp' => true,
        ),
        // Copies `content` into `content_splited` as space-separated words.
        // The key must be mapped with `=>`; a plain `=` here is a parse error.
        'Model\Observer\Wakati' => array(
            'events' => array('before_insert', 'before_update'),
            'wakati_from' => 'content',
            'wakati_to' => 'content_splited',
            'drop_word_length' => self::DROP_WORD_LENGTH,
        ),
    );

    /**
     * Finds books whose tokenised content matches the words of this book's title.
     *
     * The title is tokenised by DB::fulltext() and every remaining word is
     * required to match (boolean mode).
     *
     * @return array Result of the ORM query's get() call
     */
    public function similarBooks()
    {
        $against = \DB::fulltext($this->title, self::DROP_WORD_LENGTH);

        return self::query()
            ->where(\DB::expr('MATCH(content_splited)'), 'AGAINST', $against)
            ->get();
    }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
// Extends the Fuel core.
// See "Extending the Core - Overview" in the FuelPHP documentation:
// http://fuelphp.jp/docs/1.7/general/extending_core.html
class DB extends \Fuel\Core\DB
{
    /**
     * Builds the raw SQL expression used as the AGAINST part of a full-text search.
     *
     * The text is tokenised with mecab_split(); every kept word is prefixed
     * with `+` so that it is required in boolean mode, and the joined words
     * are escaped before being embedded in the expression.
     *
     * @param string    $search_text Text to match (tokenised internally)
     * @param int|float $drop_length Minimum number of characters a word must have
     *                               to be kept; the default INF disables filtering
     * @return Database_Expression Expression of the form `('+w1 +w2' IN BOOLEAN MODE)`
     */
    public static function fulltext($search_text, $drop_length = INF)
    {
        // INF means "no filtering". Comparing a length against INF would be
        // true for every word and silently discard the whole search text.
        $filter_enabled = ($drop_length !== INF);

        // Build a fresh list instead of mutating the token array by reference,
        // which would leave a dangling reference behind after the loop.
        $required_terms = array();
        foreach (mecab_split($search_text) as $word) {
            if ($filter_enabled && mb_strlen($word) < $drop_length) {
                continue;
            }
            $required_terms[] = '+' . $word;
        }

        $search_token = \DB::escape(implode(' ', $required_terms));

        return \DB::expr("({$search_token} IN BOOLEAN MODE)");
    }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
namespace Model\Observer; | |
/**
 * ORM observer that stores a MeCab-tokenised ("wakati-gaki") copy of one model
 * property in another property before the model is inserted or updated.
 *
 * Options (set in the model's $_observers entry):
 *  - wakati_from      (required) name of the property to read the raw text from
 *  - wakati_to        (required) name of the property that receives the result
 *  - wakati_glue      separator placed between words (default: a single space)
 *  - drop_word_length words shorter than this many characters are discarded
 *                     (default: INF, meaning no filtering)
 */
class Wakati extends \Orm\Observer
{
    private $wakati_from;
    private $wakati_to;
    private $wakati_glue = ' ';
    private $drop_word_length = INF;

    /**
     * Set the properties for this observer instance, based on the parent model's
     * configuration or the defined defaults.
     *
     * @param string $class Model class this observer is called on
     * @throws \Exception If the 'wakati_from' or 'wakati_to' option is missing
     */
    public function __construct($class)
    {
        $options = $class::observers(get_class($this));
        $this->validate_options($options);

        // Only copy the options this observer declares. The configuration
        // array also carries keys such as 'events', which must not become
        // undeclared dynamic properties on the instance.
        $supported = array('wakati_from', 'wakati_to', 'wakati_glue', 'drop_word_length');
        foreach ($supported as $key) {
            if (array_key_exists($key, $options)) {
                $this->{$key} = $options[$key];
            }
        }
    }

    /**
     * Tokenises the source property before the model is inserted.
     *
     * The parameter is typed as \Orm\Model because models using this observer
     * (e.g. ones extending \Orm\Model directly) are not necessarily
     * \Model_Abstract instances.
     *
     * @param \Orm\Model $model Model instance about to be inserted
     * @return void
     */
    public function before_insert(\Orm\Model $model)
    {
        $this->wakati($model);
    }

    /**
     * Tokenises the source property before the model is updated.
     *
     * @param \Orm\Model $model Model instance about to be updated
     * @return void
     */
    public function before_update(\Orm\Model $model)
    {
        $this->wakati($model);
    }

    /**
     * Reads the source property, splits it into words, drops short words and
     * writes the words joined by the glue into the destination property.
     *
     * @param \Orm\Model $model Model instance to modify
     * @return void
     * @throws \Exception If the 'mecab' extension is not loaded
     */
    private function wakati(\Orm\Model $model)
    {
        $this->validate_enabled_mecab();

        $raw_text = $model->{$this->wakati_from};
        $words = $this->filter_words(mecab_split($raw_text));

        $model->{$this->wakati_to} = implode($this->wakati_glue, $words);
    }

    /**
     * Drops short words to save storage and speed up searching.
     *
     * @param string[] $wakati_words Tokenised words
     * @return string[] Words that are at least drop_word_length characters long
     *                  (original keys preserved); all words when the length is INF
     */
    private function filter_words(array $wakati_words)
    {
        $min_length = $this->drop_word_length;

        // INF is the "filtering disabled" marker, not a real length.
        if ($min_length === INF) {
            return $wakati_words;
        }

        return array_filter($wakati_words, function ($word) use ($min_length) {
            return (mb_strlen($word) >= $min_length);
        });
    }

    /**
     * Validates the options passed to the observer.
     *
     * @param array $options Options configured for this observer
     * @return void
     * @throws \Exception If empty 'wakati_from' or 'wakati_to' option
     */
    private function validate_options(array $options)
    {
        if (!isset($options['wakati_from']) || !isset($options['wakati_to'])) {
            throw new \Exception("option 'wakati_from' and 'wakati_to' is required");
        }
    }

    /**
     * Verifies that the MeCab extension is available.
     *
     * This observer relies on mecab_split(), so the check runs up front and
     * fails with an exception rather than an undefined-function error.
     *
     * @return void
     * @throws \Exception If 'mecab' extension not loaded
     */
    private function validate_enabled_mecab()
    {
        if (!extension_loaded('mecab')) {
            throw new \Exception("'mecab' extension is required");
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment