Last active
August 29, 2015 14:09
-
-
Save masayuki5160/545fd6ee8e7b7c970e09 to your computer and use it in GitHub Desktop.
転置インデックスの作成テスト①
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!DOCTYPE HTML>
<!-- Input page: collects one line of text and posts it to indexer.php,
     which reads it from the "doc" POST field. -->
<html lang="ja">
<head>
<meta charset="UTF-8">
<title>転置インデックス作成</title>
</head>
<body>
<form name="input" action="indexer.php" method="post">
    <!-- The label is tied to the input so assistive technology announces the prompt. -->
    <label for="doc">転置インデックスを作成するテキストを入力してください:</label>
    <input type="text" id="doc" name="doc"><br>
    <input type="submit">
</form>
</body>
</html>
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!DOCTYPE HTML>
<html lang="ja">
<head>
<meta charset="UTF-8">
<title>転置インデックス作成</title>
</head>
<body>
<?php
/**
 * Inverted-index builder.
 *
 * Reads the posted "doc" text, stores it in `documents`, splits it into
 * morphemes with MeCab, and records the new document ID in the posting list
 * of every distinct morpheme in `tokens`:
 *   - unknown token  -> insert a row with DOCS_COUNT = 1 and POSTINGS = <document ID>
 *   - known token    -> increment DOCS_COUNT and append ",<document ID>" to POSTINGS
 * A token that occurs several times in the same document is recorded once.
 *
 * All SQL values are bound through placeholders; nothing from the request is
 * concatenated into a statement.
 */

/** DB connection settings (PDO). */
$dsn = 'mysql:dbname=INVERTED_INDEX;host=localhost;charset=utf8';
$user = 'root';
$password = '';
$options = array(
    PDO::MYSQL_ATTR_READ_DEFAULT_FILE => '/etc/my.cnf',
    // Surface SQL failures as exceptions instead of silently continuing
    // with a bogus document ID or a half-written index.
    PDO::ATTR_ERRMODE => PDO::ERRMODE_EXCEPTION,
);
$dbh = new PDO($dsn, $user, $password, $options);

$mecab = new MeCab_Tagger();

$textTitle = "";

// The page may be opened without a POST body (or with doc[]=...), so guard the read.
// NOTE(review): the text is HTML-escaped *before* it is stored and tokenized, as in
// the original. Other pages may rely on BODY being pre-escaped, so this is kept;
// escaping at output time instead would give MeCab the raw text — confirm with readers.
$textInput = (isset($_POST['doc']) && is_string($_POST['doc']))
    ? rtrim(htmlspecialchars($_POST['doc']))
    : '';

// Compare against '' rather than empty(): empty("0") is true and would reject "0".
if ($textInput !== '') {
    try {
        // Document row and token rows are written atomically (effective on
        // transactional storage engines such as InnoDB).
        $dbh->beginTransaction();

        // Register the document.
        $registStmt = $dbh->prepare('insert into documents (TITLE, BODY) values (?, ?)');
        $registStmt->execute(array($textTitle, $textInput));

        // ID of the document just registered.
        $documentID = $dbh->lastInsertId();

        // Prepared once, executed per token.
        // ORDER BY keeps the chosen row deterministic if legacy duplicate TOKEN rows exist.
        // NOTE(review): lookup-then-insert is not safe under concurrent requests;
        // a UNIQUE index on TOKEN plus an upsert would close that gap.
        $findTokenStmt = $dbh->prepare(
            'SELECT ID, POSTINGS FROM tokens WHERE TOKEN = ? ORDER BY ID LIMIT 1'
        );
        $insertTokenStmt = $dbh->prepare(
            'insert into tokens (TOKEN, DOCS_COUNT, POSTINGS) values (?, ?, ?)'
        );
        $appendPostingStmt = $dbh->prepare(
            'update tokens SET DOCS_COUNT = DOCS_COUNT + 1, POSTINGS = ? WHERE ID = ?'
        );

        // Morphological analysis with MeCab starts here.
        echo "[形態素に分解した結果]<br/>";

        // Tokens already recorded for this document (token => true).
        $newInvertedIndex = array();

        // MeCab emits one "surface<TAB>features" line per morpheme, then "EOS".
        $resultSet = explode("\n", $mecab->parse($textInput));
        foreach ($resultSet as $eachResult) {
            // Only a line that is exactly "EOS" ends the sentence; a morpheme whose
            // surface merely starts with "EOS" carries a tab and feature string.
            if (rtrim($eachResult) === 'EOS') {
                break;
            }

            // Skip blank or malformed lines that carry no tab-separated feature part.
            $fields = explode("\t", $eachResult, 2);
            if (count($fields) < 2 || $fields[0] === '') {
                continue;
            }
            $eachMorpheme = $fields[0];

            echo $eachMorpheme . "<br/>";

            // Already recorded for this document: nothing more to do.
            if (isset($newInvertedIndex[$eachMorpheme])) {
                continue;
            }
            $newInvertedIndex[$eachMorpheme] = true;

            $findTokenStmt->execute(array($eachMorpheme));
            $existingToken = $findTokenStmt->fetch(PDO::FETCH_ASSOC);
            $findTokenStmt->closeCursor();

            if ($existingToken === false) {
                // New token: start its posting list with this document.
                $insertTokenStmt->execute(array($eachMorpheme, 1, $documentID));
            } else {
                // Known token: append this document to its posting list.
                $newPostings = $existingToken['POSTINGS'] . ',' . $documentID;
                $appendPostingStmt->execute(array($newPostings, $existingToken['ID']));
            }
        }

        $dbh->commit();
    } catch (PDOException $e) {
        // Undo the partial write and report the abort without leaking SQL details.
        if ($dbh->inTransaction()) {
            $dbh->rollBack();
        }
        error_log('indexer.php: ' . $e->getMessage());
        echo "処理を中断します。<br/>";
    }
} else {
    echo "処理を中断します。<br/>";
}
?>
<a href="index.html">TOP</a>
</body>
</html>
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
MySQL セットアップ
# Install the MySQL client, the PHP MySQL driver and the MySQL server.
sudo yum install mysql php-mysql mysql-server
# Start the MySQL daemon.
sudo /etc/init.d/mysqld start
-- Schema for the inverted-index experiment.
-- IF NOT EXISTS makes the script safe to run more than once.
CREATE DATABASE IF NOT EXISTS INVERTED_INDEX DEFAULT CHARACTER SET utf8;

-- Select the database; without this the CREATE TABLE statements below fail
-- with "No database selected" in a fresh session.
USE INVERTED_INDEX;

-- One row per registered document.
CREATE TABLE IF NOT EXISTS documents(
    ID INT NOT NULL PRIMARY KEY AUTO_INCREMENT,
    TITLE TEXT NOT NULL,
    BODY TEXT NOT NULL
);

-- One row per token (index term).
-- DOCS_COUNT: number of documents containing the token.
-- POSTINGS:   comma-separated list of documents.ID values (posting list).
-- NOTE(review): a junction table (token_id, document_id) would be the
-- normalized form of POSTINGS; kept as-is because the indexer writes this layout.
CREATE TABLE IF NOT EXISTS tokens(
    ID INT NOT NULL PRIMARY KEY AUTO_INCREMENT,
    TOKEN VARCHAR(255) NOT NULL,
    DOCS_COUNT INT NOT NULL,
    POSTINGS TEXT NOT NULL,
    INDEX token_index(TOKEN)
);
PDOでの処理 | |
参考:PHPでPDOを使ってMySQLに接続、INSERT、UPDATE、DELETE、COUNT、SUM | |
http://qiita.com/tabo_purify/items/2575a58c54e43cd59630 | |
複数データの挿入 | |
http://detail.chiebukuro.yahoo.co.jp/qa/question_detail/q1080059565 | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
mysql> select * from tokens; | |
+----+--------+------------+----------+ | |
| ID | TOKEN | DOCS_COUNT | POSTINGS | | |
+----+--------+------------+----------+ | |
| 27 | 田中 | 1 | 1 | | |
| 28 | 君 | 1 | 1 | | |
| 29 | の | 1 | 1 | | |
| 30 | 友達 | 1 | 1 | | |
| 31 | 田中 | 1 | 2 | | |
| 32 | 君 | 1 | 2 | | |
| 33 | は | 1 | 2 | | |
| 34 | 調子 | 1 | 2 | | |
| 35 | いい | 1 | 2 | | |
| 36 | の | 1 | 2 | | |
| 37 | かも | 1 | 2 | | |
| 38 | しれ | 1 | 2 | | |
| 39 | ない | 1 | 2 | | |
+----+--------+------------+----------+ | |
13 rows in set (0.00 sec) | |
mysql> select * from documents; | |
+----+-------+-----------------------------------------------+ | |
| ID | TITLE | BODY | | |
+----+-------+-----------------------------------------------+ | |
| 1 | | 田中君の友達の田中君 | | |
| 2 | | 田中君は調子いいのかもしれない | | |
+----+-------+-----------------------------------------------+ | |
2 rows in set (0.00 sec) |
既存の転置インデックスの存在チェックを実装。
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
下記について未実装
・転置インデックス作成時に既存の転置インデックス内にtermが存在するかのチェックとそれをふまえた処理
・RDBでは転置インデックスの管理が煩雑なので、Key-Value型のDBへ移行したい。
・もちろんクローラーなどない。