Created
July 19, 2011 22:32
-
-
Save nmmmnu/1093910 to your computer and use it in GitHub Desktop.
Indexing library for PHP/MySQL
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
CREATE TABLE `data_index` ( | |
`id` int(11) NOT NULL, | |
`num` smallint(6) NOT NULL, | |
`word` varchar(33) DEFAULT NULL, | |
PRIMARY KEY (`id`,`num`), | |
KEY `word` (`word`,`id`), | |
KEY `id` (`id`,`word`) | |
); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<? | |
require_once "lib_ix.php"; | |
mysql_connect('localhost', 'root', 'pass'); | |
mysql_select_db('database'); | |
mysqlix_index("data", "id", "`desc`", "data_index"); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<? | |
function mysqlix_create($ixtable){ | |
return " | |
create table `$ixtable`( | |
id int not null , | |
num smallint not null , | |
word varchar(33) , | |
primary key(id, num) , | |
index(word, id) , | |
index(id, word) | |
) | |
"; | |
} | |
function mysqlix_value_format($v){ | |
$v = preg_replace('/[^0-9A-Za-z]/', ' ', $v); | |
$v = strtolower($v); | |
$vals = explode(" ", $v); | |
$vv = array(); | |
foreach($vals as $v){ | |
$v = trim($v); | |
if ($v == "") | |
continue; | |
$vv[] = mysql_real_escape_string(md5($v)); | |
} | |
return $vv; | |
} | |
function mysqlix_hot_index($k, $v, $ixtable){ | |
$vals = mysqlix_value_format($v); | |
//print_r($vals); | |
mysql_query( "DELETE FROM `$ixtable` WHERE id = $k" ); | |
$no = 0; | |
foreach($vals as $v){ | |
mysql_query( "REPLACE INTO `$ixtable` VALUES( $k, $no, '$v' )" ); | |
$no++; | |
} | |
} | |
function mysqlix_index($table, $key, $value, $ixtable=""){ | |
if ($ixtable == "") | |
$ixtable = $table . "_index"; | |
if ($value == "") | |
return; | |
$sql = "SELECT `$key` as k, $value as v FROM `$table` "; | |
mysqlix_index_sql($sql, $ixtable); | |
} | |
function mysqlix_index_sql($sql, $ixtable){ | |
$result = mysql_query( $sql ); | |
while ($row = mysql_fetch_array($result)) { | |
echo sprintf("%8s | %-100s \n", $row[0], $row[1]); | |
mysqlix_hot_index($row[0], $row[1], $ixtable); | |
} | |
} | |
function mysqlix_search($value, $ixtable, $limit=100){ | |
$v = mysqlix_value_format($value); | |
if (count($v) == 0) | |
return array(); | |
$DISTINCT = count($v) > 1 ? "DISTINCT" : ""; | |
$sql = " | |
SELECT $DISTINCT | |
t0.id as id | |
FROM | |
"; | |
for( $i = 0; $i < count($v); $i++){ | |
$sql = $sql . " | |
`$ixtable` t$i "; | |
if ( $i < count($v) -1 ) | |
$sql = $sql . ","; | |
$sql = $sql . "\n"; | |
} | |
$sql = $sql . " | |
WHERE | |
"; | |
for( $i = 0; $i < count($v); $i++){ | |
$val = $v[$i]; | |
$sql = $sql . " | |
t$i.word = '$val' AND | |
"; | |
} | |
if (count($v) > 1){ | |
for( $i = 0 + 1; $i < count($v); $i++){ | |
$sql = $sql . " | |
t0.id = t$i.id AND | |
"; | |
} | |
} | |
$sql = $sql . " | |
1 = 1 | |
LIMIT | |
$limit | |
"; | |
echo $sql; | |
$result = mysql_query($sql); | |
$x = array(); | |
while ($row = mysql_fetch_array($result)) | |
$x[] = $row['id']; | |
return array_unique($x); | |
} | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<? | |
require_once "lib_ix.php"; | |
mysql_connect('localhost', 'root', 'pass'); | |
mysql_select_db('database'); | |
$x = mysqlix_search($argv[1], "data_index"); | |
print_r($x); |
Possible fix could be dropping "num" column and making only one primary on "word" + "id", but performance is still poor.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
However, performance is poor if there are lots of data.
I tried with ~7M (index is ~28M) and if you search by 2-3 words, result comes in 4-5 sec.