Skip to content

Instantly share code, notes, and snippets.

@g4vroche
Last active October 9, 2015 11:49
Show Gist options
  • Save g4vroche/df7ffbcdd8caa0a83f36 to your computer and use it in GitHub Desktop.
Save g4vroche/df7ffbcdd8caa0a83f36 to your computer and use it in GitHub Desktop.
<?php
// Base URL of the Solr endpoint used by every request in this script.
// Request paths ("update", "update/extract") are appended directly, so the
// value must end with a trailing slash.
define('SOLR_HOST', 'http://localhost:8983/solr/collection1/');
/**
 * Lists every entry of a directory, each prefixed with the directory path.
 *
 * @param string $path Directory to scan, without a trailing slash.
 * @return array List of "$path/$entry" strings in scandir() order.
 * @throws RuntimeException When the directory cannot be read.
 */
function getFiles($path)
{
    $entries = scandir($path);
    if ($entries === false) {
        throw new RuntimeException("Unable to read directory: $path");
    }

    // Remove the "." and ".." pseudo-entries by name rather than by position:
    // entries whose names start with bytes lower than "." (e.g. "-" or "!")
    // sort before them, so they are not guaranteed to be the first two results.
    $entries = array_values(array_diff($entries, array('.', '..')));

    $files = array();
    foreach ($entries as $entry) {
        $files[] = "$path/$entry";
    }
    return $files;
}
/**
 * Uploads a file to Solr's "update/extract" handler as a multipart form
 * upload and echoes Solr's response.
 *
 * The document id is the MD5 hash of the file path. The request does not
 * commit; the caller is expected to commit afterwards.
 *
 * @param string $file Path of the file to upload.
 * @return void
 */
function pushFileViaTika($file)
{
    $id = md5($file);
    echo "Pushing doc $file\n with id: $id\n";

    $url = SOLR_HOST."update/extract?literal.id=$id&commit=false&uprefix=attr_&wt=json";

    // Every shell argument goes through escapeshellarg() so that paths
    // containing quotes, spaces or shell metacharacters cannot break or
    // hijack the command line.
    // NOTE(review): escapeshellarg() may drop non-ASCII bytes depending on the
    // process locale; confirm LC_CTYPE if file names are not plain ASCII.
    $cmd = 'curl -s '.escapeshellarg($url).' -F '.escapeshellarg("myfile=@$file");

    echo shell_exec($cmd);
}
/**
 * Extracts the text of a file with the external "pdf2txt" command and posts
 * it to Solr's "update" handler as a JSON document, echoing Solr's response.
 *
 * The document id is the MD5 hash of the file path and the extracted text is
 * stored in the "attr_text" field. The request does not commit; the caller is
 * expected to commit afterwards.
 *
 * @param string $file Path of the file to index.
 * @return void
 */
function pushFileViaPdf2txt($file)
{
    $id = md5($file);
    echo "Pushing doc $file\n with id: $id\n";

    // escapeshellarg() keeps paths with quotes or metacharacters from
    // breaking (or injecting into) the command line.
    $text = shell_exec('pdf2txt '.escapeshellarg($file));

    // Build the body with json_encode() so the text is JSON-escaped instead
    // of URL-encoded (which would store "%20"-style sequences in the index).
    $payload = json_encode(array(
        array('id' => $id, 'attr_text' => (string) $text),
    ));
    if ($payload === false) {
        // json_encode() fails on e.g. invalid UTF-8 in the extracted text.
        echo "Skipping $file: extracted text could not be encoded as JSON\n";
        return;
    }

    // Hand the body to curl through a temporary file: a whole document passed
    // as a command-line argument can exceed the OS argument-length limit.
    $tmp = tempnam(sys_get_temp_dir(), 'solr');
    if ($tmp === false) {
        echo "Skipping $file: unable to create temporary payload file\n";
        return;
    }
    if (file_put_contents($tmp, $payload) === false) {
        unlink($tmp);
        echo "Skipping $file: unable to write temporary payload file\n";
        return;
    }

    // curl's -d/--data-binary defaults to a form-urlencoded Content-Type, so
    // the JSON type has to be declared explicitly.
    $cmd = 'curl -s '.escapeshellarg(SOLR_HOST.'update?commit=false&wt=json')
        ." -H 'Content-Type: application/json'"
        .' --data-binary '.escapeshellarg('@'.$tmp);

    echo shell_exec($cmd);
    unlink($tmp);
}
/**
 * Pushes every file to Solr using the chosen method, then issues one commit.
 *
 * @param array  $files  Paths of the files to index.
 * @param string $method Either 'tika' or 'pdf2txt'.
 * @return void
 * @throws InvalidArgumentException When $method is not a known method.
 */
function indexAllFiles($files, $method)
{
    // Maps each supported method name to the function that pushes one file.
    $pushers = array(
        'tika'    => 'pushFileViaTika',
        'pdf2txt' => 'pushFileViaPdf2txt',
    );

    // Resolve the method once, up front: an unknown method used to be
    // silently ignored for every file while still triggering a commit.
    if (!is_string($method) || !isset($pushers[$method])) {
        throw new InvalidArgumentException(
            'Unknown indexing method: '.var_export($method, true)
        );
    }
    $push = $pushers[$method];

    foreach ($files as $file) {
        $push($file);
    }

    commitSolr();
}
/**
 * Asks Solr to commit pending updates by posting "<commit/>" to the
 * "update" handler.
 *
 * Solr's response is discarded; a message is printed only when the curl
 * command itself exits with a non-zero status.
 *
 * @return void
 */
function commitSolr()
{
    // The whole URL is quoted as one argument (previously the closing quote
    // sat before "update", which only worked through shell concatenation),
    // and the XML body is sent with an explicit XML Content-Type instead of
    // curl's form-urlencoded default.
    $cmd = 'curl -s '.escapeshellarg(SOLR_HOST.'update')
        ." -H 'Content-Type: text/xml'"
        ." --data-binary '<commit/>'";

    $output = array();
    $status = 0;
    exec($cmd, $output, $status);

    if ($status !== 0) {
        echo "Solr commit failed (curl exit code $status)\n";
    }
}
///////////////////////////////
// Entry point: index every file of the given directory once per method and
// report how long each method took.
if (count($argv) < 2) {
    // Usage errors go to stderr with a non-zero status so calling scripts can
    // detect them (die() with a string exits with status 0).
    fwrite(STDERR, "Usage: ".$argv[0]." path/to/files\n");
    exit(1);
}

$path = rtrim($argv[1], "/");
if (!is_dir($path)) {
    fwrite(STDERR, "Not a directory: ".$argv[1]."\n");
    exit(1);
}

$files = getFiles($path);

// Time the two indexing runs back to back over the same file list.
$timer = array();
$timer['tika_start'] = microtime(true);
indexAllFiles($files, 'tika');
$timer['intermediate'] = microtime(true);
indexAllFiles($files, 'pdf2txt');
$timer['pdf2txt_stop'] = microtime(true);

$durations = array(
    'tika' => ($timer['intermediate'] - $timer['tika_start']),
    'pdf2txt' => ($timer['pdf2txt_stop'] - $timer['intermediate'])
);
// Ascending sort that keeps the keys: the fastest method comes first.
asort($durations);

echo "\n\033[1mMethod\t\tTime spent (sec)\033[0m\n";
foreach ($durations as $type => $value) {
    echo "$type\t\t".round($value, 4)."\n";
}

$names = array_keys($durations);
$values = array_values($durations);
if ($values[0] > 0) {
    echo "\n".$names[0] ." is ". round($values[1]/$values[0],2)." times faster than ".$names[1]."\n\n";
} else {
    // Avoid dividing by zero when the fastest run took no measurable time.
    echo "\nDurations too short to compute a speed ratio\n\n";
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment