Created
October 23, 2016 11:48
-
-
Save ebuildy/18fb856ac3aa0b7f781c85a7492756e5 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * A simple PHP script to remove duplicate documents from Elasticsearch (>= 2.0).
 *
 * Usage:
 *   php remove_dup.php HOST INDEX(/TYPE) QUERY_STRING FIELD [REFRESH_TIMEOUT]
 *
 *   HOST             Elasticsearch host name (requests are sent through queryES()).
 *   INDEX(/TYPE)     Index, optionally followed by a type, to search and delete in.
 *   QUERY_STRING     query_string query selecting the documents to inspect.
 *   FIELD            Field whose repeated values identify duplicates.
 *   REFRESH_TIMEOUT  Extra seconds to sleep between passes (optional, defaults to 0).
 *
 * The script runs a terms aggregation (min_doc_count = 2) with a top_hits
 * sub-aggregation to find duplicated values, keeps the first document of each
 * bucket when sorted by "date" ascending, and deletes the others with a
 * delete-by-query request.
 *
 * <!> This can take very long: after each pass we have to wait for the index to
 * refresh before searching again. <!>
 *
 * Exit codes: 0 = no duplicates left, 1 = Elasticsearch error, 2 = bad usage.
 */
if (!isset($argv) || count($argv) < 5)
{
    fwrite(STDERR, sprintf(
        'Usage: php %s HOST INDEX(/TYPE) QUERY_STRING FIELD [REFRESH_TIMEOUT]%s',
        isset($argv[0]) ? $argv[0] : 'remove_dup.php',
        PHP_EOL
    ));
    exit(2);
}

$argHost      = $argv[1];
$argIndex     = $argv[2];
$argQuery     = $argv[3];
$argField     = $argv[4];
// Command line arguments are strings; make the pause an explicit non-negative integer.
$argPauseTime = isset($argv[5]) ? max(0, (int) $argv[5]) : 0;

// The query clause itself. It is placed under the top-level "query" key of the
// search body below, so it must not be wrapped in another "query" key here.
$esQueryQuery = [
    'query_string' => [
        'query' => $argQuery
    ]
];

// One bucket per value of FIELD that occurs at least twice; for each bucket,
// fetch the single document to keep (first one when sorted by "date" ascending).
$esFindDuplicatesAgg = [
    'dup' => [
        'terms' => [
            'field'         => $argField,
            'min_doc_count' => 2
        ],
        'aggs' => [
            'doc' => [
                'top_hits' => [
                    'size'   => 1,
                    'fields' => [],
                    'sort'   => [
                        'date' => 'asc'
                    ]
                ]
            ]
        ]
    ]
];

// The search body never changes between passes, so build it once.
$esQuery = [
    'size'  => 0,
    'query' => $esQueryQuery,
    'aggs'  => $esFindDuplicatesAgg
];

while (true)
{
    $response = queryES('POST', '_search', $esQuery);

    // A missing aggregation means the request failed or Elasticsearch returned
    // an error; that is not the same thing as "no duplicates left".
    if (!isset($response->aggregations->dup->buckets))
    {
        fwrite(STDERR, sprintf(
            'Unexpected response from Elasticsearch, aborting: %s%s',
            json_encode($response),
            PHP_EOL
        ));
        exit(1);
    }

    $buckets = $response->aggregations->dup->buckets;

    // An empty bucket list is still "set", so it has to be tested separately:
    // this is the normal way out of the loop.
    if (count($buckets) === 0)
    {
        echo sprintf('We are done!%s', PHP_EOL);
        exit(0);
    }

    $otherDocCount = isset($response->aggregations->dup->sum_other_doc_count)
        ? $response->aggregations->dup->sum_other_doc_count
        : 0;

    echo sprintf('- Fetch %d buckets - Other %d buckets %s', count($buckets), $otherDocCount, PHP_EOL);

    foreach ($buckets as $bucket)
    {
        if (!isset($bucket->doc->hits->hits[0]->_id))
        {
            // Without the id of the document to keep we cannot build a safe delete query.
            fwrite(STDERR, sprintf('%s Skipping %s:%s (no document to keep) %s', "\t", $argField, $bucket->key, PHP_EOL));
            continue;
        }

        $keepId = $bucket->doc->hits->hits[0]->_id;

        echo sprintf('%s Removing %d documents for %s:%s %s', "\t", $bucket->doc_count - 1, $argField, $bucket->key, PHP_EOL);

        // Quote both values so that whitespace or query_string operators inside
        // them cannot change the meaning of the delete query.
        // NOTE(review): the delete is not restricted to QUERY_STRING, so documents
        // outside that scope sharing the same FIELD value are removed too - confirm
        // this is intended.
        $deleteQuery = sprintf(
            '%s:"%s" AND NOT _id:"%s"',
            $argField,
            addcslashes((string) $bucket->key, '"\\'),
            addcslashes((string) $keepId, '"\\')
        );

        $r = queryES('DELETE', '_query?' . http_build_query([
            'q' => $deleteQuery
        ]));

        // If the delete did not go through, the next search would return the same
        // buckets again and the script would loop forever.
        if ($r === null || isset($r->error))
        {
            fwrite(STDERR, sprintf(
                'Delete request failed for %s:%s, aborting: %s%s',
                $argField,
                $bucket->key,
                json_encode($r),
                PHP_EOL
            ));
            exit(1);
        }
    }

    // Give the index time to refresh so the next search sees the deletions.
    sleep(1 + $argPauseTime);
}
/**
 * Send an HTTP request to Elasticsearch and return the decoded JSON response.
 *
 * The target is built from the globals $argHost and $argIndex as
 * http://HOST:9200/INDEX/SERVICE (the port is fixed to 9200).
 *
 * @param string            $method  HTTP method, e.g. 'POST' or 'DELETE'.
 * @param string            $service Path (and optional query string) below the index,
 *                                   with or without a leading slash.
 * @param array|string|null $data    Request body; non-strings are JSON encoded.
 *                                   Empty values send no body.
 *
 * @return mixed Decoded JSON (objects as stdClass), or null when the request
 *               failed or the response was not valid JSON.
 */
function queryES($method, $service, $data = null)
{
    global $argHost, $argIndex;

    // Strip the slashes around each part so callers may pass '/_search' or
    // '_search' without producing a '//' in the URL.
    $url = sprintf(
        'http://%s:9200/%s/%s',
        $argHost,
        trim($argIndex, '/'),
        ltrim($service, '/')
    );

    $ch = curl_init($url);

    if ($ch === false) {
        error_log(sprintf('Unable to initialise cURL for %s', $url));

        return null;
    }

    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);

    if (!empty($data)) {
        curl_setopt($ch, CURLOPT_POSTFIELDS, is_string($data) ? $data : json_encode($data));
        // Without this header cURL labels the body as form data.
        curl_setopt($ch, CURLOPT_HTTPHEADER, ['Content-Type: application/json']);
    }

    // Set after the body so the requested verb wins over cURL's implicit POST.
    curl_setopt($ch, CURLOPT_CUSTOMREQUEST, $method);

    $rawResponse = curl_exec($ch);

    if ($rawResponse === false) {
        // Report transport failures instead of silently decoding `false`.
        error_log(sprintf('%s %s failed: %s', $method, $url, curl_error($ch)));
        curl_close($ch);

        return null;
    }

    curl_close($ch);

    return json_decode($rawResponse);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment