Skip to content

Instantly share code, notes, and snippets.

@ebuildy
Created October 23, 2016 11:48
Show Gist options
  • Save ebuildy/18fb856ac3aa0b7f781c85a7492756e5 to your computer and use it in GitHub Desktop.
Save ebuildy/18fb856ac3aa0b7f781c85a7492756e5 to your computer and use it in GitHub Desktop.
<?php
/**
* A simple PHP script to remove duplicate documents from Elasticsearch (> 2.0)
* Usage:
* php remove_dup.php HOST INDEX(/TYPE) QUERY_STRING FIELD REFRESH_TIMEOUT
*
* This script uses a terms aggregation with a top_hits sub-aggregation to find duplicates, then removes them with a delete-by-query that excludes the id of the one document to keep.
*
* <!> Very slow, because we have to wait refresh_interval seconds before each new search <!>
*/
// ---------------------------------------------------------------------------
// Command-line arguments.
// HOST, INDEX(/TYPE), QUERY_STRING and FIELD are mandatory; REFRESH_TIMEOUT is
// optional and defaults to 0 seconds.
// ---------------------------------------------------------------------------
if (!isset($argv) || count($argv) < 5) {
    fwrite(STDERR, sprintf(
        'Usage: php %s HOST INDEX(/TYPE) QUERY_STRING FIELD REFRESH_TIMEOUT%s',
        isset($argv[0]) ? $argv[0] : 'remove_dup.php',
        PHP_EOL
    ));
    exit(1);
}

$argHost = $argv[1];  // read by queryES() through `global`
$argIndex = $argv[2]; // read by queryES() through `global`
$argQuery = $argv[3];
$argField = $argv[4];
// sleep() needs a non-negative integer; the raw argument is a string.
$argPauseTime = isset($argv[5]) ? max(0, (int) $argv[5]) : 0;

// Query clause selecting the documents that are checked for duplicates.
$esQueryQuery = [
    'query_string' => [
        'query' => $argQuery
    ]
];

// One bucket per value of FIELD that occurs in at least two documents. The
// top_hits sub-aggregation returns a single hit per bucket, the first one when
// sorted ascending on the (hard-coded) `date` field: that is the document kept.
$esFindDuplicatesAgg = [
    'dup' => [
        'terms' => [
            'field' => $argField,
            'min_doc_count' => 2
        ],
        'aggs' => [
            'doc' => [
                'top_hits' => [
                    'size' => 1,
                    'fields' => [],
                    'sort' => [
                        'date' => 'asc'
                    ]
                ]
            ]
        ]
    ]
];

// The search body never changes between passes, so build it once. The query
// clause sits directly under `query` (it used to be wrapped in a second,
// redundant `query` level).
$esQuery = [
    'size' => 0,
    'query' => $esQueryQuery,
    'aggs' => $esFindDuplicatesAgg
];

while (true) {
    $response = queryES('POST', '_search', $esQuery);

    // A reply without the aggregation means the request failed (host
    // unreachable, invalid JSON, error reply, ...). It does NOT mean that the
    // clean-up is finished.
    if (!isset($response->aggregations->dup->buckets)) {
        fwrite(STDERR, sprintf(
            'Unexpected search response, aborting: %s%s',
            json_encode($response),
            PHP_EOL
        ));
        exit(1);
    }

    $buckets = $response->aggregations->dup->buckets;

    // No bucket left with two or more documents: every duplicate is gone.
    if (count($buckets) === 0) {
        echo sprintf('We are done!%s', PHP_EOL);
        exit(0);
    }

    // sum_other_doc_count counts documents, not buckets.
    $otherDocCount = isset($response->aggregations->dup->sum_other_doc_count)
        ? $response->aggregations->dup->sum_other_doc_count
        : 0;

    echo sprintf(
        '- Fetch %d buckets - %d documents in other buckets %s',
        count($buckets),
        $otherDocCount,
        PHP_EOL
    );

    foreach ($buckets as $bucket) {
        if (!isset($bucket->doc->hits->hits[0]->_id)) {
            fwrite(STDERR, sprintf(
                '%s Skipping %s:%s, no document to keep was returned %s',
                "\t",
                $argField,
                $bucket->key,
                PHP_EOL
            ));
            continue;
        }

        echo sprintf(
            '%s Removing %d documents for %s:%s %s',
            "\t",
            $bucket->doc_count - 1,
            $argField,
            $bucket->key,
            PHP_EOL
        );

        // Delete every document carrying this value except the one kept.
        // Both values are escaped so that spaces or query operators inside
        // them cannot change the meaning of the delete query.
        $deleteQuery = sprintf(
            '%s:%s AND NOT _id:%s',
            $argField,
            escapeQueryStringTerm($bucket->key),
            escapeQueryStringTerm($bucket->doc->hits->hits[0]->_id)
        );

        $deleteResponse = queryES('DELETE', '_query?' . http_build_query(['q' => $deleteQuery]));

        // If deletions fail, the same buckets would come back on every pass
        // and the script would never terminate, so stop here.
        // NOTE(review): assumes error replies carry a top-level `error` key.
        if ($deleteResponse === null || isset($deleteResponse->error)) {
            fwrite(STDERR, sprintf(
                'Delete failed for %s:%s, aborting: %s%s',
                $argField,
                $bucket->key,
                json_encode($deleteResponse),
                PHP_EOL
            ));
            exit(1);
        }
    }

    // Give Elasticsearch time to refresh so that the next search no longer
    // sees the documents deleted above.
    sleep(1 + $argPauseTime);
}

/**
 * Escapes a value so that it is read as one literal term inside a
 * query_string expression.
 *
 * Every reserved query-string character and every whitespace character is
 * prefixed with a backslash. `<` and `>` are left untouched.
 *
 * @param mixed $value Value to embed in the query (cast to string).
 *
 * @return string The escaped value.
 */
function escapeQueryStringTerm($value)
{
    return preg_replace('/[+\-=&|!(){}\[\]^"~*?:\\\\\/\s]/', '\\\\$0', (string) $value);
}
/**
 * Sends one HTTP request to Elasticsearch and returns the decoded JSON reply.
 *
 * The URL is http://HOST:9200/INDEX/SERVICE, where HOST and INDEX come from
 * the global variables $argHost and $argIndex.
 *
 * @param string            $method  HTTP verb, e.g. 'POST' or 'DELETE'.
 * @param string            $service Path below the index, optionally with a
 *                                   query string. A leading slash is ignored.
 * @param array|string|null $data    Request body. Strings are sent as they
 *                                   are, anything else is JSON-encoded. An
 *                                   empty value sends no body.
 *
 * @return mixed The JSON-decoded reply (objects as stdClass), or null when the
 *               request could not be performed or the reply is not valid JSON.
 */
function queryES($method, $service, $data = null)
{
    global $argHost, $argIndex;

    // Strip a leading slash so that '/_search' does not become 'INDEX//_search'.
    $url = sprintf('http://%s:9200/%s/%s', $argHost, $argIndex, ltrim($service, '/'));

    $ch = curl_init($url);
    if ($ch === false) {
        fwrite(STDERR, sprintf('Unable to initialise cURL for %s%s', $url, PHP_EOL));

        return null;
    }

    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_CUSTOMREQUEST, $method);

    if (!empty($data)) {
        curl_setopt($ch, CURLOPT_POSTFIELDS, is_string($data) ? $data : json_encode($data));
        // Without an explicit header cURL labels the body as form-urlencoded.
        curl_setopt($ch, CURLOPT_HTTPHEADER, ['Content-Type: application/json']);
    }

    $rawResponse = curl_exec($ch);

    if ($rawResponse === false) {
        // Report transport errors instead of silently decoding `false`.
        fwrite(STDERR, sprintf(
            'Request %s %s failed: %s%s',
            $method,
            $url,
            curl_error($ch),
            PHP_EOL
        ));
        curl_close($ch);

        return null;
    }

    curl_close($ch);

    return json_decode($rawResponse);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment