Created
July 24, 2020 10:17
-
-
Save insekticid/452fc7e5619d3a2dff2d9c6d924c50b9 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sitemapModule from 'sitemap'; | |
import axios from 'axios'; | |
const { buildSitemapIndex, createSitemapsAndIndex } = sitemapModule; | |
// Axios instance used for every Elasticsearch call in this script. The host and
// port are taken from the ELASTIC_SERVER / ELASTIC_PORT environment variables
// and responses are requested as JSON.
const elasticsearch = axios.create({
  responseType: 'json',
  baseURL: `http://${process.env.ELASTIC_SERVER}:${process.env.ELASTIC_PORT}`,
});

// Pass-through request hook, kept as a place to log outgoing requests when debugging.
elasticsearch.interceptors.request.use((outgoing) => {
  // console.log('Starting Request', outgoing)
  return outgoing;
});

// Pass-through response hook, kept as a place to log incoming responses when debugging.
elasticsearch.interceptors.response.use((incoming) => {
  // console.log('Response:', incoming)
  return incoming;
});
/**
 * Build the request body for a terms aggregation over one field.
 *
 * The body asks for no hits (`size: 0`) and a single aggregation named
 * `group_by_state` that buckets documents by the distinct values of `field`.
 *
 * @param {string} field - Name of the field to aggregate on (e.g. 'name.keyword').
 * @param {number} [size=500000] - Maximum number of buckets to request from the
 *   terms aggregation. Defaults to the value this script has always used.
 * @returns {object} Plain object ready to be sent as the `_search` request body.
 */
const elasticData = (field, size = 500000) => ({
  // No hits are requested below, so this source filter only matters if the
  // top-level `size` is ever raised; it is kept so the request body is unchanged.
  _source: {
    includes: ['name', 'url'],
  },
  size: 0,
  aggs: {
    group_by_state: {
      terms: {
        field,
        size,
      },
    },
  },
});
// Accumulator for every sitemap entry ({ url }) collected during the run.
const sitemapPaths = [];

/**
 * Hand the collected entries to the sitemap library, which is asked to write
 * the sitemap files and their index into the `sitemap-search` folder.
 *
 * @param {Array<{url: string}>} urls - Entries to include in the sitemaps.
 * @returns {*} Whatever `createSitemapsAndIndex` returns (awaited by the caller).
 */
const createIndex = (urls) => {
  const options = {
    urls: urls,
    lastmod: new Date().toISOString(),
    targetFolder: 'sitemap-search',
    hostname: process.env.SITEMAP_URL,
    cacheTime: 600,
    sitemapName: 'sitemap',
    sitemapSize: 10000, // number of urls to allow in each sitemap
    gzip: true, // whether to gzip the files
  };
  return createSitemapsAndIndex(options);
};
/**
 * Turn one aggregation bucket into a sitemap entry and append it to
 * `sitemapPaths`. The entry URL is SITEMAP_KEY_URL followed by the bucket key.
 *
 * @param {{key: *}} bucket - Aggregation bucket; only `key` is read.
 * @returns {void}
 */
const prepareUrl = (bucket) => {
  const entry = { url: process.env.SITEMAP_KEY_URL + bucket.key };
  sitemapPaths.push(entry);
};
/**
 * Collect sitemap entries from Elasticsearch and write the sitemap files.
 *
 * Runs one terms aggregation per field, in order (`name.keyword`, then
 * `name.analyzed`), appends every returned bucket to `sitemapPaths` through
 * `prepareUrl`, and finally passes the accumulated list to `createIndex`.
 * The passes run sequentially so the entry order and the progress log lines
 * stay the same from run to run.
 *
 * NOTE(review): both aggregations can return the same key, which would put the
 * same URL into the sitemap twice — confirm whether de-duplication is wanted.
 *
 * @returns {Promise<void>} Resolves once the sitemap index has been created;
 *   rejects if a request fails or the response has no `group_by_state` buckets.
 */
const getData = async () => {
  // [field to aggregate on, label used in the progress log line]
  const passes = [
    ['name.keyword', 'done keyword'],
    ['name.analyzed', 'done analyzed'],
  ];

  for (const [field, label] of passes) {
    const response = await elasticsearch.post(
      `${process.env.ELASTIC_INDEX}/_search`,
      elasticData(field),
    );
    const buckets = response.data.aggregations.group_by_state.buckets;
    // prepareUrl is synchronous and only pushes into sitemapPaths, so a plain
    // forEach is enough; there is nothing to await here.
    buckets.forEach((bucket) => prepareUrl(bucket));
    console.log(label, buckets.length, sitemapPaths.length);
  }

  await createIndex(sitemapPaths);
  console.log('done all', sitemapPaths.length);
};

// Never leave the promise floating: report the failure and exit non-zero so
// a scheduler or CI job running this script can detect it.
getData().catch((error) => {
  console.error('sitemap generation failed', error);
  process.exitCode = 1;
});
Author
insekticid
commented
Jul 24, 2020
POST content/_search
{
"size": 0,
"aggs": {
"group_by_state": {
"terms": {
"field": "name.analyzed",
"size": 1000
}
}
}
}
{
"took" : 548,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 104701,
"max_score" : 0.0,
"hits" : [ ]
},
"aggregations" : {
"group_by_state" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "salát",
"doc_count" : 5365
},
{
"key" : "kuřecí",
"doc_count" : 5178
},
{
"key" : "polévka",
"doc_count" : 4902
},
{
"key" : "pečený",
"doc_count" : 4086
},
{
"key" : "koláč",
"doc_count" : 3888
},
{
"key" : "omáčka",
"doc_count" : 3833
},
{
"key" : "vepřový",
"doc_count" : 3155
},
{
"key" : "maso",
"doc_count" : 3097
},
GET /recipe,recipes/_search
{
"_source": {
"includes": [ "name", "url" ]
},
"size": 0,
"aggs": {
"group_by_state": {
"terms": {
"field": "name.keyword",
"size": 3000
}
}
}
}
{
"took" : 3137,
"timed_out" : false,
"_shards" : {
"total" : 10,
"successful" : 10,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 125593,
"max_score" : 0.0,
"hits" : [ ]
},
"aggregations" : {
"group_by_state" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "perníčky",
"doc_count" : 23
},
{
"key" : "vanilkové+rohlíčky",
"doc_count" : 23
},
{
"key" : "tiramisu",
"doc_count" : 22
},
{
"key" : "kuře+na+paprice",
"doc_count" : 16
},
{
"key" : "čokoládová+pěna",
"doc_count" : 16
},
{
"key" : "bramborové+knedlíky",
"doc_count" : 15
},
{
"key" : "polévka+z+červené+čočky",
"doc_count" : 15
},
{
"key" : "vánočka",
"doc_count" : 15
},
{
"key" : "brokolicová+polévka",
"doc_count" : 14
},
GET /recipe,recipes/_search
{
"_source": {
"includes": [ "name", "url" ]
},
"query": {
"bool": {
"should": [
{
"match": {
"name": {
"query": "svarak",
"fuzziness": 1
}
}
}
]
}
},
"indices_boost": [
{
"recipe": 1.8
}
],
"from": 0,
"size": 48
}
{
"took" : 9,
"timed_out" : false,
"_shards" : {
"total" : 10,
"successful" : 10,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 20,
"max_score" : 17.028666,
"hits" : [
{
"_index" : "recipe",
"_type" : "recipe",
"_id" : "4278",
"_score" : 17.028666,
"_source" : {
"name" : "Medový svařák",
"url" : "https://www.recepty.eu/teple-napoje/medovy-svarak.html"
}
},
{
"_index" : "recipe",
"_type" : "recipe",
"_id" : "15094",
"_score" : 14.939789,
"_source" : {
"name" : "Svařák s jablkem a hruškou",
"url" : "https://www.recepty.eu/teple-napoje/svarak-s-jablkem-a-hruskou.html"
}
},
<?php
declare(strict_types=1);
/*
* This file is part of Recepty.eu project
* (c) Exploit.cz <[email protected]>
* (c) Recepty.eu <[email protected]>
*
* This source file is subject to the proprietary license.
*/
namespace App\Repository;
use App\Entity\Category;
use Elastica\Query;
use Elastica\Query\BoolQuery;
use Elastica\Query\Match;
use Elastica\Query\Terms;
use FOS\ElasticaBundle\Repository;
class SearchRepository extends Repository
{
    use PagingTrait;

    /**
     * Run a fuzzy search on the "name" field, optionally restricted to one category.
     *
     * @param string        $searchTerm Text to look for; an empty string means "no search".
     * @param Category|null $category   When given, only documents of this category are returned.
     * @param int           $page       Page number handed to the paginator.
     * @param int           $limit      Maximum items per page.
     *
     * @return array|null ['items' => paginated result, 'searchTerm' => string],
     *                    or null when $searchTerm is empty.
     */
    public function search(string $searchTerm, ?Category $category = null, int $page = 1, int $limit = 48) : ?array
    {
        // Compare against '' explicitly: a bare truthiness check would also
        // reject the perfectly valid search term "0".
        if ($searchTerm === '') {
            return null;
        }

        $boolQuery = $this->baseQuery($searchTerm, $category);
        $query = Query::create($boolQuery);
        // Weight hits coming from the "recipe" index higher than the others.
        $query->setParam('indices_boost', [['recipe' => 1.8]]);

        $items = $this->findPaginated($query);
        $items->setMaxPerPage($limit);
        $items->setCurrentPage($page);

        return ['items' => $items, 'searchTerm' => $searchTerm];
    }

    /**
     * Build the bool query shared by searches: a fuzzy match on "name" plus an
     * optional category restriction.
     *
     * @param string        $searchTerm Text matched against "name" with fuzziness 1.
     * @param Category|null $category   Optional category whose id must be present in "category".
     *
     * @return BoolQuery
     */
    protected function baseQuery(string $searchTerm, ?Category $category = null) : BoolQuery
    {
        $boolQuery = new BoolQuery();

        $fieldQuery = new Match();
        $fieldQuery->setFieldQuery('name', $searchTerm);
        $fieldQuery->setFieldFuzziness('name', 1);
        $boolQuery->addShould($fieldQuery);
        // Once a "must" clause is present, "should" clauses become optional by
        // default, so a category search would return every document of that
        // category. Requiring one "should" match keeps the name match mandatory
        // with and without a category.
        $boolQuery->setMinimumShouldMatch(1);

        if ($category) {
            $categoryQuery = new Terms();
            $categoryQuery->setTerms('category', [$category->getId()]);
            $boolQuery->addMust($categoryQuery);
        }

        return $boolQuery;
    }
}
<?php
declare(strict_types=1);
/*
* This file is part of Recepty.eu project
* (c) Exploit.cz <[email protected]>
* (c) Recepty.eu <[email protected]>
*
* This source file is subject to the proprietary license.
*/
namespace App\Repository;
use App\Util\LimitedPagerfanta;
use Doctrine\ORM\Query;
use Doctrine\ORM\QueryBuilder;
use Pagerfanta\Adapter\DoctrineORMAdapter;
use Pagerfanta\Pagerfanta;
trait PagingTrait
{
    /**
     * Wrap a Doctrine query in a pager positioned on the requested page.
     *
     * @param Query    $query             Doctrine query to paginate.
     * @param int      $limit             Maximum items per page.
     * @param int      $page              Page to make current.
     * @param int|null $limitedPageNumber When truthy, a LimitedPagerfanta is used and
     *                                    receives this value via setLimitedPageNumber().
     *
     * @return Pagerfanta
     */
    protected function createPaginator(Query $query, int $limit, int $page, ?int $limitedPageNumber = null) : Pagerfanta
    {
        $adapter = new DoctrineORMAdapter($query);

        if (!$limitedPageNumber) {
            // No page cap requested (null or 0): use the stock pager.
            $paginator = new Pagerfanta($adapter);
        } else {
            $paginator = new LimitedPagerfanta($adapter);
            $paginator->setLimitedPageNumber($limitedPageNumber);
        }

        $paginator->setMaxPerPage($limit);
        $paginator->setCurrentPage($page);

        return $paginator;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment