-
-
Save korczis/20561d3c49522eab2b689185b425e4ae to your computer and use it in GitHub Desktop.
Čeština v elasticsearch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh
# Fetch Elasticsearch 0.20.4, start it from an empty data directory and
# confirm that its HTTP endpoint on localhost:9200 answers.

# download (stop here on failure: nothing below makes sense without the archive)
wget http://download.elasticsearch.org/elasticsearch/elasticsearch/elasticsearch-0.20.4.zip || exit 1

# unzip and start
unzip elasticsearch-0.20.4.zip
# Abort if the directory is missing: the rm -rf below must never run in
# whatever directory the script happened to be started from.
cd elasticsearch-0.20.4 || exit 1

# remove data in case you have defined some analyzers in the past (e.g. stop/start)
rm -rf data/

# NOTE(review): the rest of the script only runs once this command returns, so
# it presumably relies on the launcher putting the node into the background --
# confirm for the Elasticsearch version in use.
./bin/elasticsearch

# give it some time to start up: probe once per second instead of sleeping a
# fixed interval, and give up after 30 failed probes
attempts=0
until curl -s localhost:9200 >/dev/null 2>&1; do
  attempts=$((attempts + 1))
  if [ "$attempts" -ge 30 ]; then
    echo 'elasticsearch did not answer on localhost:9200 after 30 attempts' >&2
    exit 1
  fi
  sleep 1
done

# is it running?
curl localhost:9200
# Setup analyzers:
# Create the index "test" with four analyzers to compare:
#   cestina1 - analyzer of type "czech"
#   cestina2 - custom: standard tokenizer + standard, lowercase, czech_stemmer1
#   cestina3 - custom: standard tokenizer + standard, lowercase, czech_stemmer2
#   cestina4 - like cestina3, with the czech_stop filter ahead of the stemmer
# czech_stemmer1 and czech_stemmer2 are two ways of declaring the stemmer
# filter ("stemmer" with name "czech" vs. the "czech_stem" type).
# The request body is single-quoted so the shell passes the JSON through verbatim.
curl -X PUT localhost:9200/test -d '
{
  "settings" : {
    "analysis" : {
      "analyzer" : {
        "cestina1" : {
          "type" : "czech"
        },
        "cestina2" : {
          "type" : "custom",
          "tokenizer" : "standard",
          "filter" : [ "standard", "lowercase", "czech_stemmer1" ]
        },
        "cestina3" : {
          "type" : "custom",
          "tokenizer" : "standard",
          "filter" : [ "standard", "lowercase", "czech_stemmer2" ]
        },
        "cestina4" : {
          "type" : "custom",
          "tokenizer" : "standard",
          "filter" : [ "standard", "lowercase", "czech_stop", "czech_stemmer2" ]
        }
      },
      "filter" : {
        "czech_stemmer1" : {
          "type" : "stemmer",
          "name" : "czech"
        },
        "czech_stemmer2" : {
          "type" : "czech_stem"
        },
        "czech_stop" : {
          "type" : "stop",
          "stopwords" : ["_czech_"]
        }
      }
    }
  }
}'
# Run the same phrase through each analyzer of the "test" index and print the
# resulting tokens.
# Phrase: "Bankovní poplatky jsou nehorázné" (URL-encoded as UTF-8 below)
phrase='Bankovn%C3%AD%20poplatky%20jsou%20nehor%C3%A1zn%C3%A9'

# cestina1: preconfigured czech analyzer
curl "localhost:9200/test/_analyze?pretty=1&analyzer=cestina1&text=${phrase}"

# cestina2: custom analyzer using czech stemmer
curl "localhost:9200/test/_analyze?pretty=1&analyzer=cestina2&text=${phrase}"

# cestina3: custom analyzer using czech stemmer (using a little shorter notation)
curl "localhost:9200/test/_analyze?pretty=1&analyzer=cestina3&text=${phrase}"

# cestina4: Note both cestina2 and cestina3 did not exclude stop words. Let's add the czech stopwords list.
# Note this custom analyzer is in fact the same as what is preconfigured in cestina1 under the hood.
curl "localhost:9200/test/_analyze?pretty=1&analyzer=cestina4&text=${phrase}"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment