Elasticsearch: test analyser for text like 'R&D' or 'Canal+'
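The script below deletes and recreates an index, defines a reference French analyzer (francais) plus four test analyzers (test1 to test4), and runs the same test string, "Canal+ annonce 3 ans de R&D !", through each of them with the _analyze API. The interesting case is test4: its mapping char filter rewrites "&" to "et" and "+" to "plus" before tokenization, so "R&D" and "Canal+" are not split apart and discarded by the standard tokenizer.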
#!/bin/bash
# Script and configuration to test analyzers
#
# Check arguments
hostname=$1
indexname=$2
if [ -z "$hostname" ] || [ -z "$indexname" ]
then
  echo "Usage: test_analyzer hostname indexname"
  echo " with:"
  echo " - hostname: IP or hostname (port 9200 is added)"
  echo " - indexname: name of the index (example: test)"
  exit 1
fi
# Test string; note: the HEREDOC makes it easy to test characters that are hard to escape in bash
teststring=$( cat <<EOF
Canal+ annonce 3 ans de R&D !
EOF
)
# Build URI
baseuri="http://$hostname:9200/$indexname"
# Confirm before delete
#
echo -n "Index $baseuri will be deleted. Continue? [y/n] "
read are_you_sure
if [ "$are_you_sure" != "y" ]
then
  echo "Cancelled!"
  exit 2
fi
# Delete index
echo -n "Deleting ... "
curl -XDELETE "$baseuri/"
echo ""
# Load settings
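# Analyzers defined below:
#   francais - standard tokenizer + lowercase, stop_francais, fr_stemmer, asciifolding, elision filters
#   test1    - standard tokenizer only
#   test2    - whitespace tokenizer only
#   test3    - same filters as francais, whitespace tokenizer
#   test4    - same filters as francais, standard tokenizer, plus the my_mapping char filter ("&" => "et", "+" => "plus")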
echo -n "Loading settings ... " | |
curl -XPUT "$baseuri/" -d ' | |
{ | |
"settings": { | |
"index": { | |
"analysis": { | |
"analyzer": { | |
"francais": { | |
"filter": [ | |
"lowercase", | |
"stop_francais", | |
"fr_stemmer", | |
"asciifolding", | |
"elision" | |
], | |
"tokenizer": "standard", | |
"type": "custom" | |
} , | |
"test1": { | |
"tokenizer": "standard", | |
"type": "custom" | |
}, | |
"test2": { | |
"tokenizer": "whitespace", | |
"type": "custom" | |
}, | |
"test3": { | |
"filter": [ | |
"lowercase", | |
"stop_francais", | |
"fr_stemmer", | |
"asciifolding", | |
"elision" | |
], | |
"tokenizer": "whitespace", | |
"type": "custom" | |
}, | |
"test4": { | |
"filter": [ | |
"lowercase", | |
"stop_francais", | |
"fr_stemmer", | |
"asciifolding", | |
"elision" | |
], | |
"tokenizer": "standard", | |
"char_filter" : ["my_mapping"], | |
"type": "custom" | |
} | |
}, | |
"filter": { | |
"elision": { | |
"articles": [ "l", "m", "t", "qu", "n", "s", "j", "d" ], | |
"type": "elision" | |
}, | |
"fr_stemmer": { | |
"name": "french", | |
"type": "stemmer" | |
}, | |
"stop_francais": { | |
"stopwords": [ | |
"_french_" | |
], | |
"type": "stop" | |
} | |
}, | |
"char_filter" : { | |
"my_mapping" : { | |
"type" : "mapping", | |
"mappings" : ["&=>et", "+=>plus"] | |
} | |
} | |
}, | |
"number_of_replicas": 0, | |
"number_of_shards": 1 | |
} | |
} | |
}' | |
echo "" | |
echo "" | |
echo "Test string : $teststring" | |
echo "" | |
echo "Analyzer custom français (francais)" | |
curl -XGET "$hostname:9200/$indexname/_analyze?pretty=true&analyzer=francais" -d "$teststring" | |
echo "" | |
echo "Tokenizer standard seul (test1)" | |
curl -XGET "$hostname:9200/$indexname/_analyze?pretty=true&analyzer=test1" -d "$teststring" | |
echo "" | |
echo "Tokenizer whitespace seul (test2)" | |
curl -XGET "$hostname:9200/$indexname/_analyze?pretty=true&analyzer=test2" -d "$teststring" | |
echo "" | |
echo "Analyzer custom français avec tokenizer whitespace (test3)" | |
curl -XGET "$hostname:9200/$indexname/_analyze?pretty=true&analyzer=test3" -d "$teststring" | |
echo "" | |
echo "Analyzer custom français avec tokenizer standard et mapping (test4)" | |
curl -XGET "$hostname:9200/$indexname/_analyze?pretty=true&analyzer=test4" -d "$teststring" | |
echo "" | |
# End. | |
echo "Done!" | |
exit 0 |
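Example run, assuming the script is saved as test_analyzer.sh (the filename is only illustrative) and Elasticsearch is reachable on port 9200 of localhost; "test" is the index name, which will be deleted and recreated:

    chmod +x test_analyzer.sh
    ./test_analyzer.sh localhost test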