Skip to content

Instantly share code, notes, and snippets.

@ferronrsmith
Created August 17, 2019 03:59
Show Gist options
  • Select an option

  • Save ferronrsmith/6e3b61de9f79117cc0b6f47d13ed972f to your computer and use it in GitHub Desktop.

Select an option

Save ferronrsmith/6e3b61de9f79117cc0b6f47d13ed972f to your computer and use it in GitHub Desktop.
test French Stemmers
#!/bin/bash
# Script and configuration to test stemmer
#
# Usage: test_stemmer hostname indexname
#
# Check arguments: both positional parameters are required; when either one is
# missing or empty the usage text is printed and the script exits with status 1.
hostname=$1
indexname=$2
if [[ -z "$hostname" || -z "$indexname" ]]; then
  # Usage is a diagnostic, so it goes to stderr and never mixes with the
  # regular output of the script.
  {
    echo "Usage: test_stemmer hostname indexname"
    echo " with:"
    echo " - hostname: IP or hostname (port 9200 is added)"
    echo " - indexname: name of the index (example test)"
  } >&2
  exit 1
fi
# Test string. Note: the here-document makes it possible to test all the
# characters that are hard to escape in bash. The delimiter is quoted, so the
# body is taken literally: the \" sequence stays a backslash followed by a
# double quote in the variable.
IFS= read -r teststring <<'EOF'
En sa totalité l'archive n'est pas descriptible, et elle est incontournable en son actualité.\" - Michel Foucault, L'Archéologie du Savoir La Martinique est une île faisant partie de l'archipel des Antilles, elle est située dans la mer des Caraïbes. En linguistique, la racinisation (ou désuffixation, ou stemming en anglais) est le nom donné au procédé qui vise à transformer les flexions en leur radical ou stemme.
EOF
# Build URI: base address of the index, always on port 9200 of the given host.
baseuri="http://${hostname}:9200/${indexname}"
# Confirm before delete
#
# Deleting the index is destructive: any answer other than "y" (including an
# empty answer or end of input) cancels the script with exit status 2.
printf '%s' "Index $baseuri will be deleted. Continue ? [yn] "
# -r keeps backslashes in the answer literal instead of treating them as
# escape characters or line continuations.
read -r are_you_sure
if [[ "$are_you_sure" != "y" ]]; then
  echo "Cancelled!"
  exit 2
fi
# Delete index
# The URL is quoted so that it always reaches curl as a single argument, with
# no word splitting or glob expansion on the host or index name.
printf '%s' "Deleting ... "
curl -XDELETE "$baseuri/"
echo ""
# Load settings
# The request body declares three custom analyzers (standard tokenizer plus one
# stemmer filter each) and the three stemmer filters they refer to ("french",
# "light_french", "minimal_french"), with one shard and no replica.
printf '%s' "Loading settings ... "
index_settings='
{
"settings": {
"index": {
"analysis": {
"analyzer": {
"french_stemmer": {
"filter": [
"french_stemmer"
],
"tokenizer": "standard",
"type": "custom"
} ,
"light_french_stemmer": {
"filter": [
"light_french_stemmer"
],
"tokenizer": "standard",
"type": "custom"
} ,
"minimal_french_stemmer": {
"filter": [
"minimal_french_stemmer"
],
"tokenizer": "standard",
"type": "custom"
}
},
"filter": {
"french_stemmer": {
"name": "french",
"type": "stemmer"
},
"light_french_stemmer": {
"name": "light_french",
"type": "stemmer"
},
"minimal_french_stemmer": {
"name": "minimal_french",
"type": "stemmer"
}
}
},
"number_of_replicas": 0,
"number_of_shards": 1
}
}
}'
curl --request PUT "$baseuri/" \
  --header 'Content-Type: application/json' \
  --data "$index_settings"
printf '\n'
#######################################
# Send the test string through one analyzer of the index and pretty-print
# the reply.
# Globals:   baseuri, teststring (read)
# Arguments: $1 - label printed after "Stemmer "
#            $2 - name of the analyzer to apply
# Outputs:   a blank line, the label line, then curl's response piped
#            through jq, all on stdout
#######################################
run_stemmer() {
  local label=$1
  local analyzer=$2
  printf '\n%s\n' "Stemmer $label"
  # $teststring is spliced into the JSON body verbatim, so it must already be
  # valid JSON string content (a double quote in it has to be written as \").
  curl -s -XGET "$baseuri/_analyze" -H 'Content-Type: application/json' -d "
{
\"analyzer\" : \"$analyzer\",
\"text\" : \"$teststring\"
}
" | jq .
}

echo ""
printf '%s\n' "Test string : $teststring"
# One request per analyzer; the same base URI as the delete and create
# requests is reused so that all calls target the same index address.
run_stemmer "french" "french_stemmer"
run_stemmer "light_french_stemmer" "light_french_stemmer"
run_stemmer "minimal_french_stemmer" "minimal_french_stemmer"
# End.
echo "Done!"
exit 0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment