Created
February 22, 2013 16:29
-
-
Save benoit-intrw/5014658 to your computer and use it in GitHub Desktop.
Elasticsearch: test french stemmer
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Script and configuration to test the Elasticsearch French stemmers.
#
# Usage: test_stemmer hostname indexname
#   hostname   IP or hostname of the Elasticsearch node (port 9200 is added)
#   indexname  name of the index to (re)create (example: test)
#
# WARNING: the index is deleted (after confirmation) and recreated with three
# custom analyzers, one per stemmer filter: french, light_french and
# minimal_french. A sample French text is then run through each analyzer and
# the resulting tokens are printed on one line.
#
# Exit status:
#   0  finished
#   1  missing argument(s)
#   2  cancelled at the confirmation prompt
#   3  curl could not send the index settings

set -u

# Check arguments
hostname=${1:-}
indexname=${2:-}
if [[ -z "$hostname" || -z "$indexname" ]]; then
  {
    echo "Usage: test_stemmer hostname indexname"
    echo " with:"
    echo " - hostname: IP or hostname (port 9200 is added)"
    echo " - indexname: name of the index (example test)"
  } >&2
  exit 1
fi

# Test string. Note: a heredoc is used so that characters that are hard to
# escape in bash can be tested; the delimiter is quoted so nothing in the
# text ($, backticks, backslashes) is expanded by the shell.
teststring=$(
  cat <<'EOF'
"En sa totalité l'archive n'est pas descriptible, et elle est incontournable en son actualité." - Michel Foucault, L'Archéologie du Savoir
La Martinique est une île faisant partie de l'archipel des Antilles, elle est située dans la mer des Caraïbes.
En linguistique, la racinisation (ou désuffixation, ou stemming en anglais) est le nom donné au procédé qui vise à transformer les flexions en leur radical ou stemme.
EOF
)

# Build URI
baseuri="http://$hostname:9200/$indexname"

#######################################
# Run the test string through one analyzer and print its tokens on one line.
# Globals:   baseuri (read), teststring (read)
# Arguments: $1 - label printed after "Stemmer "
#            $2 - name of the analyzer defined in the index settings
# Outputs:   heading line, then the space-separated tokens, to stdout
#######################################
analyze_with() {
  local label=$1
  local analyzer=$2

  echo "Stemmer $label"
  # The pretty-printed response is filtered with grep/cut rather than a JSON
  # parser to avoid an extra dependency; a token that itself contains a
  # double quote would be truncated by the cut.
  curl -s -XGET "$baseuri/_analyze?pretty=true&analyzer=$analyzer" -d "$teststring" \
    | grep '"token"' \
    | cut -d '"' -f 4 \
    | tr '\n' ' '
  echo ""
}

# Confirm before delete
printf '%s' "Index $baseuri will be deleted. Continue ? [yn] "
are_you_sure=''
read -r are_you_sure
if [[ "$are_you_sure" != "y" ]]; then
  echo "Cancelled!"
  exit 2
fi

# Delete index (best effort: the index may not exist yet)
printf '%s' "Deleting ... "
curl -XDELETE "$baseuri/"
echo ""

# Load settings: one custom analyzer per stemmer filter, on a single shard
# without replicas.
printf '%s' "Loading settings ... "
if ! curl -XPUT "$baseuri/" -d '
{
  "settings": {
    "index": {
      "analysis": {
        "analyzer": {
          "french_stemmer": {
            "filter": [
              "french_stemmer"
            ],
            "tokenizer": "standard",
            "type": "custom"
          },
          "light_french_stemmer": {
            "filter": [
              "light_french_stemmer"
            ],
            "tokenizer": "standard",
            "type": "custom"
          },
          "minimal_french_stemmer": {
            "filter": [
              "minimal_french_stemmer"
            ],
            "tokenizer": "standard",
            "type": "custom"
          }
        },
        "filter": {
          "french_stemmer": {
            "name": "french",
            "type": "stemmer"
          },
          "light_french_stemmer": {
            "name": "light_french",
            "type": "stemmer"
          },
          "minimal_french_stemmer": {
            "name": "minimal_french",
            "type": "stemmer"
          }
        }
      },
      "number_of_replicas": 0,
      "number_of_shards": 1
    }
  }
}'; then
  echo ""
  # Without the settings the analyzers do not exist, so the analysis calls
  # below would be meaningless.
  echo "Error: could not send settings to $baseuri" >&2
  exit 3
fi
echo ""

echo ""
echo "Test string : $teststring"
echo ""
analyze_with "french" "french_stemmer"
analyze_with "light_french_stemmer" "light_french_stemmer"
analyze_with "minimal_french_stemmer" "minimal_french_stemmer"

# End.
echo "Done!"
exit 0
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment