Skip to content

Instantly share code, notes, and snippets.

@marius92mc
Created May 4, 2015 11:52
Show Gist options
  • Save marius92mc/c5bc77923441e01f33d5 to your computer and use it in GitHub Desktop.
Save marius92mc/c5bc77923441e01f33d5 to your computer and use it in GitHub Desktop.
/**
 * Assembles the analysis chain: standard tokenization, lower-casing, two
 * stop-word passes, Romanian Snowball stemming and finally ASCII folding.
 *
 * @param fieldName name of the field being analyzed; not consulted here, so
 *                  every field receives the same chain
 * @return the components pairing the tokenizer with the last filter of the chain
 */
@Override
protected TokenStreamComponents createComponents(String fieldName)
{
    // Tokenize, then apply the standard filter and lower-case every token.
    final Tokenizer tokenizer = new StandardTokenizer();
    final TokenStream lowerCased = new LowerCaseFilter(new StandardFilter(tokenizer));

    // First stop-word pass: the stop set returned by getStopwordSet().
    final TokenStream withoutStopWords = new StopFilter(lowerCased, getStopwordSet());

    // Second stop-word pass: a set derived from the textual form of the default
    // Romanian stop set. NOTE(review): presumably the helper strips diacritics so
    // that text typed without them is filtered too - confirm against the helper.
    final String defaultStopWords = RomanianAnalyzer.getDefaultStopSet().toString();
    final TokenStream withoutPlainStopWords = new StopFilter(
            withoutStopWords,
            getRomanianStopWordsWithoutDiacritics(defaultStopWords));

    // Stem inflected forms while the tokens still carry their diacritics ...
    final TokenStream stemmed = new SnowballFilter(withoutPlainStopWords, new RomanianStemmer());

    // ... and only afterwards replace the diacritics with ASCII equivalents.
    final TokenStream asciiFolded = new ASCIIFoldingFilter(stemmed);

    return new TokenStreamComponents(tokenizer, asciiFolded);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment