Created
November 30, 2017 18:35
-
-
Save wsalesky/4c7758f1fc6b1c416139d1b3151a22f6 to your computer and use it in GitHub Desktop.
An edit to eXist-db's NoDiacriticsStandardAnalyzer.java (https://github.com/eXist-db/exist/blob/9b0876bf8bdab0c67ca5e9e7e0b954022f8fda07/extensions/indexes/lucene/src/org/exist/indexing/lucene/analyzers/NoDiacriticsStandardAnalyzer.java) to use ICUFoldingFilter; this works better for Syriac searching.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* eXist Open Source Native XML Database | |
* Copyright (C) 2001-2015 The eXist Project | |
* http://exist-db.org | |
* | |
* This program is free software; you can redistribute it and/or | |
* modify it under the terms of the GNU Lesser General Public License | |
* as published by the Free Software Foundation; either version 2 | |
* of the License, or (at your option) any later version. | |
* | |
* This program is distributed in the hope that it will be useful, | |
* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
* GNU Lesser General Public License for more details. | |
* | |
* You should have received a copy of the GNU Lesser General Public | |
* License along with this library; if not, write to the Free Software | |
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | |
*/ | |
package org.exist.indexing.lucene.analyzers; | |
import org.apache.lucene.analysis.*; | |
import org.apache.lucene.analysis.icu.*; | |
import org.apache.lucene.analysis.core.LowerCaseFilter; | |
import org.apache.lucene.analysis.core.StopAnalyzer; | |
import org.apache.lucene.analysis.core.StopFilter; | |
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter; | |
import org.apache.lucene.analysis.standard.StandardFilter; | |
import org.apache.lucene.analysis.standard.StandardTokenizer; | |
import org.apache.lucene.analysis.util.CharArraySet; | |
import org.apache.lucene.analysis.util.StopwordAnalyzerBase; | |
import org.apache.lucene.analysis.util.WordlistLoader; | |
import org.apache.lucene.util.Version; | |
import org.exist.indexing.lucene.LuceneIndex; | |
import java.io.IOException; | |
import java.io.Reader; | |
/**
 * Updated by Winona Salesky to use ICUFoldingFilter instead of ASCIIFoldingFilter.
 * Useful for Syriac, possibly also Arabic and Hebrew.
 * Requires eXist-db 3.6.0 or later.
 * A copy of StandardAnalyzer using an additional ICUFoldingFilter to
 * strip diacritics.
 */
public class NoDiacriticsStandardAnalyzer extends StopwordAnalyzerBase { | |
/** Default maximum allowed token length */ | |
public static final int DEFAULT_MAX_TOKEN_LENGTH = 255; | |
private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH; | |
/** | |
* Specifies whether deprecated acronyms should be replaced with HOST type. | |
* See {@linkplain "https://issues.apache.org/jira/browse/LUCENE-1068"} | |
*/ | |
private final boolean replaceInvalidAcronym; | |
/** An unmodifiable set containing some common English words that are usually not | |
useful for searching. */ | |
public static final CharArraySet STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET; | |
/** Builds an analyzer with the given stop words. | |
* @param stopWords stop words | |
*/ | |
public NoDiacriticsStandardAnalyzer(final CharArraySet stopWords) { | |
super(stopWords); | |
replaceInvalidAcronym = true; | |
} | |
/** Builds an analyzer with the given stop words. | |
* @param matchVersion Lucene version to match See {@link | |
* <a href="#version">above</a>} | |
* @param stopWords stop words | |
* | |
* @deprecated Use {@link #NoDiacriticsStandardAnalyzer(CharArraySet)} | |
*/ | |
@Deprecated | |
public NoDiacriticsStandardAnalyzer(final Version matchVersion, final CharArraySet stopWords) { | |
super(matchVersion, stopWords); | |
replaceInvalidAcronym = matchVersion.onOrAfter(LuceneIndex.LUCENE_VERSION_IN_USE); | |
} | |
/** | |
/** Builds an analyzer with the default stop words ({@link | |
* #STOP_WORDS_SET}). | |
*/ | |
protected NoDiacriticsStandardAnalyzer() { | |
this((CharArraySet)null); | |
} | |
/** Builds an analyzer with the default stop words ({@link | |
* #STOP_WORDS_SET}). | |
* @param matchVersion Lucene version to match See {@link | |
* <a href="#version">above</a>} | |
* | |
* @deprecated Use {@link #NoDiacriticsStandardAnalyzer()} | |
*/ | |
@Deprecated | |
public NoDiacriticsStandardAnalyzer(final Version matchVersion) { | |
this(matchVersion, STOP_WORDS_SET); | |
} | |
/** Builds an analyzer with the stop words from the given reader. | |
* @see WordlistLoader#getWordSet(Reader, Version) | |
* @param matchVersion Lucene version to match See {@link | |
* <a href="#version">above</a>} | |
* @param stopwords Reader to read stop words from */ | |
public NoDiacriticsStandardAnalyzer(Version matchVersion, Reader stopwords) throws IOException { | |
this(matchVersion, WordlistLoader.getWordSet(stopwords, matchVersion)); | |
} | |
/** | |
* Set maximum allowed token length. If a token is seen | |
* that exceeds this length then it is discarded. This | |
* setting only takes effect the next time tokenStream or | |
* reusableTokenStream is called. | |
*/ | |
public void setMaxTokenLength(int length) { | |
maxTokenLength = length; | |
} | |
/** | |
* @see #setMaxTokenLength | |
*/ | |
public int getMaxTokenLength() { | |
return maxTokenLength; | |
} | |
@Override | |
protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) { | |
final StandardTokenizer src = new StandardTokenizer(getVersion(), reader); | |
src.setMaxTokenLength(maxTokenLength); | |
// src.setReplaceInvalidAcronym(replaceInvalidAcronym); | |
TokenStream tok = new StandardFilter(getVersion(), src); | |
tok = new ICUFoldingFilter(tok); | |
tok = new LowerCaseFilter(getVersion(), tok); | |
tok = new StopFilter(getVersion(), tok, stopwords); | |
return new TokenStreamComponents(src, tok); | |
// { | |
// @Override | |
// protected boolean reset(final Reader reader) throws IOException { | |
// src.setMaxTokenLength(NoDiacriticsStandardAnalyzer.this.maxTokenLength); | |
// return super.reset(reader); | |
// } | |
// }; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment