Created
December 17, 2010 03:33
-
-
Save thoward/744444 to your computer and use it in GitHub Desktop.
An example of how to do something like Solr's copy fields in a Lucene Index...
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Collections.Generic; | |
using System.IO; | |
using System.Linq; | |
using Lucene.Net.Analysis; | |
using Lucene.Net.Analysis.Standard; | |
using Lucene.Net.Documents; | |
using Lucene.Net.Index; | |
using Lucene.Net.QueryParsers; | |
using Lucene.Net.Search; | |
using Lucene.Net.Store; | |
namespace MultiTokenStreamExample | |
{ | |
/// <summary>
/// Console demo: indexes a single document into an in-memory Lucene index with two
/// ordinary fields plus a "combined" <see cref="MultiField"/> built from both, then
/// runs a handful of queries against the combined field and prints the results.
/// </summary>
class Program
{
    /// <summary>
    /// Builds the index, runs the sample searches and waits for a key press.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        const string textContent = "This is some content. 123 is a number. 456 is also a number.";
        const string numericContent = "789 this is normal text";

        var standard = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29);
        var numeric = new NumericAnalyzer();
        var perField = new PerFieldAnalyzerWrapper(standard);
        perField.AddAnalyzer("numeric_content", numeric);

        // typical index creation
        var directory = new RAMDirectory();

        // Hand the per-field wrapper to the writer so "numeric_content" is analyzed the
        // same way in its own field as it is inside the combined field.
        var writer = new IndexWriter(directory, perField, IndexWriter.MaxFieldLength.UNLIMITED);
        try
        {
            var document = new Document();
            var textContentField = new Field("text_content", textContent, Field.Store.YES, Field.Index.ANALYZED);
            var numericContentField = new Field("numeric_content", numericContent, Field.Store.YES, Field.Index.ANALYZED);
            document.Add(textContentField);
            document.Add(numericContentField);

            // special sauce: one extra field whose tokens come from both fields above,
            // each analyzed with the analyzer registered for its own field name
            var combinedField =
                new MultiField("combined",
                               new List<Fieldable> { textContentField, numericContentField },
                               perField);
            document.Add(combinedField);

            writer.AddDocument(document);
            writer.Optimize();
        }
        finally
        {
            // release the writer even if indexing fails
            writer.Close();
        }

        // test searching against our combined field...
        Searcher searcher = new IndexSearcher(directory, true);
        try
        {
            // hits on first field
            SearchCombined(searcher, "number");

            // hits on first field
            SearchCombined(searcher, "123");

            // hits on second field
            SearchCombined(searcher, "789");

            // does not hit on second field because numeric analyzer skips that content.
            SearchCombined(searcher, "normal");
        }
        finally
        {
            searcher.Close();
        }

        Console.ReadKey();
    }

    /// <summary>
    /// Parses <paramref name="queryString"/> against the "combined" field, runs it and
    /// writes either a "no matches" line or one line per hit to the console.
    /// </summary>
    /// <param name="searcher">Searcher over the index built in <see cref="Main"/>.</param>
    /// <param name="queryString">Query text in QueryParser syntax.</param>
    private static void SearchCombined(Searcher searcher, string queryString)
    {
        // Use the same analyzer version at query time as was used at index time.
        QueryParser parser = new QueryParser("combined", new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29));
        Query query = parser.Parse(queryString);

        Hits hits = searcher.Search(query);
        int hitCount = hits.Length();

        if (hitCount == 0)
        {
            Console.WriteLine("No matches were found for \"" + queryString + "\"");
        }
        else
        {
            Console.WriteLine("Hits for \"" + queryString + "\":");

            for (int i = 0; i < hitCount; i++)
            {
                Document doc = hits.Doc(i);

                // Print the two stored source fields. The combined field itself is
                // index-only (MultiField.IsStored() returns false), so it cannot be read back.
                Console.WriteLine(" " + (i + 1) + "] [numeric_content: " + doc.Get("numeric_content") + "] [text_content: " + doc.Get("text_content") + "]");
            }
        }

        Console.WriteLine();
    }
}
/// <summary>
/// A <see cref="Fieldable"/> that is indexed (but not stored) under its own name and
/// whose token stream is the concatenation of the token streams of a list of other
/// fields. Each wrapped field is tokenized either by its own pre-built token stream
/// or by the analyzer that the supplied <see cref="PerFieldAnalyzerWrapper"/> selects
/// for that field's name.
/// </summary>
public class MultiField : Fieldable
{
    private readonly string _name;
    private readonly List<Fieldable> _fields;
    private readonly PerFieldAnalyzerWrapper _analyzerWrapper;

    /// <summary>Creates a combined field over <paramref name="fields"/>.</summary>
    /// <param name="name">Name under which the combined tokens are indexed.</param>
    /// <param name="fields">Source fields whose tokens are concatenated, in order.</param>
    /// <param name="analyzerWrapper">Supplies the analyzer for each source field by name.</param>
    /// <exception cref="ArgumentNullException">Any argument is null.</exception>
    public MultiField(string name, List<Fieldable> fields, PerFieldAnalyzerWrapper analyzerWrapper)
    {
        // Fail at construction time rather than later, in the middle of indexing.
        if (name == null) throw new ArgumentNullException("name");
        if (fields == null) throw new ArgumentNullException("fields");
        if (analyzerWrapper == null) throw new ArgumentNullException("analyzerWrapper");

        _name = name;
        _fields = fields;
        _analyzerWrapper = analyzerWrapper;
    }

    #region Fieldable Members

    /// <summary>Boosting is not supported; the value is ignored.</summary>
    public void SetBoost(float boost)
    {
        // intentionally a no-op: GetBoost() always reports 1
    }

    /// <summary>Always 1.</summary>
    public float GetBoost()
    {
        return 1;
    }

    /// <summary>The name given to the constructor.</summary>
    public string Name()
    {
        return _name;
    }

    /// <summary>Not implemented; the combined field only exposes a token stream.</summary>
    /// <exception cref="NotImplementedException">Always.</exception>
    public string StringValue()
    {
        throw new NotImplementedException();
    }

    /// <summary>Not implemented; the combined field only exposes a token stream.</summary>
    /// <exception cref="NotImplementedException">Always.</exception>
    public System.IO.TextReader ReaderValue()
    {
        throw new NotImplementedException();
    }

    /// <summary>Not implemented; the combined field only exposes a token stream.</summary>
    /// <exception cref="NotImplementedException">Always.</exception>
    public byte[] BinaryValue()
    {
        throw new NotImplementedException();
    }

    /// <summary>
    /// Returns a new <see cref="MultiTokenStream"/> over one token stream per wrapped
    /// field, in the order the fields were supplied.
    /// </summary>
    public TokenStream TokenStreamValue()
    {
        return new MultiTokenStream(_fields.Select(a => GetTokenStream(a)));
    }

    /// <summary>
    /// Picks the token stream for one wrapped field: its own token stream if it has
    /// one, otherwise the per-field analyzer applied to its reader or string value.
    /// </summary>
    private TokenStream GetTokenStream(Fieldable f)
    {
        TokenStream prebuilt = f.TokenStreamValue();
        if (prebuilt != null)
        {
            return prebuilt;
        }

        // NOTE(review): a reader can only be consumed once. If the same reader-valued
        // field is also added to the document on its own, confirm which consumer should win.
        TextReader reader = f.ReaderValue();
        if (reader == null)
        {
            // A field with neither reader nor string value contributes no tokens
            // instead of making StringReader throw on a null argument.
            reader = new StringReader(f.StringValue() ?? string.Empty);
        }

        return _analyzerWrapper.TokenStream(f.Name(), reader);
    }

    /// <summary>Always false: the combined content is never stored.</summary>
    public bool IsStored()
    {
        return false;
    }

    /// <summary>Always true.</summary>
    public bool IsIndexed()
    {
        return true;
    }

    /// <summary>Always true.</summary>
    public bool IsTokenized()
    {
        return true;
    }

    /// <summary>Always false.</summary>
    public bool IsCompressed()
    {
        return false;
    }

    /// <summary>Always true.</summary>
    public bool IsTermVectorStored()
    {
        return true;
    }

    /// <summary>Always true.</summary>
    public bool IsStoreOffsetWithTermVector()
    {
        return true;
    }

    /// <summary>Always true.</summary>
    public bool IsStorePositionWithTermVector()
    {
        return true;
    }

    /// <summary>
    /// True only when every wrapped field is binary (also true when there are none).
    /// </summary>
    public bool IsBinary()
    {
        foreach (var f in _fields)
        {
            if (!f.IsBinary()) return false;
        }
        return true;
    }

    /// <summary>
    /// True only when every wrapped field omits norms (also true when there are none).
    /// </summary>
    public bool GetOmitNorms()
    {
        foreach (var f in _fields)
        {
            if (!f.GetOmitNorms()) return false;
        }
        return true;
    }

    /// <summary>Not implemented; the value is derived from the wrapped fields.</summary>
    /// <exception cref="NotImplementedException">Always.</exception>
    public void SetOmitNorms(bool omitNorms)
    {
        throw new NotImplementedException();
    }

    /// <summary>Not implemented; the value is derived from the wrapped fields.</summary>
    /// <exception cref="NotImplementedException">Always.</exception>
    public void SetOmitTf(bool omitTf)
    {
        throw new NotImplementedException();
    }

    /// <summary>
    /// True only when every wrapped field omits term frequencies (also true when
    /// there are none).
    /// </summary>
    public bool GetOmitTf()
    {
        foreach (var f in _fields)
        {
            if (!f.GetOmitTf()) return false;
        }
        return true;
    }

    /// <summary>Always true.</summary>
    public bool IsLazy()
    {
        return true;
    }

    /// <summary>Always 0.</summary>
    public int GetBinaryOffset()
    {
        return 0;
    }

    /// <summary>Length of <see cref="BinaryValue"/>, which currently always throws.</summary>
    /// <exception cref="NotImplementedException">Always, via <see cref="BinaryValue"/>.</exception>
    public int GetBinaryLength()
    {
        return BinaryValue().Length;
    }

    /// <summary>Forwards to <see cref="BinaryValue"/>, which currently always throws.</summary>
    /// <exception cref="NotImplementedException">Always, via <see cref="BinaryValue"/>.</exception>
    public byte[] GetBinaryValue()
    {
        return BinaryValue();
    }

    /// <summary>Not implemented.</summary>
    /// <exception cref="NotImplementedException">Always.</exception>
    public byte[] GetBinaryValue(byte[] result)
    {
        throw new NotImplementedException();
    }

    #endregion
}
// this is just an example to show a different kind of token stream.. | |
/// <summary>
/// Example analyzer that ignores the field name and tokenizes every input with
/// <see cref="NumericTokenizer"/>.
/// </summary>
public class NumericAnalyzer : Analyzer
{
    /// <summary>Wraps <paramref name="reader"/> in a new <see cref="NumericTokenizer"/>.</summary>
    /// <param name="fieldName">Name of the field being analyzed (not used).</param>
    /// <param name="reader">Text to tokenize.</param>
    /// <returns>The tokenizer reading from <paramref name="reader"/>.</returns>
    public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
    {
        TokenStream numbersOnly = new NumericTokenizer(reader);
        return numbersOnly;
    }
}
/// <summary>
/// A <see cref="CharTokenizer"/> whose token characters are exactly those for which
/// <see cref="Char.IsNumber(char)"/> returns true.
/// </summary>
public class NumericTokenizer : CharTokenizer
{
    /// <summary>Creates a tokenizer reading from <paramref name="input"/>.</summary>
    /// <param name="input">Text to tokenize; passed straight to the base class.</param>
    public NumericTokenizer(TextReader input) : base(input)
    {
    }

    /// <summary>Decides whether <paramref name="c"/> belongs inside a token.</summary>
    /// <param name="c">Character under test.</param>
    /// <returns>True when <paramref name="c"/> is a number character.</returns>
    protected override bool IsTokenChar(char c)
    {
        // everything that is not a number character is rejected
        bool isNumeric = Char.IsNumber(c);
        return isNumeric;
    }
}
/// <summary>
/// A <see cref="TokenStream"/> that plays back several child token streams one after
/// another, copying each child's attribute state into this stream as tokens are read.
/// </summary>
public class MultiTokenStream : TokenStream
{
    private readonly List<TokenStream> _tokenStreams;
    private IEnumerator<TokenStream> _tokenStreamEnumerator;
    private TokenStream _currentTokenStream;

    /// <summary>Creates a stream over a snapshot of <paramref name="tokenStreams"/>.</summary>
    /// <param name="tokenStreams">Child streams, consumed in enumeration order.</param>
    /// <exception cref="ArgumentNullException"><paramref name="tokenStreams"/> is null.</exception>
    public MultiTokenStream(IEnumerable<TokenStream> tokenStreams)
    {
        if (tokenStreams == null) throw new ArgumentNullException("tokenStreams");

        // Copy eagerly so a lazily-evaluated sequence is enumerated exactly once.
        _tokenStreams = new List<TokenStream>(tokenStreams);
    }

    /// <summary>Clears this stream's attributes and those of every child stream.</summary>
    public override void ClearAttributes()
    {
        base.ClearAttributes();
        foreach (TokenStream tokenStream in _tokenStreams)
        {
            if (tokenStream != null)
            {
                tokenStream.ClearAttributes();
            }
        }
    }

    /// <summary>
    /// Advances to the next token of the current child stream, moving on to the
    /// following child whenever the current one is exhausted.
    /// </summary>
    /// <returns>
    /// True when a token was read and its state copied into this stream; false once
    /// every child stream is exhausted.
    /// </returns>
    public override bool IncrementToken()
    {
        if (_tokenStreamEnumerator == null)
        {
            _tokenStreamEnumerator = _tokenStreams.GetEnumerator();
        }

        while (true)
        {
            if (_currentTokenStream == null)
            {
                if (!_tokenStreamEnumerator.MoveNext())
                {
                    return false;
                }

                // A null entry leaves _currentTokenStream null and is skipped on the next pass.
                _currentTokenStream = _tokenStreamEnumerator.Current;
                continue;
            }

            if (_currentTokenStream.IncrementToken())
            {
                // Only publish state that belongs to a real token. Previously the state of
                // an exhausted child was restored and reported as a token at every boundary.
                base.RestoreState(_currentTokenStream.CaptureState());
                return true;
            }

            // Current child is exhausted (or was empty): drop it and try the next one.
            _currentTokenStream = null;
        }
    }

    /// <summary>
    /// Rewinds the position within the list of child streams. The child streams
    /// themselves are not reset by this method.
    /// </summary>
    public override void Reset()
    {
        // A fresh enumerator is created lazily by the next IncrementToken() call.
        _tokenStreamEnumerator = null;
        _currentTokenStream = null;
    }
}
//// FROM: http://stackoverflow.com/questions/2925652/how-to-string-multiple-textreaders-together | |
//public static class Extensions | |
//{ | |
// public static TextReader Union(this TextReader first, TextReader second) | |
// { | |
// return new ChainedTextReader(first, second); | |
// } | |
// private class ChainedTextReader : TextReader | |
// { | |
// private TextReader first; | |
// private TextReader second; | |
// private bool readFirst = true; | |
// public ChainedTextReader(TextReader first, TextReader second) | |
// { | |
// this.first = first; | |
// this.second = second; | |
// } | |
// public override int Peek() | |
// { | |
// if (readFirst) | |
// { | |
// return first.Peek(); | |
// } | |
// else | |
// { | |
// return second.Peek(); | |
// } | |
// } | |
// public override int Read() | |
// { | |
// if (readFirst) | |
// { | |
// int value = first.Read(); | |
// if (value == -1) | |
// { | |
// readFirst = false; | |
// } | |
// else | |
// { | |
// return value; | |
// } | |
// } | |
// return second.Read(); | |
// } | |
// public override void Close() | |
// { | |
// first.Close(); | |
// second.Close(); | |
// } | |
// protected override void Dispose(bool disposing) | |
// { | |
// base.Dispose(disposing); | |
// if (disposing) | |
// { | |
// first.Dispose(); | |
// second.Dispose(); | |
// } | |
// } | |
// } | |
//} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment