Created
September 22, 2009 07:55
-
-
Save jcbozonier/190915 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| using System; | |
| using System.Collections.Generic; | |
| using System.Linq; | |
| namespace FullTextSearch | |
| { | |
| public class TextSearch | |
| { | |
| private readonly AggregatedTextRepository _TextRepository; | |
| private readonly TextVectorBuilder _VectorBuilder; | |
| private readonly FullTextSearchEngine _FullTextSearchEngine; | |
| public TextSearch(IEnumerable<string> corpus) | |
| { | |
| var counter = 0; | |
| var aggregator = new TextAggregator(); | |
| foreach(var line in corpus) | |
| { | |
| var lineOfText = new LineOfText(counter++, line); | |
| aggregator.Add(lineOfText); | |
| } | |
| _TextRepository = new AggregatedTextRepository(aggregator); | |
| _VectorBuilder = new TextVectorBuilder(_TextRepository); | |
| _FullTextSearchEngine = new FullTextSearchEngine(_TextRepository, _VectorBuilder); | |
| } | |
| public TextSearchResults Search(string text) | |
| { | |
| var textVector = _VectorBuilder.BuildFor(new LineOfText(-1, text)); | |
| var closestVector = _FindClosestVectorTo(textVector); | |
| return new TextSearchResults | |
| { | |
| BestMatchLineNumber = closestVector.LineNumber, | |
| BestMatchText = closestVector.Text | |
| }; | |
| } | |
| private TextVector _FindClosestVectorTo(TextVector vector) | |
| { | |
| var closestVector = _FullTextSearchEngine.FindClosestVectorTo(vector); | |
| return closestVector; | |
| } | |
| } | |
| internal class FullTextSearchEngine | |
| { | |
| private readonly AggregatedTextRepository AggregatedTextRepository; | |
| private readonly TextVectorBuilder TextVectorBuilder; | |
| private readonly List<TextVector> TextVectors; | |
| public FullTextSearchEngine(AggregatedTextRepository aggregatedTextRepository, TextVectorBuilder textVectorBuilder) | |
| { | |
| AggregatedTextRepository = aggregatedTextRepository; | |
| TextVectorBuilder = textVectorBuilder; | |
| TextVectors = new List<TextVector>(); | |
| foreach (var lineOfText in AggregatedTextRepository.GetAll()) | |
| { | |
| TextVectors.Add(TextVectorBuilder.BuildFor(lineOfText)); | |
| } | |
| } | |
| public TextVector FindClosestVectorTo(TextVector vector) | |
| { | |
| var minimum = -1.0; | |
| var chosenIndex = -1; | |
| for (var i = 0; i < TextVectors.Count; i++ ) | |
| { | |
| var testVector = TextVectors[i]; | |
| var currentValue = vector.DistanceTo(testVector); | |
| if (minimum == -1.0 || minimum > currentValue) | |
| { | |
| chosenIndex = i; | |
| minimum = currentValue; | |
| } | |
| } | |
| return chosenIndex >= 0 ? TextVectors[chosenIndex] : null; | |
| } | |
| } | |
| public class TextSearchResults | |
| { | |
| public int BestMatchLineNumber; | |
| public string BestMatchText; | |
| } | |
| internal class TextAggregator | |
| { | |
| private readonly Dictionary<string, int> WordToVectorIndexMap; | |
| private readonly List<LineOfText> LinesOfText; | |
| public TextAggregator() | |
| { | |
| WordToVectorIndexMap = new Dictionary<string, int>(); | |
| LinesOfText = new List<LineOfText>(); | |
| } | |
| public int Count | |
| { | |
| get { return WordToVectorIndexMap.Count; } | |
| } | |
| public void Add(LineOfText line) | |
| { | |
| // Remove punctuation | |
| // Break line into words | |
| // if word doesn't exist in dictionary | |
| // add it with the key=word, value=dictionaryCount | |
| if (LinesOfText.Exists(x => x.LineNumber == line.LineNumber)) | |
| return; | |
| LinesOfText.Add(line); | |
| foreach(var word in line.Words) | |
| if(!WordToVectorIndexMap.ContainsKey(word)) | |
| WordToVectorIndexMap.Add(word, WordToVectorIndexMap.Count); | |
| } | |
| public IEnumerable<LineOfText> GetAll() | |
| { | |
| return LinesOfText; | |
| } | |
| public int GetVectorIndex(string word) | |
| { | |
| if(!WordToVectorIndexMap.ContainsKey(word)) | |
| throw new InvalidOperationException("Every word should be contained in this!"); | |
| return WordToVectorIndexMap[word]; | |
| } | |
| } | |
| internal class AggregatedTextRepository | |
| { | |
| private readonly TextAggregator _Aggregator; | |
| public AggregatedTextRepository(TextAggregator aggregator) | |
| { | |
| _Aggregator = aggregator; | |
| } | |
| public int Count { get { return _Aggregator.Count; } } | |
| public int GetVectorIndex(string word) | |
| { | |
| return _Aggregator.GetVectorIndex(word); | |
| } | |
| public IEnumerable<LineOfText> GetAll() | |
| { | |
| return _Aggregator.GetAll(); | |
| } | |
| } | |
| internal class TextVectorBuilder | |
| { | |
| private readonly AggregatedTextRepository _TextRepository; | |
| public TextVectorBuilder(AggregatedTextRepository textAggregator) | |
| { | |
| _TextRepository = textAggregator; | |
| } | |
| public TextVector BuildFor(LineOfText lineOfText) | |
| { | |
| // This allows us to lazily create the vectors to | |
| // use in our computation so we don't need to allot | |
| // an array of 30,000^2 indices for ints | |
| return new TextVector(lineOfText, _CreateTextVector); | |
| } | |
| private int[] _CreateTextVector(LineOfText x) | |
| { | |
| var vector = new int[_TextRepository.Count]; | |
| foreach (var word in x.Words) | |
| { | |
| var vectorIndex = _TextRepository.GetVectorIndex(word); | |
| vector[vectorIndex]++; | |
| } | |
| return vector; | |
| } | |
| } | |
| public class LineOfText | |
| { | |
| public readonly string[] Words; | |
| public readonly string Text; | |
| public readonly int LineNumber; | |
| public LineOfText(int lineNumber, string text) | |
| { | |
| if(text == null) | |
| throw new ArgumentNullException("text"); | |
| var words = text.GetWords(); | |
| Words = words; | |
| Text = text; | |
| LineNumber = lineNumber; | |
| } | |
| } | |
| internal class TextVector | |
| { | |
| private readonly LineOfText Line; | |
| private readonly Func<LineOfText, int[]> SearchVector; | |
| public string Text { get { return Line.Text; } } | |
| public int LineNumber { get { return Line.LineNumber; } } | |
| public TextVector(LineOfText line, Func<LineOfText, int[]> vectorFactory) | |
| { | |
| Line = line; | |
| SearchVector = vectorFactory; | |
| } | |
| public double DistanceTo(TextVector testVector) | |
| { | |
| var searchVector = SearchVector(Line); | |
| var testSearchVector = testVector.SearchVector(testVector.Line); | |
| if (testSearchVector.Length != searchVector.Length) | |
| throw new InvalidOperationException("The two vectors have different dimensions. This should be impossible!"); | |
| var dimensions = searchVector.Length; | |
| var intermediateSum = 0.0; | |
| for(var i=0; i<dimensions; i++) | |
| { | |
| intermediateSum += Math.Pow(searchVector[i] + testSearchVector[i], 2.0); | |
| } | |
| var distance = Math.Sqrt(intermediateSum); | |
| return distance; | |
| } | |
| } | |
| public static class Helpers | |
| { | |
| public static string[] GetWords(this string text) | |
| { | |
| var filteredLine = String.Join("", text | |
| .Where(x => !char.IsPunctuation(x)).Select(x => x.ToString()) | |
| .ToArray()); | |
| return filteredLine.Split(' '); | |
| } | |
| } | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment