Skip to content

Instantly share code, notes, and snippets.

@jcbozonier
Created September 22, 2009 07:55
Show Gist options
  • Select an option

  • Save jcbozonier/190915 to your computer and use it in GitHub Desktop.

Select an option

Save jcbozonier/190915 to your computer and use it in GitHub Desktop.
using System;
using System.Collections.Generic;
using System.Linq;
namespace FullTextSearch
{
public class TextSearch
{
private readonly AggregatedTextRepository _TextRepository;
private readonly TextVectorBuilder _VectorBuilder;
private readonly FullTextSearchEngine _FullTextSearchEngine;
public TextSearch(IEnumerable<string> corpus)
{
var counter = 0;
var aggregator = new TextAggregator();
foreach(var line in corpus)
{
var lineOfText = new LineOfText(counter++, line);
aggregator.Add(lineOfText);
}
_TextRepository = new AggregatedTextRepository(aggregator);
_VectorBuilder = new TextVectorBuilder(_TextRepository);
_FullTextSearchEngine = new FullTextSearchEngine(_TextRepository, _VectorBuilder);
}
public TextSearchResults Search(string text)
{
var textVector = _VectorBuilder.BuildFor(new LineOfText(-1, text));
var closestVector = _FindClosestVectorTo(textVector);
return new TextSearchResults
{
BestMatchLineNumber = closestVector.LineNumber,
BestMatchText = closestVector.Text
};
}
private TextVector _FindClosestVectorTo(TextVector vector)
{
var closestVector = _FullTextSearchEngine.FindClosestVectorTo(vector);
return closestVector;
}
}
internal class FullTextSearchEngine
{
private readonly AggregatedTextRepository AggregatedTextRepository;
private readonly TextVectorBuilder TextVectorBuilder;
private readonly List<TextVector> TextVectors;
public FullTextSearchEngine(AggregatedTextRepository aggregatedTextRepository, TextVectorBuilder textVectorBuilder)
{
AggregatedTextRepository = aggregatedTextRepository;
TextVectorBuilder = textVectorBuilder;
TextVectors = new List<TextVector>();
foreach (var lineOfText in AggregatedTextRepository.GetAll())
{
TextVectors.Add(TextVectorBuilder.BuildFor(lineOfText));
}
}
public TextVector FindClosestVectorTo(TextVector vector)
{
var minimum = -1.0;
var chosenIndex = -1;
for (var i = 0; i < TextVectors.Count; i++ )
{
var testVector = TextVectors[i];
var currentValue = vector.DistanceTo(testVector);
if (minimum == -1.0 || minimum > currentValue)
{
chosenIndex = i;
minimum = currentValue;
}
}
return chosenIndex >= 0 ? TextVectors[chosenIndex] : null;
}
}
public class TextSearchResults
{
public int BestMatchLineNumber;
public string BestMatchText;
}
internal class TextAggregator
{
private readonly Dictionary<string, int> WordToVectorIndexMap;
private readonly List<LineOfText> LinesOfText;
public TextAggregator()
{
WordToVectorIndexMap = new Dictionary<string, int>();
LinesOfText = new List<LineOfText>();
}
public int Count
{
get { return WordToVectorIndexMap.Count; }
}
public void Add(LineOfText line)
{
// Remove punctuation
// Break line into words
// if word doesn't exist in dictionary
// add it with the key=word, value=dictionaryCount
if (LinesOfText.Exists(x => x.LineNumber == line.LineNumber))
return;
LinesOfText.Add(line);
foreach(var word in line.Words)
if(!WordToVectorIndexMap.ContainsKey(word))
WordToVectorIndexMap.Add(word, WordToVectorIndexMap.Count);
}
public IEnumerable<LineOfText> GetAll()
{
return LinesOfText;
}
public int GetVectorIndex(string word)
{
if(!WordToVectorIndexMap.ContainsKey(word))
throw new InvalidOperationException("Every word should be contained in this!");
return WordToVectorIndexMap[word];
}
}
internal class AggregatedTextRepository
{
private readonly TextAggregator _Aggregator;
public AggregatedTextRepository(TextAggregator aggregator)
{
_Aggregator = aggregator;
}
public int Count { get { return _Aggregator.Count; } }
public int GetVectorIndex(string word)
{
return _Aggregator.GetVectorIndex(word);
}
public IEnumerable<LineOfText> GetAll()
{
return _Aggregator.GetAll();
}
}
internal class TextVectorBuilder
{
private readonly AggregatedTextRepository _TextRepository;
public TextVectorBuilder(AggregatedTextRepository textAggregator)
{
_TextRepository = textAggregator;
}
public TextVector BuildFor(LineOfText lineOfText)
{
// This allows us to lazily create the vectors to
// use in our computation so we don't need to allot
// an array of 30,000^2 indices for ints
return new TextVector(lineOfText, _CreateTextVector);
}
private int[] _CreateTextVector(LineOfText x)
{
var vector = new int[_TextRepository.Count];
foreach (var word in x.Words)
{
var vectorIndex = _TextRepository.GetVectorIndex(word);
vector[vectorIndex]++;
}
return vector;
}
}
public class LineOfText
{
public readonly string[] Words;
public readonly string Text;
public readonly int LineNumber;
public LineOfText(int lineNumber, string text)
{
if(text == null)
throw new ArgumentNullException("text");
var words = text.GetWords();
Words = words;
Text = text;
LineNumber = lineNumber;
}
}
internal class TextVector
{
private readonly LineOfText Line;
private readonly Func<LineOfText, int[]> SearchVector;
public string Text { get { return Line.Text; } }
public int LineNumber { get { return Line.LineNumber; } }
public TextVector(LineOfText line, Func<LineOfText, int[]> vectorFactory)
{
Line = line;
SearchVector = vectorFactory;
}
public double DistanceTo(TextVector testVector)
{
var searchVector = SearchVector(Line);
var testSearchVector = testVector.SearchVector(testVector.Line);
if (testSearchVector.Length != searchVector.Length)
throw new InvalidOperationException("The two vectors have different dimensions. This should be impossible!");
var dimensions = searchVector.Length;
var intermediateSum = 0.0;
for(var i=0; i<dimensions; i++)
{
intermediateSum += Math.Pow(searchVector[i] + testSearchVector[i], 2.0);
}
var distance = Math.Sqrt(intermediateSum);
return distance;
}
}
public static class Helpers
{
public static string[] GetWords(this string text)
{
var filteredLine = String.Join("", text
.Where(x => !char.IsPunctuation(x)).Select(x => x.ToString())
.ToArray());
return filteredLine.Split(' ');
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment