Last active
January 1, 2016 15:35
-
-
Save madelson/6378650cbf3e0148d5ef to your computer and use it in GitHub Desktop.
.NET implementation of a Collection Equality function to be released in https://github.com/madelson/MedallionUtilities/tree/master/MedallionCollections
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Collections; | |
using System.Collections.Generic; | |
using System.Diagnostics; | |
using System.Linq; | |
using System.Text; | |
using System.Threading.Tasks; | |
namespace Medallion.Collections | |
{ | |
public static class CollectionHelper | |
{ | |
#region ---- CollectionEquals ---- | |
/// <summary> | |
/// Determines whether <paramref name="this"/> and <paramref name="that"/> are equal in the sense of having the exact same | |
/// elements. Unlike <see cref="Enumerable.SequenceEqual{TSource}(IEnumerable{TSource}, IEnumerable{TSource})"/>, | |
/// this method disregards order. Unlike <see cref="ISet{T}.SetEquals(IEnumerable{T})"/>, this method does not disregard duplicates. | |
/// An optional <paramref name="comparer"/> allows the equality semantics for the elements to be specified | |
/// </summary> | |
public static bool CollectionEquals<TElement>(this IEnumerable<TElement> @this, IEnumerable<TElement> that, IEqualityComparer<TElement> comparer = null) | |
{ | |
Throw.IfNull(@this, "this"); | |
Throw.IfNull(that, "that"); | |
// FastCount optimization: If both of the collections are materialized and have counts, | |
// we can exit very quickly if those counts differ | |
int thisCount, thatCount; | |
var hasThisCount = TryFastCount(@this, out thisCount); | |
bool hasThatCount; | |
if (hasThisCount) | |
{ | |
hasThatCount = TryFastCount(that, out thatCount); | |
if (hasThatCount) | |
{ | |
if (thisCount != thatCount) | |
{ | |
return false; | |
} | |
if (thisCount == 0) | |
{ | |
return true; | |
} | |
} | |
} | |
else | |
{ | |
hasThatCount = false; | |
} | |
var cmp = comparer ?? EqualityComparer<TElement>.Default; | |
var itemsEnumerated = 0; | |
// SequenceEqual optimization: we reduce/avoid hashing | |
// the collections have common prefixes, at the cost of only one | |
// extra Equals() call in the case where the prefixes are not common | |
using (var thisEnumerator = @this.GetEnumerator()) | |
using (var thatEnumerator = that.GetEnumerator()) | |
{ | |
while (true) | |
{ | |
var thisFinished = !thisEnumerator.MoveNext(); | |
var thatFinished = !thatEnumerator.MoveNext(); | |
if (thisFinished) | |
{ | |
// either this shorter than that, or the two were sequence-equal | |
return thatFinished; | |
} | |
if (thatFinished) | |
{ | |
// that shorter than this | |
return false; | |
} | |
// keep track of this so that we can factor it into count-based | |
// logic below | |
++itemsEnumerated; | |
if (!cmp.Equals(thisEnumerator.Current, thatEnumerator.Current)) | |
{ | |
break; // prefixes were not equal | |
} | |
} | |
// now, build a dictionary of item => count out of one collection and then | |
// probe it with the other collection to look for mismatches | |
// Build/Probe Choice optimization: if we know the count of one collection, we should | |
// use the other collection to build the dictionary. That way we can bail immediately if | |
// we see too few or too many items | |
CountingSet<TElement> elementCounts; | |
IEnumerator<TElement> probeSide; | |
if (hasThisCount) | |
{ | |
// we know this's count => use that as the build side | |
probeSide = thisEnumerator; | |
var remaining = thisCount - itemsEnumerated; | |
if (hasThatCount) | |
{ | |
// if we have both counts, that means they must be equal or we would have already | |
// exited. However, in this case, we know exactly the capacity needed for the dictionary | |
// so we can avoid resizing | |
elementCounts = new CountingSet<TElement>(capacity: remaining, comparer: cmp); | |
do | |
{ | |
elementCounts.Increment(thatEnumerator.Current); | |
} | |
while (thatEnumerator.MoveNext()); | |
} | |
else | |
{ | |
elementCounts = TryBuildElementCountsWithKnownCount(thatEnumerator, remaining, cmp); | |
} | |
} | |
else if (TryFastCount(that, out thatCount)) | |
{ | |
// we know that's count => use this as the build side | |
probeSide = thatEnumerator; | |
var remaining = thatCount - itemsEnumerated; | |
elementCounts = TryBuildElementCountsWithKnownCount(thisEnumerator, remaining, cmp); | |
} | |
else | |
{ | |
// when we don't know either count, just use that as the build side arbitrarily | |
probeSide = thisEnumerator; | |
elementCounts = new CountingSet<TElement>(cmp); | |
do | |
{ | |
elementCounts.Increment(thatEnumerator.Current); | |
} | |
while (thatEnumerator.MoveNext()); | |
} | |
// check whether we failed to construct a dictionary. This happens when we know | |
// one of the counts and we detect, during construction, that the counts are unequal | |
if (elementCounts == null) | |
{ | |
return false; | |
} | |
// probe the dictionary with the probe side enumerator | |
do | |
{ | |
if (!elementCounts.TryDecrement(probeSide.Current)) | |
{ | |
// element in probe not in build => not equal | |
return false; | |
} | |
} | |
while (probeSide.MoveNext()); | |
// we are equal only if the loop above completely cleared out the dictionary | |
return elementCounts.IsEmpty; | |
} | |
} | |
/// <summary> | |
/// Constructs a count dictionary, staying mindful of the known number of elements | |
/// so that we bail early (returning null) if we detect a count mismatch | |
/// </summary> | |
private static CountingSet<TKey> TryBuildElementCountsWithKnownCount<TKey>( | |
IEnumerator<TKey> elements, | |
int remaining, | |
IEqualityComparer<TKey> comparer) | |
{ | |
if (remaining == 0) | |
{ | |
// don't build the dictionary at all if nothing should be in it | |
return null; | |
} | |
const int MaxInitialElementCountsCapacity = 1024; | |
var elementCounts = new CountingSet<TKey>(capacity: Math.Min(remaining, MaxInitialElementCountsCapacity), comparer: comparer); | |
elementCounts.Increment(elements.Current); | |
while (elements.MoveNext()) | |
{ | |
if (--remaining < 0) | |
{ | |
// too many elements | |
return null; | |
} | |
elementCounts.Increment(elements.Current); | |
} | |
if (remaining > 0) | |
{ | |
// too few elements | |
return null; | |
} | |
return elementCounts; | |
} | |
/// <summary> | |
/// Key Lookup Reduction optimization: this custom datastructure halves the number of <see cref="IEqualityComparer{T}.GetHashCode(T)"/> | |
/// and <see cref="IEqualityComparer{T}.Equals(T, T)"/> operations by building in the increment/decrement operations of a counting dictionary. | |
/// This also solves <see cref="Dictionary{TKey, TValue}"/>'s issues with null keys | |
/// </summary> | |
private sealed class CountingSet<T> | |
{ | |
// picked based on observing unit test performance | |
private const double MaxLoad = .62; | |
private readonly IEqualityComparer<T> comparer; | |
private Bucket[] buckets; | |
private int populatedBucketCount; | |
/// <summary> | |
/// When we reach this count, we need to resize | |
/// </summary> | |
private int nextResizeCount; | |
public CountingSet(IEqualityComparer<T> comparer, int capacity = 0) | |
{ | |
this.comparer = comparer; | |
// we pick the initial length by assuming our current table is one short of the desired | |
// capacity and then using our standard logic of picking the next valid table size | |
this.buckets = new Bucket[GetNextTableSize((int)(capacity / MaxLoad) - 1)]; | |
this.nextResizeCount = this.CalculateNextResizeCount(); | |
} | |
public bool IsEmpty { get { return this.populatedBucketCount == 0; } } | |
public void Increment(T item) | |
{ | |
int bucketIndex; | |
uint hashCode; | |
if (this.TryFindBucket(item, out bucketIndex, out hashCode)) | |
{ | |
// if a bucket already existed, just update it's count | |
++this.buckets[bucketIndex].Count; | |
} | |
else | |
{ | |
// otherwise, claim a new bucket | |
this.buckets[bucketIndex].HashCode = hashCode; | |
this.buckets[bucketIndex].Value = item; | |
this.buckets[bucketIndex].Count = 1; | |
++this.populatedBucketCount; | |
// resize the table if we've grown too full | |
if (this.populatedBucketCount == this.nextResizeCount) | |
{ | |
var newBuckets = new Bucket[GetNextTableSize(this.buckets.Length)]; | |
// rehash | |
for (var i = 0; i < this.buckets.Length; ++i) | |
{ | |
var oldBucket = this.buckets[i]; | |
if (oldBucket.HashCode != 0) | |
{ | |
var newBucketIndex = oldBucket.HashCode % newBuckets.Length; | |
while (true) | |
{ | |
if (newBuckets[newBucketIndex].HashCode == 0) | |
{ | |
newBuckets[newBucketIndex] = oldBucket; | |
break; | |
} | |
newBucketIndex = (newBucketIndex + 1) % newBuckets.Length; | |
} | |
} | |
} | |
this.buckets = newBuckets; | |
this.nextResizeCount = this.CalculateNextResizeCount(); | |
} | |
} | |
} | |
public bool TryDecrement(T item) | |
{ | |
int bucketIndex; | |
uint ignored; | |
if (this.TryFindBucket(item, out bucketIndex, out ignored) | |
&& this.buckets[bucketIndex].Count > 0) | |
{ | |
if (--this.buckets[bucketIndex].Count == 0) | |
{ | |
// Note: we can't do this because it messes up our try-find logic | |
//// mark as unpopulated. Not strictly necessary because CollectionEquals always does all increments | |
//// before all decrements currently. However, this is very cheap to do and allowing the collection to | |
//// "just work" in any situation is a nice benefit | |
//// this.buckets[bucketIndex].HashCode = 0; | |
--this.populatedBucketCount; | |
} | |
return true; | |
} | |
return false; | |
} | |
private bool TryFindBucket(T item, out int index, out uint hashCode) | |
{ | |
// we convert the raw hash code to a uint to get correctly-signed mod operations | |
// and get rid of the zero value so that we can use 0 to mean "unoccupied" | |
var rawHashCode = this.comparer.GetHashCode(item); | |
hashCode = rawHashCode == 0 ? uint.MaxValue : unchecked((uint)rawHashCode); | |
var bestBucketIndex = (int)(hashCode % this.buckets.Length); | |
var bucketIndex = bestBucketIndex; | |
while (true) // guaranteed to terminate because of how we set load factor | |
{ | |
var bucket = this.buckets[bucketIndex]; | |
if (bucket.HashCode == 0) | |
{ | |
// found unoccupied bucket | |
index = bucketIndex; | |
return false; | |
} | |
if (bucket.HashCode == hashCode && this.comparer.Equals(bucket.Value, item)) | |
{ | |
// found matching bucket | |
index = bucketIndex; | |
return true; | |
} | |
// otherwise march on to the next adjacent bucket | |
bucketIndex = (bucketIndex + 1) % this.buckets.Length; | |
} | |
} | |
private int CalculateNextResizeCount() | |
{ | |
return (int)(MaxLoad * this.buckets.Length) + 1; | |
} | |
private static readonly int[] HashTableSizes = new[] | |
{ | |
// hash table primes from http://planetmath.org/goodhashtableprimes | |
23, 53, 97, 193, 389, 769, 1543, 3079, 6151, 12289, | |
24593, 49157, 98317, 196613, 393241, 786433, 1572869, | |
3145739, 6291469, 12582917, 25165843, 50331653, 100663319, | |
201326611, 402653189, 805306457, 1610612741, | |
// the first two values are (1) a prime roughly half way between the previous value and int.MaxValue | |
// and (2) the prime closest too, but not above, int.MaxValue. The maximum size is, of course, int.MaxValue | |
1879048201, 2147483629, int.MaxValue | |
}; | |
private static int GetNextTableSize(int currentSize) | |
{ | |
for (var i = 0; i < HashTableSizes.Length; ++i) | |
{ | |
var nextSize = HashTableSizes[i]; | |
if (nextSize > currentSize) { return nextSize; } | |
} | |
throw new InvalidOperationException("Hash table cannot expand further"); | |
} | |
[DebuggerDisplay("{Value}, {Count}, {HashCode}")] | |
private struct Bucket | |
{ | |
// note: 0 (default) means the bucket is unoccupied | |
internal uint HashCode; | |
internal T Value; | |
internal int Count; | |
} | |
} | |
#endregion | |
private static bool TryFastCount<T>(IEnumerable<T> @this, out int count) | |
{ | |
var collection = @this as ICollection<T>; | |
if (collection != null) | |
{ | |
count = collection.Count; | |
return true; | |
} | |
var readOnlyCollection = @this as IReadOnlyCollection<T>; | |
if (readOnlyCollection != null) | |
{ | |
count = readOnlyCollection.Count; | |
return true; | |
} | |
count = -1; | |
return false; | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment