Skip to content

Instantly share code, notes, and snippets.

@kflu
Last active December 14, 2016 20:10
Show Gist options
  • Save kflu/60fb3117d6e68198b22c9d200832ddcc to your computer and use it in GitHub Desktop.
Save kflu/60fb3117d6e68198b22c9d200832ddcc to your computer and use it in GitHub Desktop.
This profiling program shows that on a 64 bit machine, binary search on a int64 list is faster than binary searching on a in32 list.
using System;
using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using MathNet.Numerics.Statistics;
namespace HashVsBinarySearch
{
/// <summary>
/// This profiling program shows that on a 64 bit machine, binary search on a int64 list is faster than binary searching on a in32 list.
/// In fact, the 64 bit binary search is so fast that it even slightly outperforms hashset lookup. Although not a common sense, as I performed
/// CPU sampling based profiling, HashSet.Contains() takes quite a portion of the samples, GetHashCode only takes a 0.2% of HashSet.Contains().
/// So I guess it's the implementation of HashSet.Contains() makes it slow.
///
/// But the point is - 64 bit binary search on 64 bit machine is really fast! (32bit binary search is slower than 32bit hashset lookup)
///
/// intHashSet
/// Count: 2000000, nQueries: 10000000
/// Elapsed: 00:00:01.5116004, Found rate: 0.0009278
///
/// longHashSet
/// Count: 2000000, nQueries: 10000000
/// Elapsed: 00:00:02.0519113, Found rate: 0
///
/// intList
/// Count: 2000000, nQueries: 10000000
/// Elapsed: 00:00:03.0172173, Found rate: 0.0009281
///
/// longList
/// Count: 2000000, nQueries: 10000000
/// Elapsed: 00:00:01.1434541, Found rate: 0
///
/// </summary>
public class Program
{
const int count = 2000000;
const int nQueries = 10000000;
const long magic = 0x1234123412341234L;
const int magicInt = 0x12341234;
static void Main(string[] args)
{
//Main_Profile(args);
if (args[0].ToLower() == "list") Main_HashSetRAM<List<long>, long>(args.Skip(1).ToArray(), (seed, size) => GenLongSet(seed, size).OrderBy(x => x).ToList());
else if (args[0].ToLower() == "hashset") Main_HashSetRAM<HashSet<long>, long>(args.Skip(1).ToArray(), (seed, size) => GenLongSet(seed, size));
}
public static void Main_HashSetRAM<T, TElement>(string[] args, Func<Random, int, T> genSet) where T : IEnumerable<TElement>
{
Random seed = new Random();
int size = int.Parse(args[0]);
GC.Collect();
var ram = GC.GetTotalMemory(true);
var set = genSet(seed, size);
GC.Collect();
var ram2 = GC.GetTotalMemory(true);
Console.WriteLine($"{ram2 -ram}");
Console.WriteLine($"set size: {set.Count()}");
}
public static void Main_Profile(string[] args)
{
Random seed = new Random();
HashSet<int> intHashSet = GenIntSet(seed);
List<int> intList = intHashSet.OrderBy(x=>x).ToList();
HashSet<long> longHashSet = GenLongSet(seed);
List<long> longList = longHashSet.OrderBy(x => x).ToList();
Console.WriteLine(longList.First(x => x > 0));
long prev = long.MinValue;
foreach (var item in longList)
{
if (item < prev) throw new InvalidOperationException();
prev = item;
}
//Func<long> genLong = () => RandLong(seed);
//Func<int> genInt = () => seed.Next(int.MinValue, int.MaxValue);
Func<long> genLong = () => seed.Next();
Func<int> genInt = () => seed.Next();
//Func<long> genLong = () => magic; // query a number that's not in the set
//Func<int> genInt = () => magicInt;
Console.WriteLine("intHashSet");
Profile<HashSet<int>, int>(seed, intHashSet, nQueries, (set, x) => ((HashSet<int>)set).Contains(x), genInt);
Console.WriteLine("longHashSet");
Profile<HashSet<long>, long>(seed, longHashSet, nQueries, (set, x) => ((HashSet<long>)set).Contains(x), genLong);
Console.WriteLine("intList");
//Profile<List<int>, int>(seed, intList, nQueries, (list, x) => BinarySearch(list, x) >= 0, genInt);
Profile<List<int>, int>(seed, intList, nQueries, (list, x) => list.BinarySearch(x) >= 0, genInt);
Console.WriteLine("longList");
//Profile<List<long>, long>(seed, longList, nQueries, (list, x) => BinarySearch(list,x) >= 0, genLong);
Profile<List<long>, long>(seed, longList, nQueries, (list, x) => list.BinarySearch(x) >= 0, genLong);
// make sure GC isn't kick in to impact performance
Console.WriteLine($"{intHashSet.Count} {intList.Count} {longHashSet.Count} {longList.Count}");
}
public static void Profile<T, TElement>(Random seed, T set, int nQueries, Func<T, TElement, bool> query, Func<TElement> genSample) where T : IEnumerable<TElement>
{
int count = set.Count();
Console.WriteLine($"Count: {count}, nQueries: {nQueries}");
var sw = new Stopwatch();
sw.Restart();
int found = 0;
for (int i = 0; i < nQueries; i++)
{
var res = query(set, genSample());
if (res) found++;
}
sw.Stop();
Console.WriteLine($"Elapsed: {sw.Elapsed}, Found rate: {found / (double)nQueries}");
}
public static HashSet<int> GenIntSet(Random seed)
{
HashSet<int> set = new HashSet<int>();
for (int i = 0; i < count; i++)
{
while (true)
{
int x = seed.Next(int.MinValue, int.MaxValue);
if (x == magicInt) continue; // don't add magic number
if (set.Add(x))
{
break;
}
}
}
Console.WriteLine("Set size: {0}", set.Count);
return set;
}
public static HashSet<long> GenLongSet(Random seed, int count = -1)
{
if (count < 0) count = Program.count;
HashSet<long> set = new HashSet<long>();
for (int i = 0; i < count; i++)
{
while (true)
{
long x = RandLong(seed);
if (x == magic) continue; // don't add magic number
if (set.Add(x))
{
break;
}
}
}
//Console.WriteLine("Set size: {0}", set.Count);
return set;
}
public static long RandLong(Random seed)
{
//return seed.Next();
//int x = seed.Next();
//return ((long)((x << 1) ^ x) << 32) | (uint)x;
//int x = seed.Next();
//return ((long)(x * 6367) << 32) | (uint)x;
byte[] buf = new byte[8];
seed.NextBytes(buf);
return BitConverter.ToInt64(buf, 0);
}
private static int BinarySearch<T>(List<T> array, T value) where T : IComparable<T>
{
int index = 0;
int length = array.Count;
int lo = index;
int hi = index + length - 1;
while (lo <= hi)
{
int i = lo + ((hi - lo) >> 1);
int order;
if (array[i] == null)
{
order = (value == null) ? 0 : -1;
}
else
{
order = array[i].CompareTo(value);
}
if (order == 0)
{
return i;
}
if (order < 0)
{
lo = i + 1;
}
else
{
hi = i - 1;
}
}
return ~lo;
}
}
}
#load "./packages/FsLab/FsLab.fsx"
open FSharp.Charting
open MathNet.Numerics.Statistics
open MathNet.Numerics.Statistics.Statistics
let hashfile = @"C:\Users\Desktop\HashValues.txt"
open System.IO
let readLines (transform) (file:string) =
let rec reads (stream: StreamReader) (transform) =
seq {
match stream.ReadLine() with
| null -> ()
| line ->
yield transform line
yield! reads stream transform
}
use stream = new StreamReader(new FileStream(file, FileMode.Open))
reads stream transform |> Array.ofSeq
let data = readLines int64 hashfile
let hashes = data |> Array.map (fun x -> x.GetHashCode())
(** This is quite uniformly distributed *)
let hist = Histogram(hashes |> Array.map float, 200)
let chart_hist =
[0 .. hist.BucketCount-1]
|> List.map (fun i -> i, hist.[i].Count)
|> Chart.Column
printfn "hashes in range: [%f, %f]. Int.min=%d Int.max=%d" hist.LowerBound hist.UpperBound System.Int32.MinValue System.Int32.MaxValue
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment