Last active
December 14, 2016 20:10
-
-
Save kflu/60fb3117d6e68198b22c9d200832ddcc to your computer and use it in GitHub Desktop.
This profiling program shows that on a 64 bit machine, binary search on a int64 list is faster than binary searching on a in32 list.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Collections; | |
using System.Collections.Generic; | |
using System.Diagnostics; | |
using System.Linq; | |
using System.Text; | |
using System.Threading.Tasks; | |
using MathNet.Numerics.Statistics; | |
namespace HashVsBinarySearch | |
{ | |
/// <summary> | |
/// This profiling program shows that on a 64 bit machine, binary search on a int64 list is faster than binary searching on a in32 list. | |
/// In fact, the 64 bit binary search is so fast that it even slightly outperforms hashset lookup. Although not a common sense, as I performed | |
/// CPU sampling based profiling, HashSet.Contains() takes quite a portion of the samples, GetHashCode only takes a 0.2% of HashSet.Contains(). | |
/// So I guess it's the implementation of HashSet.Contains() makes it slow. | |
/// | |
/// But the point is - 64 bit binary search on 64 bit machine is really fast! (32bit binary search is slower than 32bit hashset lookup) | |
/// | |
/// intHashSet | |
/// Count: 2000000, nQueries: 10000000 | |
/// Elapsed: 00:00:01.5116004, Found rate: 0.0009278 | |
/// | |
/// longHashSet | |
/// Count: 2000000, nQueries: 10000000 | |
/// Elapsed: 00:00:02.0519113, Found rate: 0 | |
/// | |
/// intList | |
/// Count: 2000000, nQueries: 10000000 | |
/// Elapsed: 00:00:03.0172173, Found rate: 0.0009281 | |
/// | |
/// longList | |
/// Count: 2000000, nQueries: 10000000 | |
/// Elapsed: 00:00:01.1434541, Found rate: 0 | |
/// | |
/// </summary> | |
public class Program | |
{ | |
const int count = 2000000; | |
const int nQueries = 10000000; | |
const long magic = 0x1234123412341234L; | |
const int magicInt = 0x12341234; | |
static void Main(string[] args) | |
{ | |
//Main_Profile(args); | |
if (args[0].ToLower() == "list") Main_HashSetRAM<List<long>, long>(args.Skip(1).ToArray(), (seed, size) => GenLongSet(seed, size).OrderBy(x => x).ToList()); | |
else if (args[0].ToLower() == "hashset") Main_HashSetRAM<HashSet<long>, long>(args.Skip(1).ToArray(), (seed, size) => GenLongSet(seed, size)); | |
} | |
public static void Main_HashSetRAM<T, TElement>(string[] args, Func<Random, int, T> genSet) where T : IEnumerable<TElement> | |
{ | |
Random seed = new Random(); | |
int size = int.Parse(args[0]); | |
GC.Collect(); | |
var ram = GC.GetTotalMemory(true); | |
var set = genSet(seed, size); | |
GC.Collect(); | |
var ram2 = GC.GetTotalMemory(true); | |
Console.WriteLine($"{ram2 -ram}"); | |
Console.WriteLine($"set size: {set.Count()}"); | |
} | |
public static void Main_Profile(string[] args) | |
{ | |
Random seed = new Random(); | |
HashSet<int> intHashSet = GenIntSet(seed); | |
List<int> intList = intHashSet.OrderBy(x=>x).ToList(); | |
HashSet<long> longHashSet = GenLongSet(seed); | |
List<long> longList = longHashSet.OrderBy(x => x).ToList(); | |
Console.WriteLine(longList.First(x => x > 0)); | |
long prev = long.MinValue; | |
foreach (var item in longList) | |
{ | |
if (item < prev) throw new InvalidOperationException(); | |
prev = item; | |
} | |
//Func<long> genLong = () => RandLong(seed); | |
//Func<int> genInt = () => seed.Next(int.MinValue, int.MaxValue); | |
Func<long> genLong = () => seed.Next(); | |
Func<int> genInt = () => seed.Next(); | |
//Func<long> genLong = () => magic; // query a number that's not in the set | |
//Func<int> genInt = () => magicInt; | |
Console.WriteLine("intHashSet"); | |
Profile<HashSet<int>, int>(seed, intHashSet, nQueries, (set, x) => ((HashSet<int>)set).Contains(x), genInt); | |
Console.WriteLine("longHashSet"); | |
Profile<HashSet<long>, long>(seed, longHashSet, nQueries, (set, x) => ((HashSet<long>)set).Contains(x), genLong); | |
Console.WriteLine("intList"); | |
//Profile<List<int>, int>(seed, intList, nQueries, (list, x) => BinarySearch(list, x) >= 0, genInt); | |
Profile<List<int>, int>(seed, intList, nQueries, (list, x) => list.BinarySearch(x) >= 0, genInt); | |
Console.WriteLine("longList"); | |
//Profile<List<long>, long>(seed, longList, nQueries, (list, x) => BinarySearch(list,x) >= 0, genLong); | |
Profile<List<long>, long>(seed, longList, nQueries, (list, x) => list.BinarySearch(x) >= 0, genLong); | |
// make sure GC isn't kick in to impact performance | |
Console.WriteLine($"{intHashSet.Count} {intList.Count} {longHashSet.Count} {longList.Count}"); | |
} | |
public static void Profile<T, TElement>(Random seed, T set, int nQueries, Func<T, TElement, bool> query, Func<TElement> genSample) where T : IEnumerable<TElement> | |
{ | |
int count = set.Count(); | |
Console.WriteLine($"Count: {count}, nQueries: {nQueries}"); | |
var sw = new Stopwatch(); | |
sw.Restart(); | |
int found = 0; | |
for (int i = 0; i < nQueries; i++) | |
{ | |
var res = query(set, genSample()); | |
if (res) found++; | |
} | |
sw.Stop(); | |
Console.WriteLine($"Elapsed: {sw.Elapsed}, Found rate: {found / (double)nQueries}"); | |
} | |
public static HashSet<int> GenIntSet(Random seed) | |
{ | |
HashSet<int> set = new HashSet<int>(); | |
for (int i = 0; i < count; i++) | |
{ | |
while (true) | |
{ | |
int x = seed.Next(int.MinValue, int.MaxValue); | |
if (x == magicInt) continue; // don't add magic number | |
if (set.Add(x)) | |
{ | |
break; | |
} | |
} | |
} | |
Console.WriteLine("Set size: {0}", set.Count); | |
return set; | |
} | |
public static HashSet<long> GenLongSet(Random seed, int count = -1) | |
{ | |
if (count < 0) count = Program.count; | |
HashSet<long> set = new HashSet<long>(); | |
for (int i = 0; i < count; i++) | |
{ | |
while (true) | |
{ | |
long x = RandLong(seed); | |
if (x == magic) continue; // don't add magic number | |
if (set.Add(x)) | |
{ | |
break; | |
} | |
} | |
} | |
//Console.WriteLine("Set size: {0}", set.Count); | |
return set; | |
} | |
public static long RandLong(Random seed) | |
{ | |
//return seed.Next(); | |
//int x = seed.Next(); | |
//return ((long)((x << 1) ^ x) << 32) | (uint)x; | |
//int x = seed.Next(); | |
//return ((long)(x * 6367) << 32) | (uint)x; | |
byte[] buf = new byte[8]; | |
seed.NextBytes(buf); | |
return BitConverter.ToInt64(buf, 0); | |
} | |
private static int BinarySearch<T>(List<T> array, T value) where T : IComparable<T> | |
{ | |
int index = 0; | |
int length = array.Count; | |
int lo = index; | |
int hi = index + length - 1; | |
while (lo <= hi) | |
{ | |
int i = lo + ((hi - lo) >> 1); | |
int order; | |
if (array[i] == null) | |
{ | |
order = (value == null) ? 0 : -1; | |
} | |
else | |
{ | |
order = array[i].CompareTo(value); | |
} | |
if (order == 0) | |
{ | |
return i; | |
} | |
if (order < 0) | |
{ | |
lo = i + 1; | |
} | |
else | |
{ | |
hi = i - 1; | |
} | |
} | |
return ~lo; | |
} | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#load "./packages/FsLab/FsLab.fsx" | |
open FSharp.Charting | |
open MathNet.Numerics.Statistics | |
open MathNet.Numerics.Statistics.Statistics | |
let hashfile = @"C:\Users\Desktop\HashValues.txt" | |
open System.IO | |
let readLines (transform) (file:string) = | |
let rec reads (stream: StreamReader) (transform) = | |
seq { | |
match stream.ReadLine() with | |
| null -> () | |
| line -> | |
yield transform line | |
yield! reads stream transform | |
} | |
use stream = new StreamReader(new FileStream(file, FileMode.Open)) | |
reads stream transform |> Array.ofSeq | |
let data = readLines int64 hashfile | |
let hashes = data |> Array.map (fun x -> x.GetHashCode()) | |
(** This is quite uniformly distributed *) | |
let hist = Histogram(hashes |> Array.map float, 200) | |
let chart_hist = | |
[0 .. hist.BucketCount-1] | |
|> List.map (fun i -> i, hist.[i].Count) | |
|> Chart.Column | |
printfn "hashes in range: [%f, %f]. Int.min=%d Int.max=%d" hist.LowerBound hist.UpperBound System.Int32.MinValue System.Int32.MaxValue |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment