Set Intersection Comparisons
A gist by @bcg, created April 25, 2011.
/* gtree.c: set intersection using GLib balanced binary trees (GTree) */
#include <stdio.h>
#include <glib.h>

/* Key comparator: keys are ints packed into pointers with GINT_TO_POINTER. */
static gint
compare_nodes(gconstpointer a, gconstpointer b)
{
    gint ia = GPOINTER_TO_INT(a);
    gint ib = GPOINTER_TO_INT(b);
    if (ia < ib)
        return -1;
    else if (ia > ib)
        return 1;
    return 0;
}

static gint matches = 0;

/* Called for every node in the first tree; counts keys also present in the
 * second tree (passed as user_data). Returning FALSE keeps the traversal going. */
static gboolean
foreach_node(gpointer key, gpointer value, gpointer user_data)
{
    GTree *tree = user_data;
    if (g_tree_lookup(tree, key))
        matches++;
    return FALSE;
}

int
main(int argc, char **argv)
{
    GTimer *t = g_timer_new();
    gulong ms;
    GTree *one = g_tree_new(compare_nodes);
    GTree *two = g_tree_new(compare_nodes);
    gint i;

    g_timer_start(t);
    for (i = 0; i <= 9999999; i++)
        g_tree_insert(one, GINT_TO_POINTER(i), GINT_TO_POINTER(i));
    g_timer_stop(t);
    printf("%d inserted. Took %.0f seconds.\n",
           g_tree_nnodes(one), g_timer_elapsed(t, &ms));

    for (i = 0; i <= 9999999; i++)
        g_tree_insert(two, GINT_TO_POINTER(i), GINT_TO_POINTER(i));

    g_timer_start(t);
    g_tree_foreach(one, foreach_node, two);
    g_timer_stop(t);
    /* Key 0 packs to a NULL pointer, so its stored value is NULL too and the
     * g_tree_lookup() test above skips it: 9999999 matches out of 10000000. */
    printf("%d matches. Took %.0f seconds.\n",
           matches, g_timer_elapsed(t, &ms));

    return 0;
}
/* ghash.c: set intersection using GLib hash tables (GHashTable) */
#include <stdio.h>
#include <glib.h>

static gint matches = 0;

/* Called for every entry in the first table; counts keys also present in the
 * second table (passed as user_data). */
static void
foreach_node(gpointer key, gpointer value, gpointer user_data)
{
    GHashTable *ht = user_data;
    if (g_hash_table_lookup(ht, key))
        matches++;
}

int
main(int argc, char **argv)
{
    GTimer *t = g_timer_new();
    gulong ms;
    /* NULL hash/equal funcs mean g_direct_hash/g_direct_equal: keys are
     * hashed and compared as raw pointer values. */
    GHashTable *one = g_hash_table_new(NULL, NULL);
    GHashTable *two = g_hash_table_new(NULL, NULL);
    gint i;

    g_timer_start(t);
    for (i = 0; i <= 9999999; i++)
        g_hash_table_insert(one, GINT_TO_POINTER(i), GINT_TO_POINTER(i));
    g_timer_stop(t);
    printf("%d inserted. Took %.0f seconds.\n",
           g_hash_table_size(one), g_timer_elapsed(t, &ms));

    for (i = 0; i <= 9999999; i++)
        g_hash_table_insert(two, GINT_TO_POINTER(i), GINT_TO_POINTER(i));

    g_timer_start(t);
    g_hash_table_foreach(one, foreach_node, two);
    g_timer_stop(t);
    /* Same key-0 caveat as the GTree version: key 0's value is NULL, so the
     * lookup never counts it. */
    printf("%d matches. Took %.0f seconds.\n",
           matches, g_timer_elapsed(t, &ms));

    return 0;
}
// bench.js: times the object-backed set implementation below
var set = require('./set').set; // see set.js below

var one = set(1),
    two = set(1);

var t1 = new Date();
for (var i = 0; i <= 9999999; i += 1) {
  one.add(i);
}
console.log(one.size() + ' inserted in ' + Math.floor((new Date() - t1) / 1000));

for (i = 0; i <= 9999999; i += 1) {
  two.add(i);
}

t1 = new Date(); // restart the clock so only the intersection is timed
var matches = one.intersectionCount(two);
console.log(matches + ' matched in ' + Math.floor((new Date() - t1) / 1000));
// Indexr.scala: set intersection using scala.collection.mutable.Set
import scala.collection.mutable.Set

object Indexr extends Application {
  var one = Set(1)
  var two = Set(1)

  var t = System.currentTimeMillis
  for (i <- 0 until 9999999) {
    one += i
  }
  Console.println("Loaded set in " + ((System.currentTimeMillis - t) / 1000.0))

  // The second "Loaded set" line times cloning the first set into the second.
  t = System.currentTimeMillis
  two = one.clone
  Console.println("Loaded set in " + ((System.currentTimeMillis - t) / 1000.0))

  t = System.currentTimeMillis
  val intersects = one intersect two
  Console.println(intersects.size + " intersects in " + ((System.currentTimeMillis - t) / 1000.0))
}
# SCALA
[info] Loaded set in 112.9
[info] Loaded set in 7.873
[info] 9999999 intersects in 9.156

# Node.js
10000000 inserted in 16
10000000 matched in 8

# C / Glib / Hash
10000000 inserted. Took 1 seconds.
9999999 matches. Took 1 seconds.

# C / Glib / Tree
10000000 inserted. Took 4 seconds.
9999999 matches. Took 39 seconds.

# SCALA + TROVE w/o foreach
[info] Loaded set in 0.711
[info] Loaded set in 0.727
[info] 10000001 intersects in 0.403
// Indexr.scala: set intersection using Trove's primitive-int hash sets
import gnu.trove.set.hash.TIntHashSet

object Indexr extends Application {
  val n = 10000000
  val one = new TIntHashSet(n) // pre-sized to avoid rehashing during the fill
  val two = new TIntHashSet(n)

  var t = System.currentTimeMillis
  var i = 0
  while (i <= n) {
    one.add(i)
    i += 1
  }
  Console.println("Loaded set in " + ((System.currentTimeMillis - t) / 1000.0))

  t = System.currentTimeMillis
  i = 0
  while (i <= n) {
    two.add(i)
    i += 1
  }
  Console.println("Loaded set in " + ((System.currentTimeMillis - t) / 1000.0))

  t = System.currentTimeMillis
  two.retainAll(one) // in-place intersection: mutates two
  Console.println(two.size() + " intersects in " + ((System.currentTimeMillis - t) / 1000.0))
}
// set.js: a minimal set backed by a plain object (keys coerce to strings)
exports.set = function () {
  var members = {}
    , eachArgument = function (args, fn) { Array.prototype.slice.apply(args).forEach(fn) }
    // the set itself is a function: set(item) tests membership
    , set = function (item) { return members.hasOwnProperty(item) }

  set.has = set

  set.union = function (other) {
    var union = exports.set.apply({}, set.members)
    union.add.apply(union, other.members)
    return union }

  set.intersection = function (other) {
    var intersection = exports.set()
    set.members.forEach(function (elt) {
      if (other(elt)) intersection.add(elt) })
    return intersection }

  // like intersection, but only counts the matches instead of building a set
  set.intersectionCount = function (other) {
    var count = 0
    set.members.forEach(function (elt) {
      if (other(elt)) count++ })
    return count }

  set.add = function () {
    eachArgument(arguments, function (arg) { members[arg] = true }) }
  set.add.apply(set, arguments) // seed the set with any constructor arguments

  set.del = function () {
    eachArgument(arguments, function (arg) { delete members[arg] }) }

  set.__defineGetter__('members', function () {
    var m = []
    for (var k in members) if (members.hasOwnProperty(k)) m.push(k)
    return m })

  set.size = function () { return set.members.length }
  set.toString = function () { return "<Set:" + set.members.join(",") + ">" }

  return set
}
@fj commented May 2, 2011:
If you have huge sets, n = 10M elements, and you query them much more often than you add to them, it's probably worth considering an alternate implementation, like Bloom filters. These will be massively faster, since set intersection on a Bloom filter is just a bitwise AND of the two bit arrays.

The price you pay is a small false-positive probability: the filter will very rarely claim that something is a member of the set when in fact it is not. You can always verify later with a more expensive exact test whether it's really a member, though. And if you expect negatives much more frequently than positives, a Bloom filter can be a great solution.

Specifically, Bloom filters have the great property that set-addition and membership-testing operations are O(k), totally independent of the number of items in the set (k is the number of hash functions that you run each item through when adding or testing for membership). So if your hash functions are efficient, and you query much more often than you add elements, this can be a massive win, and it's "embarrassingly parallel", as we say in the biz.
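
To make that concrete, here is a minimal sketch of the idea in C. Everything in it is an illustrative assumption rather than a reference design: the fixed m = 2^27 bit array, the choice of k = 2, and the two multiplicative hash functions.

/* bloom.c: a toy Bloom filter over unsigned 32-bit ints. NBITS, k = 2,
 * and the two hash functions are illustrative choices, not tuned ones. */
#include <stdio.h>
#include <stdint.h>

#define NBITS  (1u << 27)          /* 2^27 bits = 16 MB per filter */
#define NWORDS (NBITS / 64)

typedef struct { uint64_t bits[NWORDS]; } bloom;

/* k = 2 cheap multiplicative hashes, reduced mod the power-of-two size. */
static uint32_t h1(uint32_t x) { return (x * 2654435761u) & (NBITS - 1); }
static uint32_t h2(uint32_t x) { return (x * 40503u + 61u) & (NBITS - 1); }

static void bloom_add(bloom *b, uint32_t x) {
    uint32_t i = h1(x), j = h2(x);
    b->bits[i >> 6] |= 1ull << (i & 63);
    b->bits[j >> 6] |= 1ull << (j & 63);
}

/* May return 1 for an item never added (a false positive); never returns 0
 * for an item that was added. */
static int bloom_test(const bloom *b, uint32_t x) {
    uint32_t i = h1(x), j = h2(x);
    return ((b->bits[i >> 6] >> (i & 63)) & 1) &&
           ((b->bits[j >> 6] >> (j & 63)) & 1);
}

/* The intersection fj describes: a word-by-word bitwise AND, O(NBITS/64)
 * regardless of how many items are in either set. */
static void bloom_intersect(bloom *dst, const bloom *a, const bloom *b) {
    size_t w;
    for (w = 0; w < NWORDS; w++)
        dst->bits[w] = a->bits[w] & b->bits[w];
}

static bloom one, two, both;       /* static: 16 MB each, too big for the stack */

int main(void) {
    uint32_t x;
    for (x = 0; x < 10000000u; x++) bloom_add(&one, x);
    for (x = 5000000u; x < 15000000u; x++) bloom_add(&two, x);
    bloom_intersect(&both, &one, &two);
    printf("5000000 in both? %d\n", bloom_test(&both, 5000000u)); /* always 1 */
    printf("42 in both?      %d\n", bloom_test(&both, 42u));      /* 0, barring a false positive */
    return 0;
}

With these toy parameters the expected false-positive rate is roughly (1 - e^(-kn/m))^k, about 2% for n = 10M items; growing the bit array or tuning k trades memory for accuracy.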
