Created
April 25, 2011 22:23
-
-
Save bcg/941379 to your computer and use it in GitHub Desktop.
Set Intersection Comparisons
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <glib.h> | |
int | |
compare_nodes(gconstpointer a, gconstpointer b) { | |
if (a < b) | |
return -1; | |
else if (a > b) | |
return 1; | |
return 0; | |
} | |
gint matches = 0; | |
gboolean | |
foreach_node(gpointer key, gpointer value, gpointer user_data) { | |
GTree *tree= user_data; | |
if (g_tree_lookup(tree, key)) { | |
matches++; | |
} | |
return FALSE; | |
} | |
int | |
main(int argc, char * argv) { | |
GTimer *t = g_timer_new(); | |
gulong ms; | |
GTree *one = g_tree_new(compare_nodes); | |
GTree *two = g_tree_new(compare_nodes); | |
gint i=0; | |
g_timer_start(t); | |
for(i=0; i <= 999999; i++) { | |
g_tree_insert(one, GINT_TO_POINTER(i), GINT_TO_POINTER(i)); | |
} | |
g_timer_stop(t); | |
printf("%d inserted. Took %.0f seconds.\n", g_tree_nnodes(one), g_timer_elapsed(t, &ms)); | |
for(i=0; i <= 999999; i++) { | |
g_tree_insert(two, GINT_TO_POINTER(i), GINT_TO_POINTER(i)); | |
} | |
g_timer_start(t); | |
g_tree_foreach(one, (GTraverseFunc)foreach_node, two); | |
g_timer_stop(t); | |
printf("%d matches. Took %.0f seconds.\n", matches, g_timer_elapsed(t, &ms)); | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <glib.h> | |
gint matches = 0; | |
void | |
foreach_node(gpointer key, gpointer value, gpointer user_data) { | |
GHashTable *ht = user_data; | |
if (g_hash_table_lookup(ht, key)) { | |
matches++; | |
} | |
return; | |
} | |
int | |
main(int argc, char * argv) { | |
GTimer *t = g_timer_new(); | |
gulong ms; | |
GHashTable *one = g_hash_table_new(NULL, NULL); | |
GHashTable *two = g_hash_table_new(NULL, NULL); | |
gint i=0; | |
g_timer_start(t); | |
for(i=0; i <= 9999999; i++) { | |
g_hash_table_insert(one, GINT_TO_POINTER(i), GINT_TO_POINTER(i)); | |
} | |
g_timer_stop(t); | |
printf("%d inserted. Took %.0f seconds.\n", g_hash_table_size(one), g_timer_elapsed(t, &ms)); | |
for(i=0; i <= 9999999; i++) { | |
g_hash_table_insert(two, GINT_TO_POINTER(i), GINT_TO_POINTER(i)); | |
} | |
g_timer_start(t); | |
g_hash_table_foreach(one, (GHFunc)foreach_node, two); | |
g_timer_stop(t); | |
printf("%d matches. Took %.0f seconds.\n", matches, g_timer_elapsed(t, &ms)); | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var set = require ('./set').set // See below | |
var one = set(1), | |
two = set(1); | |
var t1 = new Date(); | |
for (i=0;i<=9999999;i=i+1) { | |
one.add(i); | |
} | |
console.log(one.size() + ' inserted in ' + Math.floor((new Date() - t1) / 1000)); | |
for (i=0;i<=9999999;i=i+1) { | |
two.add(i); | |
} | |
var matches = one.intersectionCount(two); | |
console.log(matches + ' matched in ' + Math.floor((new Date() - t1) / 1000)); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import scala.collection.mutable.Set | |
object Indexr extends Application { | |
var one = Set(1) | |
var two = Set(1) | |
var t = System.currentTimeMillis | |
for (i <- 0.until(9999999)) { | |
one += i | |
} | |
Console.println("Loaded set in " + ((System.currentTimeMillis-t)/ 1000.0)) | |
t = System.currentTimeMillis | |
two = one.clone | |
Console.println("Loaded set in " + ((System.currentTimeMillis-t)/ 1000.0)) | |
t = System.currentTimeMillis | |
val intersects = one intersect two | |
Console.println(intersects.size + " intersects in " + ((System.currentTimeMillis-t)/ 1000.0)); | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# SCALA | |
[info] Loaded set in 112.9 | |
[info] Loaded set in 7.873 | |
[info] 9999999 intersects in 9.156 | |
# Node.js | |
10000000 inserted in 16 | |
10000000 matched in 8 | |
# C / Glib / Hash | |
10000000 inserted. Took 1 seconds. | |
9999999 matches. Took 1 seconds. | |
# C / Glib / Tree | |
10000000 inserted. Took 4 seconds. | |
9999999 matches. Took 39 seconds. | |
# SCALA + TROVE w/o foreach | |
[info] Loaded set in 0.711 | |
[info] Loaded set in 0.727 | |
[info] 10000001 intersects in 0.403 | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import gnu.trove.set.TIntSet | |
import gnu.trove.set.hash.TIntHashSet | |
object Indexr extends Application { | |
val n = 10000000 | |
val one = new TIntHashSet(n) | |
val two = new TIntHashSet(n) | |
var i = 0 | |
var t = System.currentTimeMillis | |
i = 0 | |
while(i <= n) { | |
one.add(i) | |
i += 1 | |
} | |
Console.println("Loaded set in " + ((System.currentTimeMillis-t)/ 1000.0)) | |
t = System.currentTimeMillis | |
i = 0 | |
while(i <= n) { | |
two.add(i) | |
i += 1 | |
} | |
Console.println("Loaded set in " + ((System.currentTimeMillis-t)/ 1000.0)) | |
t = System.currentTimeMillis | |
two.retainAll(one) | |
Console.println(two.size() + " intersects in " + ((System.currentTimeMillis-t)/ 1000.0)); | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
exports.set = function () { | |
var members = {} | |
, eachArgument = function (args, fn) { Array.prototype.slice.apply(args).forEach (fn) } | |
, set = function (item) { return members.hasOwnProperty (item) } | |
set.has = set | |
set.union = function (other) { | |
var union = exports.set.apply ({}, set.members) | |
union.add.apply (set, other.members) | |
return union } | |
set.intersection = function (other) { | |
var intersection = exports.set () | |
set.members.forEach (function (elt) { | |
if (other (elt)) intersection.add (elt) }) | |
return intersection } | |
set.intersectionCount = function (other) { | |
var intersection = 0; | |
set.members.forEach (function (elt) { | |
if (other (elt)) intersection++; }) | |
return intersection } | |
set.add = function () { | |
eachArgument (arguments, function (arg) { members [arg] = true }) } | |
set.add.apply (set, arguments) | |
set.del = function (item) { | |
eachArgument (arguments, function (arg) { delete members [arg] }) } | |
set.__defineGetter__ ('members', function () { | |
var m = [] | |
for (k in members) if (members.hasOwnProperty (k)) m.push (k) | |
return m }) | |
delete set.size | |
set.size = function () {return set.members.length} | |
set.toString = function () { return "<Set:" + set.members.join (",") + ">" } | |
return set | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
If you have huge n = 10M element sets, and you query them much more often than you add to them, it's probably worth considering an alternate implementation, like Bloom filters. These will be massively faster, since set intersection on a Bloom filter is a bitwise AND.
The price you pay is that there is a small false-positive probability (the filter will very rarely respond that something is a member of the set when in fact it is not) -- you can always verify later with a more expensive test whether it's really a member, though. And if you expect negatives much more frequently than positives, a Bloom filter can be a great solution.
Specifically, Bloom filters have the great property the set-addition or testing-membership operations are O(k), totally independent of the number of items in the set. (k is the number of hash functions that you run each item through when adding or testing for membership.) So if your hash functions are efficient, and if you query much more often than you add elements, this can be a massive win, and it's "embarrassingly parallel", as we say in the biz.