johntyree · December 20, 2012 10:54
diff --git a/cyhash.pyx b/cyhash.pyx
 # cython: infer_types=True

 # Use the C math library to avoid Python overhead.
 from libc cimport math
 # For boundscheck below.
 import cython
 # We're lazy so we'll let Numpy handle our array memory management.
 import numpy as np
 # You would normally also import the Numpy pxd to get faster access to the Numpy
 # API, but it requires some fancier compilation options so I'll leave it out for
 # this demo.
 # cimport numpy as np

 import random

 # This is a small function that doesn't need to be exposed to Python at all. Use
 # `cdef` instead of `def` and inline it.
 cdef inline int h3(int a,int b,int c,int d, int m,int x):
    return (a*x**2 + b*x+c) % m

 # If we want to live fast and dangerously, we tell cython not to check our array
 # indices for IndexErrors. This means we CAN overrun our array and crash the
 # program or screw up our stack. Use with caution.
 # @cython.boundscheck(False)
 # `cpdef` so that calling this function from another Cython (or C) function can
 # skip the Python function call overhead, while still allowing us to use it from
 # Python.
 cpdef floyd(int[:] inputx):
    # Type the variables in the scope of the function.
    cdef int a,b,c,d, value, cyclelimit
    cdef unsigned int dupefound = 0
    cdef unsigned int nohashcalls = 0
    cdef unsigned int loopno, pos, j

    # `m` has type int because inputx is already a Cython memory view and
    # `infer-types` is on.
    m = inputx.shape[0]

    cdef unsigned int loops = int(m*math.log(m))

    # Again using the memory view, but letting Numpy allocate an array of zeros.
    cdef int[:] listofpos = np.zeros(m, dtype=np.int32)

    # Keep this random sampling out of the loop
    cdef int[:, :] randoms = np.random.randint(0, m, (loops, 5)).astype(np.int32)

    for loopno in xrange(loops):
        if (dupefound == 1):
            break

        # From our precomputed array
        a = randoms[loopno, 0]
        b = randoms[loopno, 1]
        c = randoms[loopno, 2]
        d = randoms[loopno, 3]
        pos = randoms[loopno, 4]

        value = inputx[pos]

        # Unforunately, Memory View does not support "vectorized" operations
        # like standard Numpy arrays. Otherwise we'd use listofpos *= 0 here.
        for j in xrange(m):
            listofpos[j] = 0

        listofpos[pos] = 1
        setofvalues = set((value,))
        cyclelimit = int(math.sqrt(m))
        for j in xrange(cyclelimit):
            pos = h3(a, b, c, d, m, inputx[pos])
            nohashcalls += 1
            if (inputx[pos] in setofvalues):
                if (listofpos[pos]==1):
                    dupefound = 0
                else:
                    dupefound = 1
                    print "Duplicate found at position", pos, " and value", inputx[pos]
                break
            listofpos[pos] = 1
            setofvalues.add(inputx[pos])
    return dupefound, nohashcalls
diff --git a/cyhash_orig.pyx b/cyhash_orig.pyx
 import math, random
 cdef int a,b,c,d,m,pos,value, cyclelimit, nohashcalls   
 def h3(int a,int b,int c,int d, int m,int x):
    return (a*x**2 + b*x+c) %m    
 def floyd(inputx):
    dupefound, nohashcalls = (0,0)
    m = len(inputx)
    loops = int(m*math.log(m))
    for loopno in xrange(loops):
        if (dupefound == 1):
            break
        a = random.randrange(m)
        b = random.randrange(m)
        c = random.randrange(m)
        d = random.randrange(m)
        pos = random.randrange(m)
        value = inputx[pos]
        listofpos = [0] * m
        listofpos[pos] = 1
        setofvalues = set([value])
        cyclelimit = int(math.sqrt(m))
        for j in xrange(cyclelimit):
            pos = h3(a,b, c,d, m, inputx[pos])
            nohashcalls += 1    
            if (inputx[pos] in setofvalues):
                if (listofpos[pos]==1):
                    dupefound = 0
                else:
                    dupefound = 1
                    print "Duplicate found at position", pos, " and value", inputx[pos]
                break
            listofpos[pos] = 1
            setofvalues.add(inputx[pos])
    return dupefound, nohashcalls 
diff --git a/run.py b/run.py
 #/usr/bin/env python

 import math, random
 import numpy as np

 import pyximport; pyximport.install()
 import cyhash as cy

 m = 5000
 inputx = np.array(random.sample(xrange(m**2), m), dtype=np.int32)
 (dupefound, nohashcalls) = cy.floyd(inputx)
	# cython: infer_types=True

	# Use the C math library to avoid Python overhead.
	from libc cimport math
	# For boundscheck below.
	import cython
	# We're lazy so we'll let Numpy handle our array memory management.
	import numpy as np
	# You would normally also import the Numpy pxd to get faster access to the Numpy
	# API, but it requires some fancier compilation options so I'll leave it out for
	# this demo.
	# cimport numpy as np

	import random

	# This is a small function that doesn't need to be exposed to Python at all. Use
	# `cdef` instead of `def` and inline it.
	cdef inline int h3(int a,int b,int c,int d, int m,int x):
	return (ax2 + bx+c) % m

	# If we want to live fast and dangerously, we tell cython not to check our array
	# indices for IndexErrors. This means we CAN overrun our array and crash the
	# program or screw up our stack. Use with caution.
	# @cython.boundscheck(False)
	# `cpdef` so that calling this function from another Cython (or C) function can
	# skip the Python function call overhead, while still allowing us to use it from
	# Python.
	cpdef floyd(int[:] inputx):
	# Type the variables in the scope of the function.
	cdef int a,b,c,d, value, cyclelimit
	cdef unsigned int dupefound = 0
	cdef unsigned int nohashcalls = 0
	cdef unsigned int loopno, pos, j

	# `m` has type int because inputx is already a Cython memory view and
	# `infer-types` is on.
	m = inputx.shape[0]

	cdef unsigned int loops = int(m*math.log(m))

	# Again using the memory view, but letting Numpy allocate an array of zeros.
	cdef int[:] listofpos = np.zeros(m, dtype=np.int32)

	# Keep this random sampling out of the loop
	cdef int[:, :] randoms = np.random.randint(0, m, (loops, 5)).astype(np.int32)

	for loopno in xrange(loops):
	if (dupefound == 1):
	break

	# From our precomputed array
	a = randoms[loopno, 0]
	b = randoms[loopno, 1]
	c = randoms[loopno, 2]
	d = randoms[loopno, 3]
	pos = randoms[loopno, 4]

	value = inputx[pos]

	# Unforunately, Memory View does not support "vectorized" operations
	# like standard Numpy arrays. Otherwise we'd use listofpos *= 0 here.
	for j in xrange(m):
	listofpos[j] = 0

	listofpos[pos] = 1
	setofvalues = set((value,))
	cyclelimit = int(math.sqrt(m))
	for j in xrange(cyclelimit):
	pos = h3(a, b, c, d, m, inputx[pos])
	nohashcalls += 1
	if (inputx[pos] in setofvalues):
	if (listofpos[pos]==1):
	dupefound = 0
	else:
	dupefound = 1
	print "Duplicate found at position", pos, " and value", inputx[pos]
	break
	listofpos[pos] = 1
	setofvalues.add(inputx[pos])
	return dupefound, nohashcalls
	import math, random
	cdef int a,b,c,d,m,pos,value, cyclelimit, nohashcalls
	def h3(int a,int b,int c,int d, int m,int x):
	return (ax2 + bx+c) %m
	def floyd(inputx):
	dupefound, nohashcalls = (0,0)
	m = len(inputx)
	loops = int(m*math.log(m))
	for loopno in xrange(loops):
	if (dupefound == 1):
	break
	a = random.randrange(m)
	b = random.randrange(m)
	c = random.randrange(m)
	d = random.randrange(m)
	pos = random.randrange(m)
	value = inputx[pos]
	listofpos = [0] * m
	listofpos[pos] = 1
	setofvalues = set([value])
	cyclelimit = int(math.sqrt(m))
	for j in xrange(cyclelimit):
	pos = h3(a,b, c,d, m, inputx[pos])
	nohashcalls += 1
	if (inputx[pos] in setofvalues):
	if (listofpos[pos]==1):
	dupefound = 0
	else:
	dupefound = 1
	print "Duplicate found at position", pos, " and value", inputx[pos]
	break
	listofpos[pos] = 1
	setofvalues.add(inputx[pos])
	return dupefound, nohashcalls
	#/usr/bin/env python

	import math, random
	import numpy as np

	import pyximport; pyximport.install()
	import cyhash as cy

	m = 5000
	inputx = np.array(random.sample(xrange(m**2), m), dtype=np.int32)
	(dupefound, nohashcalls) = cy.floyd(inputx)