Skip to content

Instantly share code, notes, and snippets.

@dariodip
Last active January 18, 2022 11:02
Show Gist options
  • Save dariodip/8ebcd96a8cca33f945c468f372f56f62 to your computer and use it in GitHub Desktop.
Save dariodip/8ebcd96a8cca33f945c468f372f56f62 to your computer and use it in GitHub Desktop.
Find Duplicate
from bloom_filter import BloomFilter
def find_duplicate_bf(a: list, bf_prob: float=0.5):
bf = BloomFilter(len(a), bf_prob)
for (i, el) in enumerate(a):
if bf.check(el): # element is in bloom filter
for j in range(i): # check the previous elements
if a[j] == el:
return el
else:
bf.add(el) # add element to bloom filter
import math
import mmh3
from bitarray import bitarray
class BloomFilter:
'''
Class for Bloom filter, using murmur3 hash function
'''
def __init__(self, count, fp_prob):
'''
count: int
Number of items to be stored in Bloom Filter
fp_prob: float
False positive probability (in decimal)
'''
self.fp_prob = fp_prob
self.size = BloomFilter.get_size(count, fp_prob)
self.hash_count = BloomFilter.get_hash_count(self.size, count)
self.bit_array = bitarray(self.size)
self.bit_array.setall(0)
def add(self, item):
'''
Add an item to the filter
'''
# create a generator having a digest for every hash function
digests = ( mmh3.hash(item, i) % self.size for i in range(self.hash_count) )
for i in digests:
self.bit_array[i] = True
def check(self, item) -> bool:
'''
Check for existence of an item in filter
'''
# create a generator having a digest for every hash function
digests = ( mmh3.hash(item, i) % self.size for i in range(self.hash_count) )
for i in digests:
# if at least one is false, return false
if self.bit_array[i] == False:
return False
return True
@staticmethod
def get_size(n, p) -> int:
'''
Return the size of bit array(m) to be used
n : int
number of items expected to be stored in filter
p : float
False Positive probability in decimal
'''
m = -(n * math.log(p))/(math.log(2)**2)
return int(m)
@staticmethod
def get_hash_count(m, n) -> int:
'''
Return the hash function(k) to be used
m : int
size of bit array
n : int
number of items expected to be stored in filter
'''
k = (m/n) * math.log(2)
return int(k)
def find_duplicate_sum(a: list) -> int:
sum_els = lambda x: int(x * (x + 1) / 2)
sum_before = sum_els(min(a) - 1)
sum_after = sum_els(max(a))
list_sum = sum(a)
return list_sum - (sum_after - sum_before)
def find_duplicate_dummy(a: list):
# for each index but the last
for i in range(len(a) - 1):
# for every other element
for j in range(i + 1, len(a)):
if a[i] == a[j]:
return a[i]
def find_duplicates_floyd(a: list):
# both tortoise and hare start
# from the beginning
tortoise = a[0]
hare = a[0]
while True:
tortoise = a[tortoise]
hare = a[a[hare]]
if tortoise == hare:
# cycle found
break
# go to the beginning of the cycle
ptr1 = a[0]
ptr2 = tortoise
while ptr1 != ptr2:
ptr1 = a[ptr1]
ptr2 = a[ptr2]
return ptr1
def find_duplicate_map(a: list):
seen = dict()
for i in a:
if i in seen:
# here is a duplicate
return i
seen[i] = True
import random
def generate_random_list(n, k):
"""
Return an unsorted list of n random integers from 0 to k
"""
return [random.randint(0, k) for _ in range(n)]
def generate_random_list_pigeon(n):
"""
Return an unsorted list of n random elements from 1 to n-1.
Exacly one element is duplicated.
"""
a = [i for i in range(n)]
a[0] = a[random.randint(0, n-1)]
random.shuffle(a) # shuffle the list
return a
def find_duplicate_sorted(a: list):
a.sort()
for i in range(1, len(a)):
# duplicates are adjacent
if a[i-1] == a[i]:
return a[i]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment