У меня есть фильтр Блума, реализованный в Python, с методами добавления новых элементов и проверки, включен ли элемент в фильтр.Я хотел бы знать, возможно ли реализовать запросы диапазона в сублинейном пространстве (или времени), предполагая, что все входные данные являются целочисленными.Если нет, я должен использовать другую технологию, а не Python?
Фильтр Блума
class BloomFilter(object):
def __init__(self, n_items, fp_prob):
'''
n_items : Number of items expected to be stored in bloom filter
fp_prob : False Positive probability in decimal
'''
# False posible probability in decimal
self.fp_prob = fp_prob
# Size of bit array to use
self.size = self.get_bit_array_size(n_items, fp_prob)
# number of hash functions to use
self.hash_count = self.get_number_of_hash_functions(self.size, n_items)
# Bit array of given size
self.bit_array = bitarray(self.size)
# initialize all bits as 0
self.bit_array.setall(0)
def len(self):
return self.size
def add(self, item): # add an element to the filter
hash_values = []
for i in range(self.hash_count):
# create hash_value for given item.
# i work as seed to mmh3.hash() function
# With different seed, the hash_value created is different
hash_value = mmh3.hash(bytes(item), i) % self.size
hash_values.append(hash_value)
# set the bit True in bit_array
self.bit_array[hash_value] = True
def check(self, item): # check for element existence
for i in range(self.hash_count):
hash_value = mmh3.hash(bytes(item), i) % self.size
if self.bit_array[hash_value] is False:
# if any of bit is False then,its not present
# in filter
# else there is probability that it exist
return False
return True
# My range query function
def range_check(self, item, item):
@classmethod
def get_bit_array_size(self, n_items, prob):
'''
Return the size of bit array(m) to used using
following formula
m = -(n * lg(p)) / (lg(2)^2)
n : int
number of items expected to be stored in filter
p : float
False Positive probability in decimal
'''
bit_size = -(n_items * math.log(prob))/(math.log(2)**2)
return int(bit_size)
@classmethod
def get_number_of_hash_functions(self, bit_size, n_items):
'''
Return the hash function(k) to be used using
following formula
k = (m/n) * lg(2)
m : int
size of bit array
n : int
number of items expected to be stored in filter
'''
n_hash_functions = (bit_size/n_items) * math.log(2)
return int(n_hash_functions)