CountingBloom.py 2.86 KB

Raw Blame History Permalink

import hashlib
import math
import random
from fnvhash import fnv1a_32 as fnv
from fnvhash import fnv1_32 as f32
import sys
import threading

class Counting_bloom_filter(object):
    def __init__(self, elem_num, p): # elem_num : 삽입하고자 하는 원소 개수
        self.elem_num = elem_num
        self.positive = p # false-positive 비율
        self.generate_parameter()
        self.init_bf()


    def generate_parameter(self):
        self.length = math.ceil(-(self.elem_num* math.log(self.positive))/((math.log(2)) ** 2)) # 배열 길이
        self.hash_num = math.ceil((self.length/self.elem_num)*math.log(2)) #


    def init_bf(self):
        self.c_bf = [0] * self.length # array의 모든 원소에 대한 count는 0으로 초기화

    def insert_to_cbf(self, val):
        keytable = []
        loc1 = self.hash_func1(val)
        self.c_bf[loc1] += 1
        keytable.append(self.c_bf[loc1])
        loc2 = self.hash_func2(val)
        self.c_bf[loc2] += 1
        keytable.append(self.c_bf[loc2])
        loc3 = self.hash_func3(val)
        self.c_bf[loc3] += 1
        keytable.append(self.c_bf[loc3])
        return min(keytable)

    def add_to_cbf(self, val): # cbf에 원소 삽입
        # 최적의 해시 개수 hash_num에 대하여
        # 3개라 가정하면, i값은 0~2까지
        # 이 i값을 seed값 삼아 동일 sha1함수에 다른 seed값 부여하고 연산 진행
        # 연산 결과 나온 key값이 loc, CBF 배열의 loc번째 인덱스에 increment연산 진행
        # return값: 해당 값(0~threshold값 까지의 정수)
        for i in range(self.hash_num):
            loc = self.hash_func3(val, i+1)
            self.c_bf[loc] += 1
        return self.c_bf[loc]


    def hash_func1(self, val): #fnv32-1a
        hashedVal = fnv(val.encode('utf-8'))
        return hashedVal % self.length


    def hash_func2(self, val): #MD5
        MD5 = hashlib.md5()
        MD5.update(val.encode('utf-8'))
        hashedVal = int(MD5.hexdigest(), 16)
        return hashedVal % self.length


    def hash_func3(self, val): #sha1
        sha1 = hashlib.sha1()
        sha1.update(str(val).encode('utf-8'))
        val =  int(sha1.hexdigest(), 16)
        return val % self.length


def main():
    a = Counting_bloom_filter(100,0.001) # 100개 ip주소를 cbf에 삽입, 원하는 이론상의 최대 false positive 수치: 0.1%
    print(a.length) # 이 때의 배열의 길이
    print(a.insert_to_cbf('172.14.2.48'))
    print(a.hash_func1('172.14.2.48'))
    print(a.hash_func2('172.14.2.48'))
    print(a.hash_func3('172.14.2.48'))
    #sh = hashlib.sha1() # ip주소의 경우에 적절치 못함(리턴하는 값이 너무 큼)
    #re = sh.update('172.14.2.48'.encode('utf-8'))
    #re =  int(sh.hexdigest(), 16) * a.hash_num
    #print(re%a.length)
    #print(a.look_up(10))
    #a.add_to_cbf(10)
if __name__ == '__main__': main()