哈希表这种数据结构能够非常快速地查找元素
但它的缺点是空间效率不高
位图提高了空间效率
但位图只能用来判断元素是否存在
关于位图的实现,在这里
http://zhweizhi.blog.51cto.com/10800691/1784383
下面简单介绍一下布隆过滤器。
现在,
假如通过哈希算法,将一个字符串转化成int类型的数据,
再对位图的容量取模,把得到的那一位置为‘1’存放在位图中(位图的每一位只能表示“存在/不存在”,无法保存元素本身,所以这里并不做线性探测、二次探测或开链法之类的冲突处理,冲突只能靠下文的多个哈希算法来缓解),就能实现对字符串的查找了。
不过考虑到哈希冲突,用一个哈希算法肯定是不够的。
可以考虑用多个哈希算法,将转化出的int类型数据映射到多个地方。
这样,在查找时,再通过之前用过的若干个哈希算法查找相应位置是否为‘1’,
如果均为‘1’,那么很大概率上,这个字符串是存在的,
(因此布隆过滤器的判断存在并不是百分之百靠谱的,存在误判的概率,特别是随着元素数量越来越接近容量的时候,误判概率也会越来越高)
如果有一个为‘0’,那么就能肯定这个字符串是不存在的。
实现的代码如下:
我采用了5个哈希算法,可以在网上找,找一些存活率高的。
然后将他们用仿函数实现:
// Functor that hashes a string with the SDBM algorithm.
// Stateless; both members are const so a const functor can be invoked.
struct __HashFunc1
{
    // SDBM hash of a NUL-terminated byte string.
    //   str     - NUL-terminated input; hashing stops at the first '\0'.
    //   returns - the accumulated hash with everything above bit 30 masked off.
    size_t SDBMHash(const char *str) const
    {
        size_t hash = 0;
        while (*str)
        {
            // (hash << 6) + (hash << 16) - hash == hash * 65599
            hash = (*str++) + (hash << 6) + (hash << 16) - hash;
        }
        return (hash & 0x7FFFFFFF);
    }

    // Hashes `key` up to its first embedded '\0' (c_str() semantics).
    size_t operator()(const std::string &key) const
    {
        // c_str() is already const char*; no cast is needed.
        return SDBMHash(key.c_str());
    }
};
// Functor that hashes a string with the RS algorithm.
// Stateless; both members are const so a const functor can be invoked.
struct __HashFunc2
{
    // RS hash of a NUL-terminated byte string.
    //   str     - NUL-terminated input; hashing stops at the first '\0'.
    //   returns - the accumulated hash with the top bit cleared.
    unsigned int RSHash(const char *str) const
    {
        const unsigned int b = 378551;
        unsigned int a = 63689;      // multiplier, itself rescaled by b after every byte
        unsigned int hash = 0;
        while (*str)
        {
            hash = hash * a + (*str++);
            a *= b;                  // unsigned arithmetic: wraps modulo 2^32 by design
        }
        return (hash & 0x7FFFFFFF);
    }

    // Hashes `key` up to its first embedded '\0' (c_str() semantics).
    size_t operator()(const std::string &key) const
    {
        // c_str() is already const char*; no cast is needed.
        return RSHash(key.c_str());
    }
};
// Functor that hashes a string with the BKDR algorithm.
//
// This functor previously contained a byte-for-byte copy of the RS hash used
// by __HashFunc2, so two of the Bloom filter's five probes always landed on
// the same bit. It now uses a distinct algorithm so that every probe
// contributes.
struct __HashFunc3
{
    // BKDR hash of a NUL-terminated byte string.
    //   str     - NUL-terminated input; hashing stops at the first '\0'.
    //   returns - the accumulated hash with the top bit cleared.
    unsigned int BKDRHash(const char *str) const
    {
        const unsigned int seed = 131;   // customary BKDR seeds: 31, 131, 1313, 13131, ...
        unsigned int hash = 0;
        while (*str)
        {
            hash = hash * seed + (*str++);
        }
        return (hash & 0x7FFFFFFF);
    }

    // Hashes `key` up to its first embedded '\0' (c_str() semantics).
    size_t operator()(const std::string &key) const
    {
        return BKDRHash(key.c_str());
    }
};
// Functor that hashes a string with the JS algorithm.
// Stateless; both members are const so a const functor can be invoked.
struct __HashFunc4
{
    // JS hash of a NUL-terminated byte string.
    //   str     - NUL-terminated input; hashing stops at the first '\0'.
    //   returns - the accumulated hash with the top bit cleared
    //             (so the empty string yields the initial value 1315423911).
    unsigned int JSHash(const char *str) const
    {
        unsigned int hash = 1315423911;
        while (*str)
        {
            // Both shifts read the value of `hash` from before this update.
            hash ^= ((hash << 5) + (*str++) + (hash >> 2));
        }
        return (hash & 0x7FFFFFFF);
    }

    // Hashes `key` up to its first embedded '\0' (c_str() semantics).
    size_t operator()(const std::string &key) const
    {
        // c_str() is already const char*; no cast is needed.
        return JSHash(key.c_str());
    }
};
// Functor that hashes a string with the PJW algorithm.
// Stateless; both members are const so a const functor can be invoked.
struct __HashFunc5
{
    // PJW hash of a NUL-terminated byte string.
    //   str     - NUL-terminated input; hashing stops at the first '\0'.
    //   returns - the accumulated hash with the top bit cleared.
    unsigned int PJWHash(const char *str) const
    {
        const unsigned int bitsInUnsignedInt = static_cast<unsigned int>(sizeof(unsigned int) * 8);
        const unsigned int threeQuarters = (bitsInUnsignedInt * 3) / 4;
        const unsigned int oneEighth = bitsInUnsignedInt / 8;
        // Mask selecting the top `oneEighth` bits of an unsigned int.
        const unsigned int highBits =
            static_cast<unsigned int>(0xFFFFFFFF) << (bitsInUnsignedInt - oneEighth);

        unsigned int hash = 0;
        while (*str)
        {
            hash = (hash << oneEighth) + (*str++);
            const unsigned int test = hash & highBits;
            if (test != 0)
            {
                // Fold the bits that reached the top back into the low part,
                // then clear the top bits so the next shift cannot lose them.
                hash = ((hash ^ (test >> threeQuarters)) & (~highBits));
            }
        }
        return (hash & 0x7FFFFFFF);
    }

    // Hashes `key` up to its first embedded '\0' (c_str() semantics).
    size_t operator()(const std::string &key) const
    {
        // c_str() is already const char*; no cast is needed.
        return PJWHash(key.c_str());
    }
};
// Bloom filter:
// Bloom filter
//
// Set-membership filter for strings, backed by a BitMap and five hash functors.
// Set() marks one bit per functor; Test() reports true only if all five bits
// are marked. A false result is definitive; a true result may be a false
// positive, and there is no way to remove a key.
//
// Each HashFuncN must be default-constructible and callable with a
// `const std::string&`, returning a value convertible to size_t.
//
// NOTE(review): the bitmap is sized from the expected element count alone
// (first table prime greater than n). If BitMap's constructor argument is a
// bit count, that is roughly one bit per element for five probes, which
// saturates quickly - consider passing a larger n. Confirm against BitMap.
template<
    class HashFunc1 = __HashFunc1,
    class HashFunc2 = __HashFunc2,
    class HashFunc3 = __HashFunc3,
    class HashFunc4 = __HashFunc4,
    class HashFunc5 = __HashFunc5
>
class BloomFillter
{
public:
    // n - requested size; the actual capacity is the first prime in the
    //     internal table that is strictly greater than n (or n itself when
    //     n is not below the largest table entry).
    BloomFillter(size_t n)
        : _capacity(_GetNextPrime(n))
        , _bm(_capacity)
    {}

    // Records `key` by marking the five positions its hashes map to.
    void Set(const std::string &key)
    {
        _bm.Set(HashFunc1()(key) % _capacity);
        _bm.Set(HashFunc2()(key) % _capacity);
        _bm.Set(HashFunc3()(key) % _capacity);
        _bm.Set(HashFunc4()(key) % _capacity);
        _bm.Set(HashFunc5()(key) % _capacity);
    }

    // Returns false if `key` was definitely never Set(); true if it probably was.
    // Short-circuits: later hashes are not computed once one position is clear.
    bool Test(const std::string &key)
    {
        return _bm.Test(HashFunc1()(key) % _capacity)
            && _bm.Test(HashFunc2()(key) % _capacity)
            && _bm.Test(HashFunc3()(key) % _capacity)
            && _bm.Test(HashFunc4()(key) % _capacity)
            && _bm.Test(HashFunc5()(key) % _capacity);
    }

protected:
    // Returns the first table prime strictly greater than n, or n unchanged
    // when no table entry exceeds it. Reads no member state, so it is safe to
    // call from the constructor's initializer list.
    size_t _GetNextPrime(size_t n)
    {
        static const unsigned long kPrimes[] =
        {
            53ul,         97ul,         193ul,        389ul,        769ul,
            1543ul,       3079ul,       6151ul,       12289ul,      24593ul,
            49157ul,      98317ul,      196613ul,     393241ul,     786433ul,
            1572869ul,    3145739ul,    6291469ul,    12582917ul,   25165843ul,
            50331653ul,   100663319ul,  201326611ul,  402653189ul,  805306457ul,
            1610612741ul, 3221225473ul, 4294967291ul
        };
        const size_t kPrimeCount = sizeof(kPrimes) / sizeof(kPrimes[0]);

        for (size_t i = 0; i < kPrimeCount; ++i)
        {
            if (kPrimes[i] > n)
            {
                return kPrimes[i];
            }
        }
        return n;
    }

protected:
    size_t _capacity;   // modulus for every hash; declared before _bm because _bm is built from it
    BitMap _bm;         // underlying bit storage, constructed with _capacity
};
// Original article: http://zhweizhi.blog.51cto.com/10800691/1784384