直观的说,bloom算法类似一个hash set,用来判断某个元素(key)是否在某个集合中。
和一般的hash set不同的是,这个算法无需存储key的值,对于每个key,只需要k个比特位,每个存储一个标志,用来判断key是否在集合中。
不需要存储key,节省空间
package studio.greeks.gooney.test;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.security.MessageDigest;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
/**
* @author 吴昭
* @version gooney@2019/1/17 22:17
*/
public class BooleanSet<E extends Serializable> implements Set<E>, Serializable {
private AtomicInteger counter = new AtomicInteger(0);
private boolean[] status = new boolean[0x1000];
private int[] EMPTY = new int[0];
private transient MessageDigest digest;
public BooleanSet() {
}
public BooleanSet(MessageDigest digest) {
this.digest = digest;
}
public int size() {
return counter.get();
}
public boolean isEmpty() {
return counter.get() == 0;
}
private int[] serial(Object o){
if(o instanceof Serializable){
try {
ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
ObjectOutputStream oos = new ObjectOutputStream(byteArrayOutputStream);
oos.writeObject(o);
byte[] bytes = byteArrayOutputStream.toByteArray();
byteArrayOutputStream.close();
oos.close();
if(null != digest){
bytes = digest.digest(bytes);
}
int[] result = new int[bytes.length-1];
for (int i = 0; i < bytes.length-1; i++) {
result[i] = ((bytes[i]>>4) & 0x0f)*0x10 + ((bytes[i+1]>>4) & 0x0f);
}
return result;
}catch (IOException ie){
ie.printStackTrace();
}
}
return EMPTY;
}
public boolean contains(Object o) {
if(o instanceof Serializable){
for (int i : serial((E) o)) {
if(!status[i]){
return false;
}
}
return true;
}
return false;
}
public Iterator<E> iterator() {
throw new UnsupportedOperationException();
}
public Object[] toArray() {
throw new UnsupportedOperationException();
}
public <T> T[] toArray(T[] a) {
throw new UnsupportedOperationException();
}
public boolean add(E e) {
if(add(e, status)){
counter.incrementAndGet();
return true;
}
return false;
}
private boolean add(E e, boolean[] temp){
int[] is = serial(e);
assert temp.length == status.length;
for (int i : is) {
temp[i] = true;
}
return is.length==0;
}
public boolean remove(Object o) {
throw new UnsupportedOperationException();
}
public boolean containsAll(Collection c) {
for (Object o : c) {
if(!contains(o))
return false;
return true;
}
return false;
}
public boolean addAll(Collectionextends E> c) {
boolean[] temp = new boolean[status.length];
for (E e : c) {
if(!add(e, temp)){
return false;
}
}
for (int i = 0; i < temp.length; i++) {
status[i] = temp[i];
}
counter.set(counter.get()+c.size());
return true;
}
public boolean retainAll(Collection c) {
throw new UnsupportedOperationException();
}
public boolean removeAll(Collection c) {
throw new UnsupportedOperationException();
}
public void clear() {
counter.set(0);
for (int i = 0; i < status.length; i++) {
status[i] = false;
}
}
public Spliterator<E> spliterator() {
throw new UnsupportedOperationException();
}
}
package studio.greeks.gooney.test;
import java.io.ByteArrayOutputStream;
import java.io.ObjectOutputStream;
import java.security.MessageDigest;
import java.util.HashSet;
import java.util.Set;
import java.util.TreeSet;
import java.util.UUID;
/**
* @author 吴昭
* @version gooney@2019/1/18 0:01
*/
public class SetTest {
public static void main(String[] args) throws Exception {
int testLen = 10000000;
test( new BooleanSet<>(), "未设置HASH算法的BooleanSet测试",testLen);
test( new BooleanSet<>(MessageDigest.getInstance("MD5")), "设置MD5为HASH算法的BooleanSet测试",testLen);
test( new TreeSet<>(), "TreeSet测试",testLen);
test( new HashSet<>(), "HashSet测试",testLen);
}
private static void test(Set set, String msg, int testSize) throws Exception{
System.out.println("\n"+msg+":");
set.add("test");
System.out.println("添加元素:test");
System.out.println("是否包含[hello]:"+set.contains("hello"));
System.out.println("是否包含[tes]:"+set.contains("tes"));
System.out.println("是否包含[test1]:"+set.contains("test1"));
long start = System.currentTimeMillis();
for (int i = 0; i < testSize; i++) {
set.add(UUID.randomUUID().toString());
}
System.out.println("添加"+testSize+"个元素耗时:"+(System.currentTimeMillis() - start));
start = System.currentTimeMillis();
for (int i = 0; i < testSize; i++) {
set.contains(UUID.randomUUID().toString());
}
System.out.println("验证"+testSize+"个元素耗时:"+(System.currentTimeMillis() - start));
System.out.println("对象占用空间长度:"+getObjectSize(set));
}
private static int getObjectSize(Object o) throws Exception {
ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
ObjectOutputStream oos = new ObjectOutputStream(byteArrayOutputStream);
oos.writeObject(o);
return byteArrayOutputStream.size();
}
}
未设置HASH算法的BooleanSet测试:
添加元素:test
是否包含[hello]:false
是否包含[tes]:true
是否包含[test1]:false
添加10000000个元素耗时:14329
验证10000000个元素耗时:13174
对象占用空间长度:4383
设置MD5为HASH算法的BooleanSet测试:
添加元素:test
是否包含[hello]:false
是否包含[tes]:false
是否包含[test1]:false
添加10000000个元素耗时:14652
验证10000000个元素耗时:14892
对象占用空间长度:4383
TreeSet测试:
添加元素:test
是否包含[hello]:false
是否包含[tes]:false
是否包含[test1]:false
添加10000000个元素耗时:32461
验证10000000个元素耗时:31456
对象占用空间长度:390000053
HashSet测试:
添加元素:test
是否包含[hello]:false
是否包含[tes]:false
是否包含[test1]:false
添加10000000个元素耗时:17864
验证10000000个元素耗时:12508
对象占用空间长度:390000060
参考文章:## 布隆过滤器(Bloom Filter)详解
全部评论