22//
33// Bloom filters
44//
5- // A Bloom filter is a space-efficient probabilistic data structure
6- // used to test set membership. A member test returns either
5+ // A Bloom filter is a fast and space-efficient probabilistic data structure
6+ // used to test set membership.
7+ //
// A membership test returns either
// "likely member" or "definitely not a member". Only false positives
// can occur: an element that has been added to the filter
// will always be identified as "likely member".
3133//
3234// This implementation is not intended for cryptographic use.
3335// Each membership test makes a single call to a 128-bit MurmurHash3 function.
34- // This saves on hashing without increasing the false-positives
35- // probability as shown by Kirsch and Mitzenmacher.
36+ // This improves speed without increasing the false-positives rate
37+ // as shown by Kirsch and Mitzenmacher.
3638//
3739package bloom
3840
@@ -49,23 +51,24 @@ const (
type Filter struct {
	data    []uint64 // Bit array; the length is a power of 2.
	lookups int      // Number of bit lookups per query.
	count   int64    // Estimated number of elements.
}
5456
// murmur is the package-level digest whose hash method produces the
// pair of hash values used for each key (see TestByte).
// NOTE(review): this value is shared by every Filter; whether hash mutates
// it, and so whether concurrent use is safe, is not visible here. Confirm.
var murmur = new(digest)
5659
5760// New creates an empty Bloom filter with room for n elements
5861// at a false-positives rate less than 1/p.
5962func New (n int , p int ) * Filter {
60- f := & Filter {}
6163 minWords := int (0.0325 * math .Log (float64 (p )) * float64 (n ))
6264 words := 1
6365 for words < minWords {
6466 words *= 2
6567 }
66- f .data = make ([]uint64 , words )
67- f .lookups = int (1.4 * math .Log (float64 (p )) + 1 )
68- return f
68+ return & Filter {
69+ data : make ([]uint64 , words ),
70+ lookups : int (1.4 * math .Log (float64 (p )) + 1 ),
71+ }
6972}
7073
7174// AddByte adds b to the filter and tells if b was already a likely member.
@@ -95,7 +98,7 @@ func (f *Filter) Add(s string) bool {
9598 return f .AddByte (b )
9699}
97100
98- // TestByte tells if b is a likely member of this filter.
101+ // TestByte tells if b is a likely member of the filter.
99102func (f * Filter ) TestByte (b []byte ) bool {
100103 h1 , h2 := murmur .hash (b )
101104 trunc := uint64 (len (f .data ))<< shift - 1
@@ -110,14 +113,56 @@ func (f *Filter) TestByte(b []byte) bool {
110113 return true
111114}
112115
113- // Test tells if s is a likely member of this filter.
116+ // Test tells if s is a likely member of the filter.
114117func (f * Filter ) Test (s string ) bool {
115118 b := make ([]byte , len (s ))
116119 copy (b , s )
117120 return f .TestByte (b )
118121}
119122
// Count returns an estimate of the number of elements in the filter.
// It reports the filter's stored count field without recomputing it.
func (f *Filter) Count() int64 {
	return f.count
}
127+
128+ // Union returns a new Bloom filter that consists of all elements
129+ // that belong to either f1 or f2. The two filters must be of
130+ // the same size n and have the same false-positives rate p.
131+ //
132+ // The resulting filter is the same as the filter created
133+ // from scratch using the union of the two sets.
134+ func (f1 * Filter ) Union (f2 * Filter ) * Filter {
135+ if len (f1 .data ) != len (f2 .data ) || f1 .lookups != f2 .lookups {
136+ panic ("operation requires filters of the same type" )
137+ }
138+ len := len (f1 .data )
139+ res := & Filter {
140+ data : make ([]uint64 , len ),
141+ lookups : f1 .lookups ,
142+ }
143+ bitCount := 0
144+ for i := 0 ; i < len ; i ++ {
145+ w := f1 .data [i ] | f2 .data [i ]
146+ res .data [i ] = w
147+ bitCount += count (w )
148+ }
149+ // Estimate the number of elements from the bitCount.
150+ m := 64 * float64 (len )
151+ n := m / float64 (f1 .lookups ) * math .Log (m / (m - float64 (bitCount )))
152+ res .count = int64 (n )
153+ return res
154+ }
155+
// count returns the number of nonzero bits in w.
func count(w uint64) int {
	// Adapted from github.com/yourbasic/bit/funcs.go.
	// Parallel bit count: sum adjacent bits into 2-bit fields, then 4-bit
	// fields, then bytes, and finally add all bytes with one multiplication.
	const (
		m1  = 0x5555555555555555 // every other bit
		m2  = 0x3333333333333333 // low two bits of each nibble
		m4  = 0x0f0f0f0f0f0f0f0f // low nibble of each byte
		h01 = 0x0101010101010101 // lowest bit of each byte
	)
	w -= (w >> 1) & m1
	w = (w & m2) + ((w >> 2) & m2)
	w = (w + (w >> 4)) & m4
	// The multiplication accumulates all byte sums into the top byte.
	return int((w * h01) >> 56)
}
0 commit comments