Skip to content

Commit 8cbd4b0

Browse files
authored
Merge pull request #1 from yourbasic/tip
Tip
2 parents 6e23ebd + 281c5d8 commit 8cbd4b0

5 files changed

Lines changed: 138 additions & 32 deletions

File tree

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ Golang Bloom filter implementation
44

55
![Neutral density filter](ND-filter.jpg)
66

7-
*Neutral density filter, image by [Robert Emperley][re], [CC BY-SA 2.0][ccbysa].*
7+
*Image by [Robert Emperley][re], [CC BY-SA 2.0][ccbysa].*
88

99
### Installation
1010

example_test.go

Lines changed: 29 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ import (
77
"strconv"
88
)
99

10-
// Create and use a Bloom filter.
10+
// Build a blacklist of shady websites.
1111
func Example_basics() {
1212
// Create a Bloom filter with room for 10000 elements
1313
// at a false-positives rate less than 0.5 percent.
@@ -26,27 +26,49 @@ func Example_basics() {
2626
// Output: https://rascal.com seems to be shady.
2727
}
2828

29-
// Count the number of false positives.
29+
// Estimate the number of false positives.
3030
func Example_falsePositives() {
3131
// Create a Bloom filter with room for n elements
3232
// at a false-positives rate less than 1/p.
33-
n := 1000
34-
p := 100
33+
n, p := 10000, 100
3534
filter := bloom.New(n, p)
3635

3736
// Add n random strings.
3837
for i := 0; i < n; i++ {
39-
filter.Add(strconv.FormatUint(rand.Uint64(), 10))
38+
filter.Add(strconv.Itoa(rand.Int()))
4039
}
4140

4241
// Do n random lookups and count the (mostly accidental) hits.
4342
// It shouldn't be much more than n/p, and hopefully less.
4443
count := 0
4544
for i := 0; i < n; i++ {
46-
if filter.Test(strconv.FormatUint(rand.Uint64(), 10)) {
45+
if filter.Test(strconv.Itoa(rand.Int())) {
4746
count++
4847
}
4948
}
5049
fmt.Println(count, "mistakes were made.")
51-
// Output: 1 mistakes were made.
50+
// Output: 26 mistakes were made.
51+
}
52+
53+
// Compute the union of two filters.
54+
func ExampleFilter_Union() {
55+
// Create two Bloom filters, each with room for 1000 elements
56+
// at a false-positives rate less than 1/100.
57+
n, p := 1000, 100
58+
f1 := bloom.New(n, p)
59+
f2 := bloom.New(n, p)
60+
61+
// Add "0", "2", …, "498" to f1
62+
for i := 0; i < n/2; i += 2 {
63+
f1.Add(strconv.Itoa(i))
64+
}
65+
66+
// Add "1", "3", …, "499" to f2
67+
for i := 1; i < n/2; i += 2 {
68+
f2.Add(strconv.Itoa(i))
69+
}
70+
71+
// Compute the approximate size of f1 ∪ f2.
72+
fmt.Println("f1 ∪ f2:", f1.Union(f2).Count())
73+
// Output: f1 ∪ f2: 505
5274
}

filter.go

Lines changed: 57 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,10 @@
22
//
33
// Bloom filters
44
//
5-
// A Bloom filter is a space-efficient probabilistic data structure
6-
// used to test set membership. A member test returns either
5+
// A Bloom filter is a fast and space-efficient probabilistic data structure
6+
// used to test set membership.
7+
//
8+
// A membership test returns either
79
// “likely member” or “definitely not a member”. Only false positives
810
// can occur: an element that has been added to the filter
911
// will be identified as “likely member”.
@@ -31,8 +33,8 @@
3133
//
3234
// This implementation is not intended for cryptographic use.
3335
// Each membership test makes a single call to a 128-bit MurmurHash3 function.
34-
// This saves on hashing without increasing the false-positives
35-
// probability as shown by Kirsch and Mitzenmacher.
36+
// This improves speed without increasing the false-positives rate
37+
// as shown by Kirsch and Mitzenmacher.
3638
//
3739
package bloom
3840

@@ -49,23 +51,24 @@ const (
4951
type Filter struct {
5052
data []uint64 // Bit array, the length is a power of 2.
5153
lookups int // Lookups per query
52-
count int64 // Estimated number of unique elements
54+
count int64 // Estimated number of elements
5355
}
5456

57+
// MurmurHash3 function.
5558
var murmur = new(digest)
5659

5760
// New creates an empty Bloom filter with room for n elements
5861
// at a false-positives rate less than 1/p.
5962
func New(n int, p int) *Filter {
60-
f := &Filter{}
6163
minWords := int(0.0325 * math.Log(float64(p)) * float64(n))
6264
words := 1
6365
for words < minWords {
6466
words *= 2
6567
}
66-
f.data = make([]uint64, words)
67-
f.lookups = int(1.4*math.Log(float64(p)) + 1)
68-
return f
68+
return &Filter{
69+
data: make([]uint64, words),
70+
lookups: int(1.4*math.Log(float64(p)) + 1),
71+
}
6972
}
7073

7174
// AddByte adds b to the filter and tells if b was already a likely member.
@@ -95,7 +98,7 @@ func (f *Filter) Add(s string) bool {
9598
return f.AddByte(b)
9699
}
97100

98-
// TestByte tells if b is a likely member of this filter.
101+
// TestByte tells if b is a likely member of the filter.
99102
func (f *Filter) TestByte(b []byte) bool {
100103
h1, h2 := murmur.hash(b)
101104
trunc := uint64(len(f.data))<<shift - 1
@@ -110,14 +113,56 @@ func (f *Filter) TestByte(b []byte) bool {
110113
return true
111114
}
112115

113-
// Test tells if s is a likely member of this filter.
116+
// Test tells if s is a likely member of the filter.
114117
func (f *Filter) Test(s string) bool {
115118
b := make([]byte, len(s))
116119
copy(b, s)
117120
return f.TestByte(b)
118121
}
119122

120-
// Count returns an estimate of the number of unique elements added to this filter.
123+
// Count returns an estimate of the number of elements in the filter.
121124
func (f *Filter) Count() int64 {
122125
return f.count
123126
}
127+
128+
// Union returns a new Bloom filter that consists of all elements
129+
// that belong to either f1 or f2. The two filters must be of
130+
// the same size n and have the same false-positives rate p.
131+
//
132+
// The resulting filter is the same as the filter created
133+
// from scratch using the union of the two sets.
134+
func (f1 *Filter) Union(f2 *Filter) *Filter {
135+
if len(f1.data) != len(f2.data) || f1.lookups != f2.lookups {
136+
panic("operation requires filters of the same type")
137+
}
138+
len := len(f1.data)
139+
res := &Filter{
140+
data: make([]uint64, len),
141+
lookups: f1.lookups,
142+
}
143+
bitCount := 0
144+
for i := 0; i < len; i++ {
145+
w := f1.data[i] | f2.data[i]
146+
res.data[i] = w
147+
bitCount += count(w)
148+
}
149+
// Estimate the number of elements from the bitCount.
150+
m := 64 * float64(len)
151+
n := m / float64(f1.lookups) * math.Log(m/(m-float64(bitCount)))
152+
res.count = int64(n)
153+
return res
154+
}
155+
156+
// count returns the number of nonzero bits in w.
func count(w uint64) int {
	// Branch-free population count, adapted from
	// github.com/yourbasic/bit/funcs.go. Bits are summed in parallel
	// within fields of 2, 4 and then 8 bits; a final multiplication adds
	// the eight byte sums together into the most significant byte.
	const (
		pairs   = 0x5555555555555555 // 01 repeated: low bit of each 2-bit field
		quads   = 0x3333333333333333 // 0011 repeated: low half of each 4-bit field
		nibbles = 0x0f0f0f0f0f0f0f0f // low half of each byte
		bytes   = 0x0101010101010101 // lowest bit of each byte
	)
	w = w - (w>>1)&pairs
	w = (w & quads) + ((w >> 2) & quads)
	w = (w + w>>4) & nibbles
	return int((w * bytes) >> 56)
}

filter_test.go

Lines changed: 48 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ func TestFilter(t *testing.T) {
4848

4949
member = filter.Add(s3)
5050
if member {
51-
t.Errorf("Add(s1) = %v; want false\n", member)
51+
t.Errorf("Add(s3) = %v; want false\n", member)
5252
}
5353
count = filter.Count()
5454
if count != 2 {
@@ -58,23 +58,52 @@ func TestFilter(t *testing.T) {
5858
}
5959
}
6060

61+
func TestUnion(t *testing.T) {
62+
s1 := "asöldkgjaösldkgaösldkasldgjkaösldkgjöasgkdjg"
63+
s2 := "elasödlnkgaölsdkfgaölsdkjfaölsdkgaölskgnaösl"
64+
s3 := "aölsdgkaösldkgaösldkgjaölsdkjgaölsdkgjaösldk"
65+
for n := 0; n < 100; n++ {
66+
for p := 1; p <= 128; p *= 2 {
67+
f1, f2 := New(n, p), New(n, p)
68+
f1.Add(s1)
69+
f1.Add(s2)
70+
f2.Add(s2)
71+
f2.Add(s3)
72+
or := f1.Union(f2)
73+
member := or.Test(s1)
74+
if !member {
75+
t.Errorf("f1.Union(f2).Test(s1) = %v; want true\n", member)
76+
}
77+
member = or.Test(s2)
78+
if !member {
79+
t.Errorf("f1.Union(f2).Test(s2) = %v; want true\n", member)
80+
}
81+
member = or.Test(s3)
82+
if !member {
83+
t.Errorf("f1.Union(f2).Test(s3) = %v; want true\n", member)
84+
}
85+
}
86+
}
87+
}
88+
89+
var fox string = "The quick brown fox jumps over the lazy dog."
90+
6191
func BenchmarkAdd(b *testing.B) {
6292
b.StopTimer()
6393
filter := New(1<<30, 200)
6494
b.StartTimer()
65-
s := "The quick brown fox jumps over the lazy dog."
6695
for i := 0; i < b.N; i++ {
67-
filter.Add(s)
96+
_ = filter.Add(fox)
6897
}
6998
}
7099

71100
func BenchmarkAddByte(b *testing.B) {
72101
b.StopTimer()
73102
filter := New(1<<30, 200)
74103
b.StartTimer()
75-
s := []byte("The quick brown fox jumps over the lazy dog.")
104+
bytes := []byte(fox)
76105
for i := 0; i < b.N; i++ {
77-
filter.AddByte(s)
106+
_ = filter.AddByte(bytes)
78107
}
79108
}
80109

@@ -83,16 +112,27 @@ func BenchmarkTest(b *testing.B) {
83112
filter := New(1<<30, 200)
84113
b.StartTimer()
85114
for i := 0; i < b.N; i++ {
86-
filter.Test("The quick brown fox jumps over the lazy dog.")
115+
_ = filter.Test(fox)
87116
}
88117
}
89118

90119
func BenchmarkTestByte(b *testing.B) {
91120
b.StopTimer()
92121
filter := New(1<<30, 200)
93122
b.StartTimer()
94-
s := []byte("The quick brown fox jumps over the lazy dog.")
123+
bytes := []byte(fox)
124+
for i := 0; i < b.N; i++ {
125+
_ = filter.TestByte(bytes)
126+
}
127+
}
128+
129+
func BenchmarkTestUnion(b *testing.B) {
130+
n := 1000
131+
b.StopTimer()
132+
f1 := New(n, 200)
133+
f2 := New(n, 200)
134+
b.StartTimer()
95135
for i := 0; i < b.N; i++ {
96-
filter.TestByte(s)
136+
_ = f1.Union(f2)
97137
}
98138
}

hash_test.go

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,8 @@ import (
77
func TestHash(t *testing.T) {
88
d := new(digest)
99
var data = []struct {
10-
h1 uint64
11-
h2 uint64
12-
s string
10+
h1, h2 uint64
11+
s string
1312
}{
1413
{0x0000000000000000, 0x0000000000000000, ""},
1514
{0xcbd8a7b341bd9b02, 0x5b1e906a48ae1d19, "hello"},
@@ -23,7 +22,7 @@ func TestHash(t *testing.T) {
2322
t.Errorf("hash(%q).h1 = %d; want %d\n", x.s, h1, x.h1)
2423
}
2524
if h2 != x.h2 {
26-
t.Errorf("hash(%q).h1 = %d; want %d\n", x.s, h2, x.h2)
25+
t.Errorf("hash(%q).h2 = %d; want %d\n", x.s, h2, x.h2)
2726
}
2827
}
2928
}

0 commit comments

Comments
 (0)