Merge pull request #2 from yourbasic/tip

korthaj · web-flow · commit 3d3d3d05b147 · 2017-05-16T16:24:39.000+02:00
Tip
diff --git a/README.md b/README.md
@@ -1,6 +1,10 @@
 # Your basic Bloom filter
 
-Golang Bloom filter implementation
+### Golang probabilistic set data structure
+
+A Bloom filter is a fast and space-efficient probabilistic
+data structure used to test set membership. A membership test
+returns either “likely member” or “definitely not a member”.
 
 ![Neutral density filter](ND-filter.jpg)
 
@@ -23,12 +27,14 @@ There is an online reference for the package at
 * The API of this library is frozen.
 * Version numbers adhere to [semantic versioning][sv].
 
-The only accepted reason to modify the API of this package is to handle
-bug fixes that can't be resolved in any other reasonable way.
+The only accepted reason to modify the API of this package
+is to handle issues that can't be resolved in any other
+reasonable way.
 
 ### Thanks
 
-Thanks to [Sébastien Paolacci][sp] for his excellent MurmurHash implementation.
+Thanks to [Sébastien Paolacci][sp] for his excellent
+MurmurHash implementation.
 
 Stefan Nilsson – [korthaj](https://github.com/korthaj)
 
diff --git a/filter.go b/filter.go
@@ -54,8 +54,11 @@ type Filter struct {
 	count   int64    // Estimate number of elements
 }
 
-// MurmurHash3 function.
-var murmur = new(digest)
+// MurmurHash3 functions.
+var (
+	murmur       = new(digest)
+	murmurString = new(digestString)
+)
 
 // New creates an empty Bloom filter with room for n elements
 // at a false-positives rate less than 1/p.
@@ -73,7 +76,15 @@ func New(n int, p int) *Filter {
 
 // AddByte adds b to the filter and tells if b was already a likely member.
 func (f *Filter) AddByte(b []byte) bool {
-	h1, h2 := murmur.hash(b)
+	return f.add(murmur.hash(b))
+}
+
+// Add adds s to the filter and tells if s was already a likely member.
+func (f *Filter) Add(s string) bool {
+	return f.add(murmurString.hash(s))
+}
+
+func (f *Filter) add(h1, h2 uint64) bool {
 	trunc := uint64(len(f.data))<<shift - 1
 	member := true
 	for i := f.lookups; i > 0; i-- {
@@ -91,16 +102,17 @@ func (f *Filter) AddByte(b []byte) bool {
 	return member
 }
 
-// Add adds s to the filter and tells if s was already a likely member.
-func (f *Filter) Add(s string) bool {
-	b := make([]byte, len(s))
-	copy(b, s)
-	return f.AddByte(b)
-}
-
 // TestByte tells if b is a likely member of the filter.
 func (f *Filter) TestByte(b []byte) bool {
-	h1, h2 := murmur.hash(b)
+	return f.test(murmur.hash(b))
+}
+
+// Test tells if s is a likely member of the filter.
+func (f *Filter) Test(s string) bool {
+	return f.test(murmurString.hash(s))
+}
+
+func (f *Filter) test(h1, h2 uint64) bool {
 	trunc := uint64(len(f.data))<<shift - 1
 	for i := f.lookups; i > 0; i-- {
 		h1 += h2
@@ -113,13 +125,6 @@ func (f *Filter) TestByte(b []byte) bool {
 	return true
 }
 
-// Test tells if s is a likely member of the filter.
-func (f *Filter) Test(s string) bool {
-	b := make([]byte, len(s))
-	copy(b, s)
-	return f.TestByte(b)
-}
-
 // Count returns an estimate of the number of elements in the filter.
 func (f *Filter) Count() int64 {
 	return f.count
diff --git a/filter_test.go b/filter_test.go
@@ -126,7 +126,7 @@ func BenchmarkTestByte(b *testing.B) {
 	}
 }
 
-func BenchmarkTestUnion(b *testing.B) {
+func BenchmarkUnion(b *testing.B) {
 	n := 1000
 	b.StopTimer()
 	f1 := New(n, 200)
diff --git a/hash.go b/hash.go
@@ -1,9 +1,5 @@
 package bloom
 
-import (
-	"encoding/binary"
-)
-
 // MurmurHash3 implementation adapted from Sébastien Paolacci
 // github.com/spaolacci/murmur3, released under BSD-3-Clause.
 
@@ -20,20 +16,24 @@ const (
 )
 
 type digest struct {
-	clen int      // Digested input cumulative length.
-	buf  [16]byte // Expected (but not required) to be 16 large.
-	tail []byte   // 0 to 15 bytes view of buf.
-	h1   uint64   // Running hash part 1.
-	h2   uint64   // Running hash part 2.
+	clen int
+	tail []byte
+	h1   uint64
+	h2   uint64
+}
+
+func Uint64(b []byte) uint64 {
+	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
+		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
 }
 
 func (d *digest) bmix(p []byte) (tail []byte) {
 	h1, h2 := d.h1, d.h2
 	nblocks := len(p) / 16
 	for i := 0; i < nblocks; i++ {
 		j := 16 * i
-		k1 := binary.LittleEndian.Uint64(p[j : j+8])
-		k2 := binary.LittleEndian.Uint64(p[j+8 : j+16])
+		k1 := Uint64(p[j : j+8])
+		k2 := Uint64(p[j+8 : j+16])
 		k1 *= c1
 		k1 = (k1 << 31) | (k1 >> 33)
 		k1 *= c2
diff --git a/hash_string.go b/hash_string.go
@@ -0,0 +1,117 @@
+package bloom
+
+// MurmurHash3 implementation adapted from Sébastien Paolacci
+// github.com/spaolacci/murmur3, released under BSD-3-Clause.
+
+func (d *digestString) hash(data string) (h1 uint64, h2 uint64) {
+	d.h1, d.h2 = 0, 0
+	d.clen = len(data)
+	d.tail = d.bmixString(data)
+	return d.sum()
+}
+
+type digestString struct {
+	clen int
+	tail string
+	h1   uint64
+	h2   uint64
+}
+
+func Uint64String(b string) uint64 {
+	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
+		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
+}
+
+func (d *digestString) bmixString(p string) (tail string) {
+	h1, h2 := d.h1, d.h2
+	nblocks := len(p) / 16
+	for i := 0; i < nblocks; i++ {
+		j := 16 * i
+		k1 := Uint64String(p[j : j+8])
+		k2 := Uint64String(p[j+8 : j+16])
+		k1 *= c1
+		k1 = (k1 << 31) | (k1 >> 33)
+		k1 *= c2
+		h1 ^= k1
+		h1 = (h1 << 27) | (h1 >> 37)
+		h1 += h2
+		h1 = h1*5 + 0x52dce729
+		k2 *= c2
+		k2 = (k2 << 33) | (k2 >> 31)
+		k2 *= c1
+		h2 ^= k2
+		h2 = (h2 << 31) | (h2 >> 33)
+		h2 += h1
+		h2 = h2*5 + 0x38495ab5
+	}
+	d.h1, d.h2 = h1, h2
+	return p[nblocks*16:]
+}
+
+func (d *digestString) sum() (h1, h2 uint64) {
+	h1, h2 = d.h1, d.h2
+	var k1, k2 uint64
+	switch len(d.tail) & 15 {
+	case 15:
+		k2 ^= uint64(d.tail[14]) << 48
+		fallthrough
+	case 14:
+		k2 ^= uint64(d.tail[13]) << 40
+		fallthrough
+	case 13:
+		k2 ^= uint64(d.tail[12]) << 32
+		fallthrough
+	case 12:
+		k2 ^= uint64(d.tail[11]) << 24
+		fallthrough
+	case 11:
+		k2 ^= uint64(d.tail[10]) << 16
+		fallthrough
+	case 10:
+		k2 ^= uint64(d.tail[9]) << 8
+		fallthrough
+	case 9:
+		k2 ^= uint64(d.tail[8]) << 0
+		k2 *= c2
+		k2 = (k2 << 33) | (k2 >> 31)
+		k2 *= c1
+		h2 ^= k2
+		fallthrough
+	case 8:
+		k1 ^= uint64(d.tail[7]) << 56
+		fallthrough
+	case 7:
+		k1 ^= uint64(d.tail[6]) << 48
+		fallthrough
+	case 6:
+		k1 ^= uint64(d.tail[5]) << 40
+		fallthrough
+	case 5:
+		k1 ^= uint64(d.tail[4]) << 32
+		fallthrough
+	case 4:
+		k1 ^= uint64(d.tail[3]) << 24
+		fallthrough
+	case 3:
+		k1 ^= uint64(d.tail[2]) << 16
+		fallthrough
+	case 2:
+		k1 ^= uint64(d.tail[1]) << 8
+		fallthrough
+	case 1:
+		k1 ^= uint64(d.tail[0]) << 0
+		k1 *= c1
+		k1 = (k1 << 31) | (k1 >> 33)
+		k1 *= c2
+		h1 ^= k1
+	}
+	h1 ^= uint64(d.clen)
+	h2 ^= uint64(d.clen)
+	h1 += h2
+	h2 += h1
+	h1 = fmix(h1)
+	h2 = fmix(h2)
+	h1 += h2
+	h2 += h1
+	return h1, h2
+}
diff --git a/hash_test.go b/hash_test.go
@@ -26,3 +26,26 @@ func TestHash(t *testing.T) {
 		}
 	}
 }
+
+func TestHashString(t *testing.T) {
+	d := new(digestString)
+	var data = []struct {
+		h1, h2 uint64
+		s      string
+	}{
+		{0x0000000000000000, 0x0000000000000000, ""},
+		{0xcbd8a7b341bd9b02, 0x5b1e906a48ae1d19, "hello"},
+		{0x342fac623a5ebc8e, 0x4cdcbc079642414d, "hello, world"},
+		{0xb89e5988b737affc, 0x664fc2950231b2cb, "19 Jan 2038 at 3:14:07 AM"},
+		{0xcd99481f9ee902c9, 0x695da1a38987b6e7, "The quick brown fox jumps over the lazy dog."},
+	}
+	for _, x := range data {
+		h1, h2 := d.hash(x.s)
+		if h1 != x.h1 {
+			t.Errorf("hash(%q).h1 = %d; want %d\n", x.s, h1, x.h1)
+		}
+		if h2 != x.h2 {
+			t.Errorf("hash(%q).h2 = %d; want %d\n", x.s, h2, x.h2)
+		}
+	}
+}

Original file line number	Diff line number	Diff line change
`@@ -126,7 +126,7 @@ func BenchmarkTestByte(b *testing.B) {`
`126`	`126`	`}`
`127`	`127`	`}`
`128`	`128`
`129`		`-func BenchmarkTestUnion(b *testing.B) {`
	`129`	`+func BenchmarkUnion(b *testing.B) {`
`130`	`130`	`n := 1000`
`131`	`131`	`b.StopTimer()`
`132`	`132`	`f1 := New(n, 200)`