Skip to content

Commit 65b0517

Browse files
committed
Data structure
1 parent d00a280 commit 65b0517

11 files changed

Lines changed: 487 additions & 1 deletion

File tree

Readme.md

Lines changed: 49 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
* [Build a NoSQL Database From Scratch in 1000 Lines of Code](https://medium.com/better-programming/build-a-nosql-database-from-the-scratch-in-1000-lines-of-code-8ed1c15ed924)
1010
* [Writing a SQL database from scratch in Go: 1. SELECT, INSERT, CREATE and a REPL](https://notes.eatonphil.com/database-basics.html)
1111

12+
* [Database Engine Development](https://www.youtube.com/playlist?list=PLm7R-cUo29CXVu9a9TzBEwSQ9JPVGmISg)
13+
1214
* https://github.com/cmu-db/bustub
1315

1416
##
@@ -47,14 +49,60 @@
4749

4850
* [An introduction to Conflict-Free Replicated Data Types (CRDTs)](https://www.youtube.com/watch?v=gZP2VUmH05A)
4951

52+
## Data Structure
53+
54+
* [Heaps, heapsort, and priority queues - Inside code](https://www.youtube.com/watch?v=pLIajuc31qk)
55+
* [Trie data structure - Inside code](https://www.youtube.com/watch?v=qA8l8TAMyig)
56+
* [Compressed trie](https://www.youtube.com/watch?v=qakGXuOW1S8)
57+
5058
## Probabilistic Data Structures
5159

60+
* [Hello Interview : Bloom Filters, Count-Min Sketch, HyperLogLog](https://www.youtube.com/watch?v=IgyU0iFIoqM)
61+
62+
* [Probabilistic data structure lectures](https://www.youtube.com/playlist?list=PL2mpR0RYFQsAR5RyB54FyEE9vUiGtCSZM)
63+
64+
### Bloom filter
65+
66+
* [Wikipedia](https://en.wikipedia.org/wiki/Bloom_filter)
67+
* [mCoding : Bloom Filters](https://www.youtube.com/watch?v=qZNJTh2NEiU)
68+
* [Number0 : Bloom Filters](https://www.youtube.com/watch?v=eCUm4U3WDpM)
69+
* [ByteByteGo : Bloom Filters](https://www.youtube.com/watch?v=V3pzxngeLqw)
70+
* [Spanning Tree : What Are Bloom Filters?](https://www.youtube.com/watch?v=kfFacplFY4Y)
71+
* [ByteMonk : Bloom Filters](https://www.youtube.com/watch?v=GT0En1dGntY)
72+
73+
A Bloom filter is a space-efficient probabilistic data structure that is used to test whether an element is a member of a set. False positive matches are possible, but false negatives are not - in other words, a query returns either "possibly in set" or "definitely not in set". Elements can be added to the set, but not removed.
74+
75+
A Bloom filter is a representation of a set of _n_ items, where the main requirement is to make membership queries; _i.e._, whether an item is a member of a set.
76+
77+
#### Uses
78+
##### Cache filtering
79+
Content delivery networks deploy web caches around the world to cache and serve web content to users with greater performance and reliability. A key application of Bloom filters is their use in efficiently determining which web objects to store in these web caches. To prevent caching one-hit-wonders, a Bloom filter is used to keep track of all URLs that are accessed by users.
80+
##### Web Crawler
81+
82+
### HyperLogLog
83+
5284
* [PapersWeLove : HyperLogLog](https://www.youtube.com/watch?v=y3fTaxA8PkU)
53-
* [A problem so hard even Google relies on Random Chance](https://www.youtube.com/watch?v=lJYufx0bfpw)
5485
* [The Algorithm with the Best Name - HyperLogLog Explained](https://www.youtube.com/watch?v=2PlrMCiUN_s)
86+
* [A problem so hard even Google relies on Random Chance](https://www.youtube.com/watch?v=lJYufx0bfpw)
87+
* [Counting BILLIONS with Just Kilobytes](https://www.youtube.com/watch?v=f69hh3KgFEk)
88+
* https://github.com/tylertreat/BoomFilters/blob/master/hyperloglog.go
5589

90+
### Count–min sketch
91+
92+
* [Wikipedia](https://en.wikipedia.org/wiki/Count%E2%80%93min_sketch)
93+
* https://github.com/tylertreat/BoomFilters/blob/master/countmin.go
5694
* [Count-min Sketch](https://www.youtube.com/watch?v=Okdjn7o4q8E)
5795

96+
The goal of the basic version of the count–min sketch is to consume a stream of events, one at a time, and count the frequency of the different types of events in the stream.
97+
98+
### HeavyKeeper TopK
99+
100+
* [Understanding Probabilistic Data Structures](https://www.youtube.com/watch?v=2Dzc7fxA0us)
101+
102+
### T Digest
103+
104+
* [Sketching Data with T Digest](https://www.youtube.com/watch?v=ETUYhEZRtWE)
105+
58106
## Cache
59107

60108
* [TinyLFU: A Highly Efficient Cache Admission Policy](https://dgraph.io/blog/refs/TinyLFU%20-%20A%20Highly%20Efficient%20Cache%20Admission%20Policy.pdf)

datastructure/bloomfilter.go

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
package datastructure
2+
3+
import (
4+
"errors"
5+
6+
"github.com/zeebo/xxh3"
7+
)
8+
9+
type BloomFilter struct {
10+
bitset []byte
11+
hasher []*xxh3.Hasher
12+
m uint64
13+
k uint64
14+
}
15+
16+
func (b *BloomFilter) setBit(pos uint64) {
17+
b.bitset[pos/8] |= 1 << (pos % 8)
18+
}
19+
20+
func (b *BloomFilter) getBit(pos uint64) bool {
21+
return b.bitset[pos/8]&(1<<(pos%8)) != 0
22+
}
23+
24+
func (b *BloomFilter) Add(data []byte) {
25+
for i := 0; i < int(b.k); i++ {
26+
h := b.hasher[i]
27+
h.Reset()
28+
h.Write(data)
29+
hash := h.Sum64()
30+
b.setBit(hash % b.m)
31+
}
32+
}
33+
34+
func (b *BloomFilter) Check(data []byte) bool {
35+
for i := 0; i < int(b.k); i++ {
36+
h := b.hasher[i]
37+
h.Reset()
38+
h.Write(data)
39+
hash := h.Sum64()
40+
if !b.getBit(hash % b.m) {
41+
return false
42+
}
43+
}
44+
return true
45+
}
46+
47+
func (b *BloomFilter) Union(a *BloomFilter) (err error) {
48+
if b.m != a.m {
49+
return errors.New("the bloom filters have the different sizes")
50+
}
51+
52+
if b.k != a.k {
53+
return errors.New("the bloom filters have the different number of hash functions")
54+
}
55+
56+
for i := uint64(0); i < b.m; i++ {
57+
if b.getBit(i) || a.getBit(i) {
58+
b.setBit(i)
59+
}
60+
}
61+
62+
return nil
63+
}
64+
65+
func NewBloomFilter(m uint64, k uint64) BloomFilter {
66+
hasher := make([]*xxh3.Hasher, k)
67+
for i := range hasher {
68+
h := xxh3.NewSeed(uint64(i))
69+
hasher[i] = h
70+
}
71+
return BloomFilter{
72+
bitset: make([]byte, (m+7)/8),
73+
hasher: hasher,
74+
m: m,
75+
k: k,
76+
}
77+
}

datastructure/bloomfilter_test.go

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
package datastructure
2+
3+
import (
4+
"testing"
5+
)
6+
7+
func TestBloomFilter(t *testing.T) {
8+
bf := NewBloomFilter(1024, 3) // 1024 bits, 3 hash functions
9+
10+
wordsToAdd := [][]byte{
11+
[]byte("hello"),
12+
[]byte("world"),
13+
[]byte("golang"),
14+
}
15+
16+
wordsNotAdded := [][]byte{
17+
[]byte("python"),
18+
[]byte("java"),
19+
[]byte("rust"),
20+
}
21+
22+
// Add words
23+
for _, word := range wordsToAdd {
24+
bf.Add(word)
25+
}
26+
27+
// Check words that were added
28+
for _, word := range wordsToAdd {
29+
if !bf.Check(word) {
30+
t.Errorf("Expected word %q to be found in BloomFilter", word)
31+
}
32+
}
33+
34+
// Check words that were NOT added
35+
falsePositiveCount := 0
36+
for _, word := range wordsNotAdded {
37+
if bf.Check(word) {
38+
falsePositiveCount++
39+
}
40+
}
41+
42+
// Allow a small false positive rate
43+
if falsePositiveCount > 1 {
44+
t.Errorf("Too many false positives: got %d, want <= 1", falsePositiveCount)
45+
}
46+
}
47+
48+
func BenchmarkBloomFilterAdd(b *testing.B) {
49+
bf := NewBloomFilter(1<<20, 5) // 1 million bits, 5 hash functions
50+
data := []byte("benchmark-data")
51+
52+
b.ResetTimer()
53+
for i := 0; i < b.N; i++ {
54+
bf.Add(data)
55+
}
56+
}
57+
58+
func BenchmarkBloomFilterCheck(b *testing.B) {
59+
bf := NewBloomFilter(1<<20, 5)
60+
data := []byte("benchmark-data")
61+
bf.Add(data)
62+
63+
b.ResetTimer()
64+
for i := 0; i < b.N; i++ {
65+
bf.Check(data)
66+
}
67+
}

datastructure/countminsketch.go

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
package datastructure
2+
3+
import (
4+
"errors"
5+
6+
"github.com/zeebo/xxh3"
7+
)
8+
9+
type CountMinSketch struct {
10+
freq [][]uint64
11+
hasher []*xxh3.Hasher
12+
m uint64
13+
k uint64
14+
count uint64
15+
}
16+
17+
func NewCountMinSketch(m, k uint64) CountMinSketch {
18+
hasher := make([]*xxh3.Hasher, k)
19+
for i := range hasher {
20+
h := xxh3.NewSeed(uint64(i))
21+
hasher[i] = h
22+
}
23+
24+
freq := make([][]uint64, k)
25+
for i := uint64(0); i < k; i++ {
26+
freq[i] = make([]uint64, m) // each row is size m
27+
}
28+
29+
return CountMinSketch{
30+
freq: freq,
31+
hasher: hasher,
32+
m: m,
33+
k: k,
34+
}
35+
}
36+
37+
func (c *CountMinSketch) Add(data []byte) {
38+
for i := uint64(0); i < c.k; i++ {
39+
pos := xxh3.HashSeed(data, i) % c.m
40+
c.freq[i][pos]++
41+
}
42+
c.count++
43+
}
44+
45+
func (c *CountMinSketch) Count(data []byte) uint64 {
46+
min := uint64(^uint64(0)) // max uint64
47+
for i := uint64(0); i < c.k; i++ {
48+
pos := xxh3.HashSeed(data, i) % c.m
49+
count := c.freq[i][pos]
50+
if count < min {
51+
min = count
52+
}
53+
}
54+
return min
55+
}
56+
57+
// Merge combines this CountMinSketch with another. Returns an error if the
58+
// matrix width and depth are not equal.
59+
func (c *CountMinSketch) Merge(other *CountMinSketch) error {
60+
if c.k != other.k {
61+
return errors.New("matrix depth must match")
62+
}
63+
64+
if c.m != other.m {
65+
return errors.New("matrix width must match")
66+
}
67+
68+
for i := uint64(0); i < c.k; i++ {
69+
for j := uint64(0); j < c.m; j++ {
70+
c.freq[i][j] += other.freq[i][j]
71+
}
72+
}
73+
74+
c.count += other.count
75+
return nil
76+
}

datastructure/go.mod

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
module github.com/codeharik/datastructure
2+
3+
go 1.24.0
4+
5+
require (
6+
github.com/klauspost/cpuid/v2 v2.0.9 // indirect
7+
github.com/zeebo/xxh3 v1.0.2
8+
)

datastructure/go.sum

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
github.com/klauspost/cpuid/v2 v2.0.9 h1:lgaqFMSdTdQYdZ04uHyN2d/eKdOMyi2YLSvlQIBFYa4=
2+
github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
3+
github.com/zeebo/xxh3 v1.0.2 h1:xZmwmqxHZA8AI603jOQ0tMqmBr9lPeFwGg6d+xy9DC0=
4+
github.com/zeebo/xxh3 v1.0.2/go.mod h1:5NWz9Sef7zIDm2JHfFlcQvNekmcEl9ekUZQQKCYaDcA=

0 commit comments

Comments
 (0)