1package junk
2
3import (
4 "errors"
5 "os"
6
7 "golang.org/x/crypto/blake2b"
8
9 "github.com/mjl-/mox/mlog"
10)
11
12// see https://en.wikipedia.org/wiki/Bloom_filter
13
14var errWidth = errors.New("k and width wider than 256 bits and width not more than 32")
15var errPowerOfTwo = errors.New("data not a power of two")
16
17// Bloom is a bloom filter.
18type Bloom struct {
19 data []byte
20 k int // Number of bits we store/lookup in the bloom filter per value.
21 w int // Number of bits needed to address a single bit position.
22 modified bool
23
24 log mlog.Log // For cid logging.
25}
26
27func bloomWidth(fileSize int) int {
28 w := 0
29 for bits := uint32(fileSize * 8); bits > 1; bits >>= 1 {
30 w++
31 }
32 return w
33}
34
35// BloomValid returns an error if the bloom file parameters are not correct.
36func BloomValid(fileSize int, k int) error {
37 _, err := bloomValid(fileSize, k)
38 return err
39}
40
41func bloomValid(fileSize, k int) (int, error) {
42 w := bloomWidth(fileSize)
43 if 1<<w != fileSize*8 {
44 return 0, errPowerOfTwo
45 }
46 if k*w > 256 || w > 32 {
47 return 0, errWidth
48 }
49 return w, nil
50}
51
52// NewBloom returns a bloom filter with given initial data.
53//
54// The number of bits in data must be a power of 2.
55// K is the number of "hashes" (bits) to store/lookup for each value stored.
56// Width is calculated as the number of bits needed to represent a single bit/hash
57// position in the data.
58//
59// For each value stored/looked up, a hash over the value is calculated. The hash
60// is split into "k" values that are "width" bits wide, each used to lookup a bit.
61// K * width must not exceed 256.
62func NewBloom(log mlog.Log, data []byte, k int) (*Bloom, error) {
63 w, err := bloomValid(len(data), k)
64 if err != nil {
65 return nil, err
66 }
67
68 return &Bloom{
69 data: data,
70 k: k,
71 w: w,
72 log: log,
73 }, nil
74}
75
76func (b *Bloom) Add(s string) {
77 h := hash([]byte(s), b.w)
78 for range b.k {
79 b.set(h.nextPos())
80 }
81}
82
83func (b *Bloom) Has(s string) bool {
84 h := hash([]byte(s), b.w)
85 for range b.k {
86 if !b.has(h.nextPos()) {
87 return false
88 }
89 }
90 return true
91}
92
93func (b *Bloom) Bytes() []byte {
94 return b.data
95}
96
97func (b *Bloom) Modified() bool {
98 return b.modified
99}
100
101// Ones returns the number of ones.
102func (b *Bloom) Ones() (n int) {
103 for _, d := range b.data {
104 for range 8 {
105 if d&1 != 0 {
106 n++
107 }
108 d >>= 1
109 }
110 }
111 return n
112}
113
114func (b *Bloom) Write(path string) error {
115 f, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY, 0660)
116 if err != nil {
117 return err
118 }
119 if _, err := f.Write(b.data); err != nil {
120 xerr := f.Close()
121 b.log.Check(xerr, "closing bloom file after write failed")
122 return err
123 }
124 if err := f.Close(); err != nil {
125 return err
126 }
127 b.modified = false
128 return nil
129}
130
131func (b *Bloom) has(p int) bool {
132 v := b.data[p>>3] >> (7 - (p & 7))
133 return v&1 != 0
134}
135
136func (b *Bloom) set(p int) {
137 by := p >> 3
138 bi := p & 0x7
139 var v byte = 1 << (7 - bi)
140 if b.data[by]&v == 0 {
141 b.data[by] |= v
142 b.modified = true
143 }
144}
145
146type bits struct {
147 width int // Number of bits for each position.
148 buf []byte // Remaining bytes to use for next position.
149 cur uint64 // Bits to read next position from. Replenished from buf.
150 ncur int // Number of bits available in cur. We consume the highest bits first.
151}
152
153func hash(v []byte, width int) *bits {
154 buf := blake2b.Sum256(v)
155 return &bits{width: width, buf: buf[:]}
156}
157
158// nextPos returns the next bit position.
159func (b *bits) nextPos() (v int) {
160 if b.width > b.ncur {
161 for len(b.buf) > 0 && b.ncur < 64-8 {
162 b.cur <<= 8
163 b.cur |= uint64(b.buf[0])
164 b.ncur += 8
165 b.buf = b.buf[1:]
166 }
167 }
168 v = int((b.cur >> (b.ncur - b.width)) & ((1 << b.width) - 1))
169 b.ncur -= b.width
170 return v
171}
172