1package junk
2
3import (
4 "errors"
5 "os"
6
7 "golang.org/x/crypto/blake2b"
8)
9
10// see https://en.wikipedia.org/wiki/Bloom_filter
11
12var errWidth = errors.New("k and width wider than 256 bits and width not more than 32")
13var errPowerOfTwo = errors.New("data not a power of two")
14
15// Bloom is a bloom filter.
16type Bloom struct {
17 data []byte
18 k int // Number of bits we store/lookup in the bloom filter per value.
19 w int // Number of bits needed to address a single bit position.
20 modified bool
21}
22
23func bloomWidth(fileSize int) int {
24 w := 0
25 for bits := uint32(fileSize * 8); bits > 1; bits >>= 1 {
26 w++
27 }
28 return w
29}
30
31// BloomValid returns an error if the bloom file parameters are not correct.
32func BloomValid(fileSize int, k int) error {
33 _, err := bloomValid(fileSize, k)
34 return err
35}
36
37func bloomValid(fileSize, k int) (int, error) {
38 w := bloomWidth(fileSize)
39 if 1<<w != fileSize*8 {
40 return 0, errPowerOfTwo
41 }
42 if k*w > 256 || w > 32 {
43 return 0, errWidth
44 }
45 return w, nil
46}
47
48// NewBloom returns a bloom filter with given initial data.
49//
50// The number of bits in data must be a power of 2.
51// K is the number of "hashes" (bits) to store/lookup for each value stored.
52// Width is calculated as the number of bits needed to represent a single bit/hash
53// position in the data.
54//
55// For each value stored/looked up, a hash over the value is calculated. The hash
56// is split into "k" values that are "width" bits wide, each used to lookup a bit.
57// K * width must not exceed 256.
58func NewBloom(data []byte, k int) (*Bloom, error) {
59 w, err := bloomValid(len(data), k)
60 if err != nil {
61 return nil, err
62 }
63
64 return &Bloom{
65 data: data,
66 k: k,
67 w: w,
68 }, nil
69}
70
71func (b *Bloom) Add(s string) {
72 h := hash([]byte(s), b.w)
73 for i := 0; i < b.k; i++ {
74 b.set(h.nextPos())
75 }
76}
77
78func (b *Bloom) Has(s string) bool {
79 h := hash([]byte(s), b.w)
80 for i := 0; i < b.k; i++ {
81 if !b.has(h.nextPos()) {
82 return false
83 }
84 }
85 return true
86}
87
88func (b *Bloom) Bytes() []byte {
89 return b.data
90}
91
92func (b *Bloom) Modified() bool {
93 return b.modified
94}
95
96// Ones returns the number of ones.
97func (b *Bloom) Ones() (n int) {
98 for _, d := range b.data {
99 for i := 0; i < 8; i++ {
100 if d&1 != 0 {
101 n++
102 }
103 d >>= 1
104 }
105 }
106 return n
107}
108
109func (b *Bloom) Write(path string) error {
110 f, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY, 0660)
111 if err != nil {
112 return err
113 }
114 if _, err := f.Write(b.data); err != nil {
115 f.Close()
116 return err
117 }
118 if err := f.Close(); err != nil {
119 return err
120 }
121 b.modified = false
122 return nil
123}
124
125func (b *Bloom) has(p int) bool {
126 v := b.data[p>>3] >> (7 - (p & 7))
127 return v&1 != 0
128}
129
130func (b *Bloom) set(p int) {
131 by := p >> 3
132 bi := p & 0x7
133 var v byte = 1 << (7 - bi)
134 if b.data[by]&v == 0 {
135 b.data[by] |= v
136 b.modified = true
137 }
138}
139
140type bits struct {
141 width int // Number of bits for each position.
142 buf []byte // Remaining bytes to use for next position.
143 cur uint64 // Bits to read next position from. Replenished from buf.
144 ncur int // Number of bits available in cur. We consume the highest bits first.
145}
146
147func hash(v []byte, width int) *bits {
148 buf := blake2b.Sum256(v)
149 return &bits{width: width, buf: buf[:]}
150}
151
152// nextPos returns the next bit position.
153func (b *bits) nextPos() (v int) {
154 if b.width > b.ncur {
155 for len(b.buf) > 0 && b.ncur < 64-8 {
156 b.cur <<= 8
157 b.cur |= uint64(b.buf[0])
158 b.ncur += 8
159 b.buf = b.buf[1:]
160 }
161 }
162 v = int((b.cur >> (b.ncur - b.width)) & ((1 << b.width) - 1))
163 b.ncur -= b.width
164 return v
165}
166