10 "github.com/mjl-/mox/message"
11 "github.com/mjl-/mox/mlog"
14// WordSearch holds context for a search, with scratch buffers to prevent
15// allocations for each message.
16type WordSearch struct {
// words and notWords hold the search terms as byte slices; PrepareWordSearch
// stores them lower-cased.
17 words, notWords [][]byte
// searchBuf is the buffer content is read into and searched. keepBuf receives
// a copy of the tail of searchBuf, which is later copied back to the start of
// searchBuf before more content is read.
18 searchBuf, keepBuf []byte
21// PrepareWordSearch returns a search context that can be used to match multiple
22// messages (after each other, not concurrently).
//
// Both words and notWords are lower-cased with strings.ToLower before being
// stored. The returned WordSearch also carries two freshly allocated scratch
// buffers: keepBuf of keep bytes and searchBuf of bufSize bytes.
23func PrepareWordSearch(words, notWords []string) WordSearch {
// Lower-case each term once, up front, and keep it as a byte slice.
25 for _, w := range words {
26 wl = append(wl, []byte(strings.ToLower(w)))
28 for _, w := range notWords {
29 nwl = append(nwl, []byte(strings.ToLower(w)))
33 for _, w := range words {
38 for _, w := range notWords {
43 keep += 6 // Max utf-8 character size.
// Loop for as long as bufSize is less than 8 times keep.
46 for bufSize/keep < 8 {
50 keepBuf := make([]byte, keep)
51 searchBuf := make([]byte, bufSize)
// Positional literal, in field order: words, notWords, searchBuf, keepBuf.
53 return WordSearch{wl, nwl, searchBuf, keepBuf}
56// MatchPart returns whether the part/mail message p matches the search.
57// The search terms are matched against content-transfer-decoded and
58// charset-decoded bodies and optionally headers.
59// HTML parts are currently treated as regular text, without parsing HTML.
//
// The actual search is delegated to matchPart, with a fresh "seen" map for
// this call.
60func (ws WordSearch) MatchPart(log mlog.Log, p *message.Part, headerToo bool) (bool, error) {
// seen is looked up by index into ws.words during the search.
61 seen := map[int]bool{}
62 miss, err := ws.matchPart(log, p, headerToo, seen)
// A match requires: no error, no miss, and as many entries in seen as there
// are words to find.
63 match := err == nil && !miss && len(seen) == len(ws.words)
67// isQuickHit reports whether all words are seen and there are no not-words that
68// force us to search till the end. If so, we know we have a match.
69func (ws WordSearch) isQuickHit(seen map[int]bool) bool {
70 return len(seen) == len(ws.words) && len(ws.notWords) == 0
73// matchPart searches a part as text and/or its subparts, recursively. Once we
74// know we have a miss, we stop (either due to not-word match or error). In case
75// of non-miss, the caller checks if there was a hit.
//
// The same seen map is handed to every searchReader call and to each recursive
// matchPart call.
76func (ws WordSearch) matchPart(log mlog.Log, p *message.Part, headerToo bool, seen map[int]bool) (miss bool, rerr error) {
// Search the header of this part.
78 miss, err := ws.searchReader(log, p.HeaderReader(), seen)
79 if miss || err != nil || ws.isQuickHit(seen) {
// Leaf part: no subparts.
84 if len(p.Parts) == 0 {
85 if p.MediaType != "TEXT" {
86 // todo: for other types we could try to find a library for parsing and search in there too.
89 tp := p.ReaderUTF8OrBinary()
90 // todo: for html and perhaps other types, we could try to parse as text and filter on the text.
91 miss, err := ws.searchReader(log, tp, seen)
92 if miss || err != nil || ws.isQuickHit(seen) {
// Visit each subpart in turn.
96 for _, pp := range p.Parts {
// A subpart with a non-nil Message gets SetMessageReaderAt called on it first.
97 if pp.Message != nil {
98 if err := pp.SetMessageReaderAt(); err != nil {
// Recurse, passing the address of the loop variable pp.
103 miss, err := ws.matchPart(log, &pp, headerToo, seen)
104 if miss || err != nil || ws.isQuickHit(seen) {
// searchReader reads from r into ws.searchBuf, lower-cases what was read with
// toLower, and looks for ws.words (skipping those already marked in seen) and
// ws.notWords using bytes.Contains. It works on the scratch buffers held in ws.
111func (ws WordSearch) searchReader(log mlog.Log, r io.Reader, seen map[int]bool) (miss bool, rerr error) {
112 // We will be reading through the content, stopping as soon as we know an answer:
113 // when all words have been seen and there are no "not words" (true), or one "not
114 // word" has been seen (false). We use bytes.Contains to look for the words. We
115 // advance our buffer in largish chunks, keeping the end of the buffer the size of
116 // the largest word plus the max of a utf-8 character to account for words
// Fill the buffer after the first "have" bytes. io.ReadFull returns io.EOF or
// io.ErrUnexpectedEOF when the reader ends before the buffer is full.
121 n, err := io.ReadFull(r, ws.searchBuf[have:])
// Any error other than the two end-of-input errors is handled here.
125 if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
// Save the last len(ws.keepBuf) bytes of the buffer. This happens before the
// toLower call below, so the saved bytes are not lower-cased yet.
129 copy(ws.keepBuf, ws.searchBuf[have-len(ws.keepBuf):])
132 lower := toLower(ws.searchBuf[:have])
// Only look for words that have not been seen yet.
134 for i, w := range ws.words {
135 if !seen[i] && bytes.Contains(lower, w) {
// Same condition as isQuickHit.
137 if len(seen) == len(ws.words) && len(ws.notWords) == 0 {
142 for _, w := range ws.notWords {
143 if bytes.Contains(lower, w) {
148 // Must be EOF or UnexpectedEOF now.
// Move the saved tail to the front of the buffer; the next read continues
// right after it.
151 copy(ws.searchBuf, ws.keepBuf)
152 have = len(ws.keepBuf)
157// toLower does in-place lower-casing, only allocating a new slice when lower-case
158// would become larger. We replace RuneError (0xfffd) by byte value 0, because it
159// would often increase size, but we assume no one wants to match it.
160func toLower(buf []byte) []byte {
// i is advanced inside the loop body, not in the for clause.
163 for i := 0; i < len(buf); {
// ASCII upper-case letter.
166 if b >= 'A' && b <= 'Z' {
// Decode the rune starting at i; size is its encoded length in bytes.
173 c, size := utf8.DecodeRune(buf[i:])
175 nc := unicode.ToLower(c)
179 if c == utf8.RuneError {
// Number of bytes needed to encode the lower-cased rune.
183 nsize := utf8.RuneLen(nc)
184 // Take care not to overwrite the part of the buffer we still have to process.
185 if !copied && len(r)+nsize > i {
186 // eg Ⱥ 0x23a (2 bytes) to ⱥ 0x2c65 (3 bytes)
// New slice with the current output length, and capacity for this rune plus
// len(buf)-i further bytes.
188 nr := make([]byte, len(r), len(r)+nsize+len(buf)-i)
192 r = utf8.AppendRune(r, nc)