package junk

// see https://en.wikipedia.org/wiki/Naive_Bayes_spam_filtering
// - todo: better html parsing?
// - todo: try reading text in pdf?
// - todo: try to detect language, have words per language? can be in the same dictionary. currently my dictionary is biased towards treating english as spam.

import (
	"bufio"
	"fmt"
	"io"
	"os"
	"strings"
	"unicode"

	"golang.org/x/net/html"

	"github.com/mjl-/mox/message"
)

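// tokenizeMail opens the message file at path, parses it and returns the set of
// words for use with the bayesian filter. The bool return indicates whether the
// message file could be opened and read.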
func (f *Filter) tokenizeMail(path string) (bool, map[string]struct{}, error) {
	mf, err := os.Open(path)
	if err != nil {
		return false, nil, err
	}
	defer func() {
		err := mf.Close()
		f.log.Check(err, "closing message file")
	}()
	fi, err := mf.Stat()
	if err != nil {
		return false, nil, err
	}
	// Parse errors are ignored, we tokenize whatever part of the message could be parsed.
	p, _ := message.EnsurePart(f.log.Logger, false, mf, fi.Size())
	words, err := f.ParseMessage(p)
	return true, words, err
}

// ParseMessage parses the mail message in p and returns the set of words it
// contains: words from interesting headers, prefixed with the header name and
// ":", and words from the text and html parts.
func (f *Filter) ParseMessage(p message.Part) (map[string]struct{}, error) {
	metaWords := map[string]struct{}{}
	textWords := map[string]struct{}{}
	htmlWords := map[string]struct{}{}

	hdrs, err := p.Header()
	if err != nil {
		return nil, fmt.Errorf("parsing headers: %v", err)
	}

	// Add words from the header, annotated with <field>+":".
	// todo: add whether header is dkim-verified?
	for k, l := range hdrs {
		for _, h := range l {
			switch k {
			case "From", "To", "Cc", "Bcc", "Reply-To", "Subject", "Sender", "Return-Path":
			// case "Subject", "To":
			default:
				continue
			}
			words := map[string]struct{}{}
			f.tokenizeText(strings.NewReader(h), words)
			for w := range words {
				if len(w) <= 3 {
					continue
				}
				metaWords[k+":"+w] = struct{}{}
			}
		}
	}

	if err := f.mailParse(p, metaWords, textWords, htmlWords); err != nil {
		return nil, fmt.Errorf("parsing message: %w", err)
	}

	for w := range metaWords {
		textWords[w] = struct{}{}
	}
	for w := range htmlWords {
		textWords[w] = struct{}{}
	}

	return textWords, nil
}

// mailParse walks through the message parts, and tokenizes the words of each
// text and html part it encounters.
func (f *Filter) mailParse(p message.Part, metaWords, textWords, htmlWords map[string]struct{}) error {
	ct := p.MediaType + "/" + p.MediaSubType

	if ct == "TEXT/HTML" {
		err := f.tokenizeHTML(p.ReaderUTF8OrBinary(), metaWords, htmlWords)
		// log.Printf("html parsed, words %v", htmlWords)
		return err
	}
	if ct == "" || strings.HasPrefix(ct, "TEXT/") {
		err := f.tokenizeText(p.ReaderUTF8OrBinary(), textWords)
		// log.Printf("text parsed, words %v", textWords)
		return err
	}
	if p.Message != nil {
		// Nested message, happens for forwarding.
		if err := p.SetMessageReaderAt(); err != nil {
			return fmt.Errorf("setting reader on nested message: %w", err)
		}
		return f.mailParse(*p.Message, metaWords, textWords, htmlWords)
	}
	for _, sp := range p.Parts {
		if err := f.mailParse(sp, metaWords, textWords, htmlWords); err != nil {
			return err
		}
	}
	return nil
}

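// looksRandom reports whether s looks like a randomly generated string, e.g. a
// tracking or message id, based on long stretches of consonant-like ASCII
// characters. Such words are not useful to learn.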
func looksRandom(s string) bool {
	// Random strings, eg 2fvu9stm9yxhnlu. ASCII only, and many consonants in a stretch.
	stretch := 0
	const consonants = "bcdfghjklmnpqrstvwxyzBCDFGHJKLMNPQRSTVWXYZ23456789" // 0 and 1 may be used as o and l/i
	stretches := 0
	for _, c := range s {
		if c >= 0x80 {
			return false
		}
		if strings.ContainsRune(consonants, c) {
			stretch++
			continue
		}
		if stretch >= 6 {
			stretches++
		}
		stretch = 0
	}
	if stretch >= 6 {
		stretches++
	}
	return stretches > 0
}

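// looksNumeric reports whether s is mostly numeric, e.g. a date, size or
// (hexadecimal) id, based on the longest stretch of consecutive digits.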
func looksNumeric(s string) bool {
	s = strings.TrimPrefix(s, "0x") // Hexadecimal.
	var digits, hex, other, digitstretch, maxdigitstretch int
	for _, c := range s {
		if c >= '0' && c <= '9' {
			digits++
			digitstretch++
			continue
		} else if c >= 'a' && c <= 'f' || c >= 'A' && c <= 'F' {
			hex++
		} else {
			other++
		}
		if digitstretch > maxdigitstretch {
			maxdigitstretch = digitstretch
		}
		digitstretch = 0
	}
	if digitstretch > maxdigitstretch {
		maxdigitstretch = digitstretch
	}
	return maxdigitstretch >= 4 || other == 0 && maxdigitstretch >= 3
}

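// tokenizeText splits the text from r into lower-case words, dropping too-short,
// all-numeric and random-looking words, and adds the words, and word pairs and
// triplets if so configured, to the words map.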
func (f *Filter) tokenizeText(r io.Reader, words map[string]struct{}) error {
	b := &strings.Builder{}
	var prev string
	var prev2 string

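	// add adds the word collected in b to the words map, unless it is too short,
	// all digits, or looks random or numeric. Word pairs and triplets are added
	// when twograms/threegrams are enabled.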
	add := func() {
		defer b.Reset()
		if b.Len() <= 2 {
			return
		}

		s := b.String()
		s = strings.Trim(s, "'")
		var nondigit bool
		for _, c := range s {
			if !unicode.IsDigit(c) {
				nondigit = true
				break
			}
		}

		if !(nondigit && len(s) > 2) {
			return
		}

		if looksRandom(s) {
			return
		}
		if looksNumeric(s) {
			return
		}

		// todo: do something for URLs, parse them? keep their domain only?

		if f.Threegrams && prev2 != "" && prev != "" {
			words[prev2+" "+prev+" "+s] = struct{}{}
		}
		if f.Twograms && prev != "" {
			words[prev+" "+s] = struct{}{}
		}
		if f.Onegrams {
			words[s] = struct{}{}
		}
		prev2 = prev
		prev = s
	}

	br := bufio.NewReader(r)

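	// peekLetter reports whether the next rune is a letter, without consuming it.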
	peekLetter := func() bool {
		c, _, err := br.ReadRune()
		br.UnreadRune()
		return err == nil && unicode.IsLetter(c)
	}

	for {
		c, _, err := br.ReadRune()
		if err == io.EOF {
			break
		}
		if err != nil {
			return err
		}
		// Non-letter, non-digit characters break a word, except an apostrophe
		// that is followed by a letter, keeping "don't" as one word. Surrounding
		// apostrophes are trimmed in add.
		if !unicode.IsLetter(c) && !unicode.IsDigit(c) && (c != '\'' || b.Len() > 0 && !peekLetter()) {
			add()
		} else {
			b.WriteRune(unicode.ToLower(c))
		}
	}
	add()
	return nil
}

// tokenizeHTML parses html, and tokenizes its text into words.
func (f *Filter) tokenizeHTML(r io.Reader, meta, words map[string]struct{}) error {
	htmlReader := &htmlTextReader{
		t:    html.NewTokenizer(r),
		meta: map[string]struct{}{},
	}
	return f.tokenizeText(htmlReader, words)
}

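// htmlTextReader is an io.Reader that returns the text from an html document,
// skipping the contents of script, style and svg elements, and including the
// alt text of img elements.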
type htmlTextReader struct {
	t        *html.Tokenizer
	meta     map[string]struct{}
	tagStack []string
	buf      []byte
	err      error
}

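// Read implements io.Reader, returning the next chunk of text from the html
// document. Text that does not fit in buf is kept for the next call.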
func (r *htmlTextReader) Read(buf []byte) (n int, err error) {
	// todo: deal with invalid html better. the tokenizer is just tokenizing, we need to fix up the nesting etc. eg, rules say some elements close certain open elements.
	// todo: deal with inline elements? they shouldn't cause a word break.

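	// give copies as much of nbuf into buf as fits, and stores the remainder in
	// r.buf for the next call to Read.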
	give := func(nbuf []byte) (int, error) {
		n := len(buf)
		if n > len(nbuf) {
			n = len(nbuf)
		}
		copy(buf, nbuf[:n])
		nbuf = nbuf[n:]
		if len(nbuf) < cap(r.buf) {
			r.buf = r.buf[:len(nbuf)]
		} else {
			r.buf = make([]byte, len(nbuf), 3*len(nbuf)/2)
		}
		copy(r.buf, nbuf)
		return n, nil
	}

	if len(r.buf) > 0 {
		return give(r.buf)
	}
	if r.err != nil {
		return 0, r.err
	}

	for {
		switch r.t.Next() {
		case html.ErrorToken:
			r.err = r.t.Err()
			return 0, r.err
		case html.TextToken:
			if len(r.tagStack) > 0 {
				switch r.tagStack[len(r.tagStack)-1] {
				case "script", "style", "svg":
					continue
				}
			}
			text := r.t.Text()
			if len(text) > 0 {
				return give(text)
			}
		case html.StartTagToken:
			tagBuf, moreAttr := r.t.TagName()
			tag := string(tagBuf)
			//log.Printf("tag %q %v", tag, r.tagStack)

			if tag == "img" && moreAttr {
				var key, val []byte
				for moreAttr {
					key, val, moreAttr = r.t.TagAttr()
					if string(key) == "alt" && len(val) > 0 {
						return give(val)
					}
				}
			}

			// Empty elements, https://developer.mozilla.org/en-US/docs/Glossary/Empty_element
			switch tag {
			case "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param", "source", "track", "wbr":
				continue
			}

			r.tagStack = append(r.tagStack, tag)
		case html.EndTagToken:
			// log.Printf("tag pop %v", r.tagStack)
			if len(r.tagStack) > 0 {
				r.tagStack = r.tagStack[:len(r.tagStack)-1]
			}
		case html.SelfClosingTagToken:
		case html.CommentToken:
		case html.DoctypeToken:
		}
	}
}