1package junk

3// see https://en.wikipedia.org/wiki/Naive_Bayes_spam_filtering

4// - todo: better html parsing?

5// - todo: try reading text in pdf?

6// - todo: try to detect language, have words per language? can be in the same dictionary. currently my dictionary is biased towards treating english as spam.

8import (

9 "bufio"

10 "fmt"

11 "io"

12 "os"

13 "strings"

14 "unicode"

16 "golang.org/x/net/html"

18 "github.com/mjl-/mox/message"

19)

21func (f *Filter) tokenizeMail(path string) (bool, map[string]struct{}, error) {

22 mf, err := os.Open(path)

23 if err != nil {

24 return false, nil, err

25 }

26 defer func() {

27 err := mf.Close()

28 f.log.Check(err, "closing message file")

29 }()

30 fi, err := mf.Stat()

31 if err != nil {

32 return false, nil, err

33 }

34 p, _ := message.EnsurePart(f.log.Logger, false, mf, fi.Size())

35 words, err := f.ParseMessage(p)

36 return true, words, err

37}

39// ParseMessage reads a mail and returns a map with words.

40func (f *Filter) ParseMessage(p message.Part) (map[string]struct{}, error) {

41 metaWords := map[string]struct{}{}

42 textWords := map[string]struct{}{}

43 htmlWords := map[string]struct{}{}

45 hdrs, err := p.Header()

46 if err != nil {

47 return nil, fmt.Errorf("parsing headers: %v", err)

48 }

50 // Add words from the header, annotated with <field>+":".

51 // todo: add whether header is dkim-verified?

52 for k, l := range hdrs {

53 for _, h := range l {

54 switch k {

55 case "From", "To", "Cc", "Bcc", "Reply-To", "Subject", "Sender", "Return-Path":

56 // case "Subject", "To":

57 default:

58 continue

59 }

60 words := map[string]struct{}{}

61 f.tokenizeText(strings.NewReader(h), words)

62 for w := range words {

63 if len(w) <= 3 {

64 continue

65 }

66 metaWords[k+":"+w] = struct{}{}

67 }

68 }

69 }

71 if err := f.mailParse(p, metaWords, textWords, htmlWords); err != nil {

72 return nil, fmt.Errorf("parsing message: %w", err)

73 }

75 for w := range metaWords {

76 textWords[w] = struct{}{}

77 }

78 for w := range htmlWords {

79 textWords[w] = struct{}{}

80 }

82 return textWords, nil

83}

85// mailParse looks through the mail for the first text and html parts, and tokenizes their words.

86func (f *Filter) mailParse(p message.Part, metaWords, textWords, htmlWords map[string]struct{}) error {

87 ct := p.MediaType + "/" + p.MediaSubType

89 if ct == "TEXT/HTML" {

90 err := f.tokenizeHTML(p.ReaderUTF8OrBinary(), metaWords, htmlWords)

91 // log.Printf("html parsed, words %v", htmlWords)

92 return err

93 }

94 if ct == "" || strings.HasPrefix(ct, "TEXT/") {

95 err := f.tokenizeText(p.ReaderUTF8OrBinary(), textWords)

96 // log.Printf("text parsed, words %v", textWords)

97 return err

98 }

99 if p.Message != nil {

100 // Nested message, happens for forwarding.

101 if err := p.SetMessageReaderAt(); err != nil {

102 return fmt.Errorf("setting reader on nested message: %w", err)

103 }

104 return f.mailParse(*p.Message, metaWords, textWords, htmlWords)

105 }

106 for _, sp := range p.Parts {

107 if err := f.mailParse(sp, metaWords, textWords, htmlWords); err != nil {

108 return err

109 }

110 }

111 return nil

112}

113

// looksRandom reports whether s looks like a randomly generated string, e.g.
// 2fvu9stm9yxhnlu: ASCII only, with at least one run of six or more
// consecutive consonant-like characters.
//
// Any non-ASCII rune in s makes the result false, also when a long run was
// already seen before it.
func looksRandom(s string) bool {
	// Digits 2-9 count as consonants; 0 and 1 may be used as o and l/i.
	const consonants = "bcdfghjklmnpqrstvwxyzBCDFGHJKLMNPQRSTVWXYZ23456789"
	const minRun = 6

	run := 0         // Length of the current run of consonants.
	longRun := false // Whether a finished run reached minRun.
	for _, r := range s {
		switch {
		case r >= 0x80:
			return false
		case strings.ContainsRune(consonants, r):
			run++
		default:
			longRun = longRun || run >= minRun
			run = 0
		}
	}
	// The string may end in the middle of a run.
	return longRun || run >= minRun
}

137

// looksNumeric reports whether s looks like a number rather than a word.
//
// After removing an optional "0x" prefix, s is considered numeric when it
// contains a run of at least four consecutive decimal digits, or when it
// consists only of hexadecimal characters and contains a run of at least three
// consecutive decimal digits.
func looksNumeric(s string) bool {
	s = strings.TrimPrefix(s, "0x") // Hexadecimal.

	var other int      // Characters that are neither decimal nor hexadecimal digits.
	var stretch int    // Length of the current run of decimal digits.
	var maxStretch int // Longest run of decimal digits seen.
	for _, c := range s {
		switch {
		case c >= '0' && c <= '9':
			stretch++
			if stretch > maxStretch {
				maxStretch = stretch
			}
		case c >= 'a' && c <= 'f' || c >= 'A' && c <= 'F':
			// Hexadecimal letter: ends the run of digits, but is not counted as
			// "other".
			stretch = 0
		default:
			other++
			// Any non-digit ends the run. Without this reset the run would be the
			// total number of digits in s.
			stretch = 0
		}
	}
	return maxStretch >= 4 || other == 0 && maxStretch >= 3
}

160

161func (f *Filter) tokenizeText(r io.Reader, words map[string]struct{}) error {

162 b := &strings.Builder{}

163 var prev string

164 var prev2 string

165

166 add := func() {

167 defer b.Reset()

168 if b.Len() <= 2 {

169 return

170 }

171

172 s := b.String()

173 s = strings.Trim(s, "'")

174 var nondigit bool

175 for _, c := range s {

176 if !unicode.IsDigit(c) {

177 nondigit = true

178 break

179 }

180 }

181

182 if !(nondigit && len(s) > 2) {

183 return

184 }

185

186 if looksRandom(s) {

187 return

188 }

189 if looksNumeric(s) {

190 return

191 }

192

193 // todo: do something for URLs, parse them? keep their domain only?

194

195 if f.Threegrams && prev2 != "" && prev != "" {

196 words[prev2+" "+prev+" "+s] = struct{}{}

197 }

198 if f.Twograms && prev != "" {

199 words[prev+" "+s] = struct{}{}

200 }

201 if f.Onegrams {

202 words[s] = struct{}{}

203 }

204 prev2 = prev

205 prev = s

206 }

207

208 br := bufio.NewReader(r)

209

210 peekLetter := func() bool {

211 c, _, err := br.ReadRune()

212 if err == nil {

213 err = br.UnreadRune()

214 }

215 return err == nil && unicode.IsLetter(c)

216 }

217

218 for {

219 c, _, err := br.ReadRune()

220 if err == io.EOF {

221 break

222 }

223 if err != nil {

224 return err

225 }

226 if !unicode.IsLetter(c) && !unicode.IsDigit(c) && (c != '\'' || b.Len() > 0 && peekLetter()) {

227 add()

228 } else {

229 b.WriteRune(unicode.ToLower(c))

230 }

231 }

232 add()

233 return nil

234}

235

236// tokenizeHTML parses html, and tokenizes its text into words.

237func (f *Filter) tokenizeHTML(r io.Reader, meta, words map[string]struct{}) error {

238 htmlReader := &htmlTextReader{

239 t: html.NewTokenizer(r),

240 meta: map[string]struct{}{},

241 }

242 return f.tokenizeText(htmlReader, words)

243}

244

// htmlTextReader wraps an HTML tokenizer and is passed to tokenizeText as the
// reader to tokenize the words of an HTML part from.
type htmlTextReader struct {
	t        *html.Tokenizer     // Tokenizer over the HTML input.
	meta     map[string]struct{} // Set to an empty map by tokenizeHTML. NOTE(review): presumably for words from meta information; confirm in Read.
	tagStack []string            // NOTE(review): presumably the names of the currently open elements; confirm in Read.
	buf      []byte              // Leftover bytes that did not fit in the buffer of an earlier Read call.
	err      error               // NOTE(review): presumably an error kept for a later Read call; confirm in Read.
}

252

253func (r *htmlTextReader) Read(buf []byte) (n int, err error) {

254 // todo: deal with invalid html better. the tokenizer is just tokenizing, we need to fix up the nesting etc. eg, rules say some elements close certain open elements.

255 // todo: deal with inline elements? they shouldn't cause a word break.

256

257 give := func(nbuf []byte) (int, error) {

258 n := min(len(buf), len(nbuf))

259 copy(buf, nbuf[:n])

260 nbuf = nbuf[n:]

261 if len(nbuf) < cap(r.buf) {

262 r.buf = r.buf[:len(nbuf)]

263 } else {

264 r.buf = make([]byte, len(nbuf), 3*len(nbuf)/2)

265 }

266 copy(r.buf, nbuf)