1package smtpserver
2
3import (
4 "errors"
5 "fmt"
6 "log/slog"
7 "strings"
8 "time"
9
10 "github.com/mjl-/bstore"
11
12 "github.com/mjl-/mox/dns"
13 "github.com/mjl-/mox/mlog"
14 "github.com/mjl-/mox/smtp"
15 "github.com/mjl-/mox/store"
16)
17
// reputationMethod names the kind of historic signal on which a reputation
// decision was based.
type reputationMethod string

const (
	// Exact "message from" address: as sender of earlier messages, or as
	// recipient of messages we sent.
	methodMsgfromFull reputationMethod = "msgfromfull"
	methodMsgtoFull   reputationMethod = "msgtofull"

	// Domain, or organizational domain, of the "message from" address: as sender
	// of earlier messages.
	methodMsgfromDomain    reputationMethod = "msgfromdomain"
	methodMsgfromOrgDomain reputationMethod = "msgfromorgdomain"

	// Domain, or organizational domain, of the "message from" address: as
	// recipient of messages we sent.
	methodMsgtoDomain    reputationMethod = "msgtodomain"
	methodMsgtoOrgDomain reputationMethod = "msgtoorgdomain"

	// Validated DKIM domains and SPF mailfrom/EHLO identities.
	methodDKIMSPF reputationMethod = "dkimspf"

	// Remote IP, with increasingly wider masks.
	methodIP1 reputationMethod = "ip1"
	methodIP2 reputationMethod = "ip2"
	methodIP3 reputationMethod = "ip3"

	// No reputation signal was found.
	methodNone reputationMethod = "none"
)
33
34// Reputation returns whether message m is likely junk.
35//
36// This function is called after checking for a manually configured spf mailfrom
37// allow (e.g. for mailing lists), and after checking for a dmarc reject policy.
38//
39// The decision is made based on historic messages delivered to the same
40// destination mailbox, MailboxOrigID. Because each mailbox may have a different
41// accept policy. We only use messages that have been marked as either junk or
42// non-junk. We help users by automatically marking them as non-junk when moving to
43// certain folders in the default config (e.g. the archive folder). We expect users
44// to mark junk messages as such when they read it. And to keep it in their inbox,
45// regular trash or archive if it is not.
46//
47// The basic idea is to keep accepting messages that were accepted in the past, and
48// keep rejecting those that were rejected. This is relatively easy to check if
49// mail passes SPF and/or DKIM with Message-From alignment. Regular email from
50// known people will be let in. But spammers are trickier. They will use new IPs,
51// (sub)domains, no or newly created SPF and/or DKIM identifiers, new localparts,
52// etc. This function likely ends up returning "inconclusive" for such emails. The
53// junkfilter will have to take care of a final decision.
54//
55// In case of doubt, it doesn't hurt much to accept another mail that a user has
56// communicated successfully with in the past. If the most recent message is marked
57// as junk that could have happened accidentally. If another message is let in, and
58// it is again junk, future messages will be rejected.
59//
60// Actual spammers will probably try to use identifiers, i.e. (sub)domain, dkim/spf
61// identifiers and ip addresses for which we have no history. We may only have
62// ip-based reputation, perhaps only an ip range, perhaps nothing.
63//
64// Some profiles of first-time senders:
65//
66// - Individuals. They can typically get past the junkfilter if needed.
67// - Transactional emails. They should get past the junkfilter. If they use one of
68// the larger email service providers, their reputation could help. If the
69// junkfilter rejects the message, users can recover the message from the Rejects
70// mailbox. The first message is typically initiated by a user, e.g. by registering.
71// - Desired commercial email will have to get past the junkfilter based on its
72// content. There will typically be earlier communication with the (organizational)
73// domain that would let the message through.
74// - Mailing list. May get past the junkfilter. If delivery is to a separate
75// mailbox, the junkfilter will let it in because of little history. Long enough to
76// build reputation based on DKIM/SPF signals. Users are best off to
77// configure accept rules for messages from mailing lists.
78//
79// The decision-making process looks at historic messages. The following properties
80// are checked until matching messages are found. If they are found, a decision is
81// returned, which may be inconclusive. The next property on the list is only
82// checked if a step did not match any messages.
83//
84// - Messages matching full "message from" address, either with strict/relaxed
85// dkim/spf-verification, or without.
86// - Messages the user sent to the "message from" address.
87// - Messages matching only the domain of the "message from" address (different
88// localpart), again with verification or without.
89// - Messages sent to an address in the domain of the "message from" address.
90// - The previous two checks again, but now checking against the organizational
91// domain instead of the exact domain.
92// - Matching DKIM domains and a matching SPF mailfrom, or mailfrom domain, or ehlo
93// domain.
94// - "Exact" IP, or nearby IPs.
95//
96// References:
97// ../rfc/5863
98// ../rfc/7960
99// ../rfc/6376:1915
100// ../rfc/6376:3716
101// ../rfc/7208:2167
102func reputation(tx *bstore.Tx, log mlog.Log, m *store.Message, smtputf8 bool) (rjunk *bool, rconclusive bool, rmethod reputationMethod, reasonText string, rerr error) {
103 boolptr := func(v bool) *bool {
104 return &v
105 }
106 xfalse := boolptr(false)
107 xtrue := boolptr(true)
108
109 type queryError string
110
111 defer func() {
112 x := recover()
113 if x == nil {
114 return
115 }
116 if xerr, ok := x.(queryError); ok {
117 rerr = errors.New(string(xerr))
118 return
119 }
120 panic(x)
121 }()
122
123 now := time.Now()
124
125 // messageQuery returns a base query for historic seen messages to the same
126 // mailbox, at most maxAge old, and at most maxCount messages.
127 messageQuery := func(fm *store.Message, maxAge time.Duration, maxCount int) *bstore.Query[store.Message] {
128 q := bstore.QueryTx[store.Message](tx)
129 q.FilterEqual("MailboxOrigID", m.MailboxID)
130 q.FilterEqual("Expunged", false)
131 q.FilterFn(func(m store.Message) bool {
132 return m.Junk || m.Notjunk
133 })
134 if fm != nil {
135 q.FilterNonzero(*fm)
136 }
137 q.FilterGreaterEqual("Received", now.Add(-maxAge))
138 q.Limit(maxCount)
139 q.SortDesc("Received")
140 return q
141 }
142
143 // Execute the query, returning messages or returning error through panic.
144 xmessageList := func(q *bstore.Query[store.Message], descr string) []store.Message {
145 t0 := time.Now()
146 l, err := q.List()
147 log.Debugx("querying messages for reputation", err,
148 slog.Int("msgs", len(l)),
149 slog.String("descr", descr),
150 slog.Duration("queryduration", time.Since(t0)))
151 if err != nil {
152 panic(queryError(fmt.Sprintf("listing messages: %v", err)))
153 }
154 return l
155 }
156
157 xrecipientExists := func(q *bstore.Query[store.Recipient]) bool {
158 exists, err := q.Exists()
159 if err != nil {
160 panic(queryError(fmt.Sprintf("checking for recipient: %v", err)))
161 }
162 return exists
163 }
164
165 const year = 365 * 24 * time.Hour
166
167 // Look for historic messages with same "message from" address. We'll
168 // treat any validation (strict/dmarc/relaxed) the same, but "none"
169 // separately.
170 //
171 // We only need 1 message, and sometimes look at a second message. If
172 // the last message or the message before was an accept, we accept. If
173 // the single last or last two were a reject, we reject.
174 //
175 // If there was no validation, any signal is inconclusive.
176 if m.MsgFromDomain != "" {
177 q := messageQuery(&store.Message{MsgFromLocalpart: m.MsgFromLocalpart, MsgFromDomain: m.MsgFromDomain}, 3*year, 2)
178 q.FilterEqual("MsgFromValidated", m.MsgFromValidated)
179 msgs := xmessageList(q, "mgsfromfull")
180 if len(msgs) > 0 {
181 // todo: we may want to look at dkim/spf in this case.
182 spam := msgs[0].Junk && (len(msgs) == 1 || msgs[1].Junk)
183 conclusive := m.MsgFromValidated
184 return &spam, conclusive, methodMsgfromFull, "reputation of exact message-from address", nil
185 }
186 if !m.MsgFromValidated {
187 // Look for historic messages that were validated. If present, this is likely spam.
188 // Only return as conclusively spam if history also says this From-address sent
189 // spam.
190 q := messageQuery(&store.Message{MsgFromLocalpart: m.MsgFromLocalpart, MsgFromDomain: m.MsgFromDomain, MsgFromValidated: true}, 3*year, 2)
191 msgs = xmessageList(q, "msgfromfull-validated")
192 if len(msgs) > 0 {
193 spam := msgs[0].Junk && (len(msgs) == 1 || msgs[1].Junk)
194 return xtrue, spam, methodMsgfromFull, "unvalidated message with validated historic messages with exact message-from address", nil
195 }
196 }
197
198 // Look if we ever sent to this address. If so, we accept,
199 qr := bstore.QueryTx[store.Recipient](tx)
200 qr.FilterEqual("Localpart", m.MsgFromLocalpart)
201 qr.FilterEqual("Domain", m.MsgFromDomain)
202 qr.FilterGreaterEqual("Sent", now.Add(-3*year))
203 if xrecipientExists(qr) {
204 return xfalse, true, methodMsgtoFull, "exact message-from address was earlier message recipient", nil
205 }
206
207 // Look for domain match, then for organizational domain match.
208 for _, orgdomain := range []bool{false, true} {
209 qm := store.Message{}
210 var method reputationMethod
211 var source, descr string
212 if orgdomain {
213 qm.MsgFromOrgDomain = m.MsgFromOrgDomain
214 method = methodMsgfromOrgDomain
215 source = "organizational domain of message-from address"
216 descr = "msgfromorgdomain"
217 } else {
218 qm.MsgFromDomain = m.MsgFromDomain
219 method = methodMsgfromDomain
220 source = "exact domain of message-from address"
221 descr = "msgfromdomain"
222 }
223
224 q := messageQuery(&qm, 2*year, 20)
225 q.FilterEqual("MsgFromValidated", m.MsgFromValidated)
226 msgs := xmessageList(q, descr)
227 if len(msgs) > 0 {
228 nonjunk := 0
229 for _, m := range msgs {
230 if !m.Junk {
231 nonjunk++
232 }
233 }
234 if 100*nonjunk/len(msgs) > 80 {
235 reasonText = fmt.Sprintf("positive reputation with %s based on %d messages", source, len(msgs))
236 return xfalse, true, method, reasonText, nil
237 }
238 if nonjunk == 0 {
239 // Only conclusive with at least 3 different localparts.
240 localparts := map[smtp.Localpart]struct{}{}
241 for _, m := range msgs {
242 localparts[m.MsgFromLocalpart] = struct{}{}
243 if len(localparts) == 3 {
244 reasonText = fmt.Sprintf("negative reputation of at least 3 addresses with %s based on %d messages", source, len(msgs))
245 return xtrue, true, method, reasonText, nil
246 }
247 }
248 reasonText = fmt.Sprintf("negative reputation with %s based on %d messages", source, len(msgs))
249 return xtrue, false, method, reasonText, nil
250 }
251 // Mixed signals from domain. We don't want to block a new sender.
252 reasonText = fmt.Sprintf("mixed signals with %s based on %d messages", source, len(msgs))
253 return nil, false, method, reasonText, nil
254 }
255 if !m.MsgFromValidated {
256 // Look for historic messages that were validated. If present, this is likely spam.
257 // Only return as conclusively spam if history also says this From-address sent
258 // spam.
259 q := messageQuery(&qm, 2*year, 2)
260 q.FilterEqual("MsgFromValidated", true)
261 msgs = xmessageList(q, descr+"-validated")
262 if len(msgs) > 0 {
263 spam := msgs[0].Junk && (len(msgs) == 1 || msgs[1].Junk)
264 reasonText = fmt.Sprintf("unvalidated message with %s while we have validated messages from that source", source)
265 return xtrue, spam, method, reasonText, nil
266 }
267 }
268
269 // Look if we ever sent to this address. If so, we accept,
270 qr := bstore.QueryTx[store.Recipient](tx)
271 if orgdomain {
272 qr.FilterEqual("OrgDomain", m.MsgFromOrgDomain)
273 method = methodMsgtoOrgDomain
274 source = "organizational domain of message-from address"
275 } else {
276 qr.FilterEqual("Domain", m.MsgFromDomain)
277 method = methodMsgtoDomain
278 source = "exact domain of message-from address"
279 }
280 qr.FilterGreaterEqual("Sent", now.Add(-2*year))
281 if xrecipientExists(qr) {
282 reasonText = fmt.Sprintf("%s was recipient address", source)
283 return xfalse, true, method, reasonText, nil
284 }
285 }
286 }
287
288 // DKIM and SPF.
289 // We only use identities that passed validation. Failed identities are ignored. ../rfc/6376:2447
290 // todo future: we could do something with the DKIM identity (i=) field if it is more specific than just the domain (d=).
291 dkimspfsignals := []float64{}
292 dkimspfreasondoms := []string{}
293 dkimspfmsgs := 0
294 for _, dom := range m.DKIMDomains {
295 q := messageQuery(nil, year/2, 50)
296 q.FilterIn("DKIMDomains", dom)
297 msgs := xmessageList(q, "dkimdomain")
298 if len(msgs) > 0 {
299 nspam := 0
300 for _, m := range msgs {
301 if m.Junk {
302 nspam++
303 }
304 }
305 pspam := float64(nspam) / float64(len(msgs))
306 dkimspfsignals = append(dkimspfsignals, pspam)
307 dkimspfreasondoms = append(dkimspfreasondoms, dom)
308 dkimspfmsgs = len(msgs)
309 }
310 }
311 if m.MailFromValidated || m.EHLOValidated {
312 var dom string
313 var msgs []store.Message
314 if m.MailFromValidated && m.MailFromDomain != "" {
315 dom = m.MailFromDomain
316 q := messageQuery(&store.Message{MailFromLocalpart: m.MailFromLocalpart, MailFromDomain: m.MailFromDomain}, year/2, 50)
317 msgs = xmessageList(q, "mailfrom")
318 if len(msgs) == 0 {
319 q := messageQuery(&store.Message{MailFromDomain: m.MailFromDomain}, year/2, 50)
320 msgs = xmessageList(q, "mailfromdomain")
321 }
322 }
323 if len(msgs) == 0 && m.EHLOValidated && m.EHLODomain != "" {
324 dom = m.EHLODomain
325 q := messageQuery(&store.Message{EHLODomain: m.EHLODomain}, year/2, 50)
326 msgs = xmessageList(q, "ehlodomain")
327 }
328 if len(msgs) > 0 {
329 nspam := 0
330 for _, m := range msgs {
331 if m.Junk {
332 nspam++
333 }
334 }
335 pspam := float64(nspam) / float64(len(msgs))
336 dkimspfsignals = append(dkimspfsignals, pspam)
337 dkimspfreasondoms = append(dkimspfreasondoms, dom)
338 if len(msgs) > dkimspfmsgs {
339 dkimspfmsgs = len(msgs)
340 }
341 }
342 }
343 if len(dkimspfsignals) > 0 {
344 var nham, nspam int
345 var hamdoms, spamdoms []string
346 for i, p := range dkimspfsignals {
347 d, _ := dns.ParseDomain(dkimspfreasondoms[i])
348 if p < .1 {
349 nham++
350 hamdoms = append(hamdoms, d.XName(smtputf8))
351 } else if p > .9 {
352 nspam++
353 spamdoms = append(spamdoms, d.XName(smtputf8))
354 }
355 }
356 if nham > 0 && nspam == 0 {
357 reasonText = fmt.Sprintf("positive dkim/spf reputation for domain(s) %s", strings.Join(hamdoms, ","))
358 return xfalse, true, methodDKIMSPF, reasonText, nil
359 }
360 if nspam > 0 && nham == 0 {
361 reasonText = fmt.Sprintf("negative dkim/spf reputation for domain(s) %s", strings.Join(hamdoms, ","))
362 return xtrue, dkimspfmsgs > 1, methodDKIMSPF, reasonText, nil
363 }
364 reasonText = fmt.Sprintf("mixed dkim/spf reputation, positive for %s, negative for %s", strings.Join(hamdoms, ","), strings.Join(spamdoms, ","))
365 return nil, false, methodDKIMSPF, reasonText, nil
366 }
367
368 // IP-based. A wider mask needs more messages to be conclusive.
369 // We require the resulting signal to be strong, i.e. likely ham or likely spam.
370 var msgs []store.Message
371 var need int
372 var method reputationMethod
373 var ip string
374 if m.RemoteIPMasked1 != "" {
375 q := messageQuery(&store.Message{RemoteIPMasked1: m.RemoteIPMasked1}, year/4, 50)
376 msgs = xmessageList(q, "ip1")
377 need = 2
378 method = methodIP1
379 ip = m.RemoteIPMasked1
380 }
381 if len(msgs) == 0 && m.RemoteIPMasked2 != "" {
382 q := messageQuery(&store.Message{RemoteIPMasked2: m.RemoteIPMasked2}, year/4, 50)
383 msgs = xmessageList(q, "ip2")
384 need = 5
385 method = methodIP2
386 ip = m.RemoteIPMasked2
387 }
388 if len(msgs) == 0 && m.RemoteIPMasked3 != "" {
389 q := messageQuery(&store.Message{RemoteIPMasked3: m.RemoteIPMasked3}, year/4, 50)
390 msgs = xmessageList(q, "ip3")
391 need = 10
392 method = methodIP3
393 ip = m.RemoteIPMasked3
394 }
395 if len(msgs) > 0 {
396 nspam := 0
397 for _, m := range msgs {
398 if m.Junk {
399 nspam++
400 }
401 }
402 pspam := float64(nspam) / float64(len(msgs))
403 var spam *bool
404 if pspam < .25 {
405 spam = xfalse
406 } else if pspam > .75 {
407 spam = xtrue
408 }
409 conclusive := len(msgs) >= need && (pspam <= 0.1 || pspam >= 0.9)
410 v6 := strings.Contains(m.RemoteIP, ":")
411 reasonText = fmt.Sprintf("reputation for ip %s%s, spam score %.2f", ip, maskclasses[classmask{v6, method}], pspam)
412 return spam, conclusive, method, reasonText, nil
413 }
414
415 return nil, false, methodNone, "no address/spf/dkim/ip reputation", nil
416}
417
418type classmask struct {
419 v6 bool
420 method reputationMethod
421}
422
423var maskclasses = map[classmask]string{
424 {false, methodIP1}: "/32",
425 {false, methodIP2}: "/26",
426 {false, methodIP3}: "/21",
427 {true, methodIP1}: "/64",
428 {true, methodIP2}: "/48",
429 {true, methodIP3}: "/32",
430}
431