1package smtpserver
2
3import (
4 "errors"
5 "fmt"
6 "log/slog"
7 "time"
8
9 "github.com/mjl-/bstore"
10
11 "github.com/mjl-/mox/mlog"
12 "github.com/mjl-/mox/smtp"
13 "github.com/mjl-/mox/store"
14)
15
// reputationMethod names the check in reputation that produced its result. It
// is returned by reputation alongside the junk decision.
type reputationMethod string
17
// Methods returned by reputation, in the order in which the checks are tried.
const (
	// Historic messages with the same full "message from" address (localpart and
	// domain) were found.
	methodMsgfromFull reputationMethod = "msgfromfull"

	// A message was previously sent to the full "message from" address.
	methodMsgtoFull reputationMethod = "msgtofull"

	// Historic messages with the same "message from" domain were found.
	methodMsgfromDomain reputationMethod = "msgfromdomain"

	// Historic messages with the same "message from" organizational domain were
	// found.
	methodMsgfromOrgDomain reputationMethod = "msgfromorgdomain"

	// A message was previously sent to an address at the "message from" domain.
	methodMsgtoDomain reputationMethod = "msgtodomain"

	// A message was previously sent to an address at the "message from"
	// organizational domain.
	methodMsgtoOrgDomain reputationMethod = "msgtoorgdomain"

	// Historic messages with a matching DKIM domain, or a matching validated SPF
	// mailfrom address, mailfrom domain or EHLO domain were found.
	methodDKIMSPF reputationMethod = "dkimspf"

	// Historic messages with the same RemoteIPMasked1 were found.
	methodIP1 reputationMethod = "ip1"

	// Historic messages with the same RemoteIPMasked2 were found.
	methodIP2 reputationMethod = "ip2"

	// Historic messages with the same RemoteIPMasked3 were found.
	methodIP3 reputationMethod = "ip3"

	// None of the checks matched any historic messages or recipients.
	methodNone reputationMethod = "none"
)
31
32// Reputation returns whether message m is likely junk.
33//
34// This function is called after checking for a manually configured spf mailfrom
35// allow (e.g. for mailing lists), and after checking for a dmarc reject policy.
36//
37// The decision is made based on historic messages delivered to the same
38// destination mailbox, MailboxOrigID. Because each mailbox may have a different
39// accept policy. We only use messages that have been marked as either junk or
40// non-junk. We help users by automatically marking them as non-junk when moving to
41// certain folders in the default config (e.g. the archive folder). We expect users
42// to mark junk messages as such when they read it. And to keep it in their inbox,
43// regular trash or archive if it is not.
44//
45// The basic idea is to keep accepting messages that were accepted in the past, and
46// keep rejecting those that were rejected. This is relatively easy to check if
47// mail passes SPF and/or DKIM with Message-From alignment. Regular email from
48// known people will be let in. But spammers are trickier. They will use new IPs,
49// (sub)domains, no or newly created SPF and/or DKIM identifiers, new localparts,
50// etc. This function likely ends up returning "inconclusive" for such emails. The
51// junkfilter will have to take care of a final decision.
52//
53// In case of doubt, it doesn't hurt much to accept another mail that a user has
54// communicated successfully with in the past. If the most recent message is marked
55// as junk that could have happened accidentally. If another message is let in, and
56// it is again junk, future messages will be rejected.
57//
58// Actual spammers will probably try to use identifiers, i.e. (sub)domain, dkim/spf
59// identifiers and ip addresses for which we have no history. We may only have
60// ip-based reputation, perhaps only an ip range, perhaps nothing.
61//
62// Some profiles of first-time senders:
63//
64// - Individuals. They can typically get past the junkfilter if needed.
65// - Transactional emails. They should get past the junkfilter. If they use one of
66// the larger email service providers, their reputation could help. If the
67// junkfilter rejects the message, users can recover the message from the Rejects
68// mailbox. The first message is typically initiated by a user, e.g. by registering.
69// - Desired commercial email will have to get past the junkfilter based on its
70// content. There will typically be earlier communication with the (organizational)
71// domain that would let the message through.
72// - Mailing list. May get past the junkfilter. If delivery is to a separate
73// mailbox, the junkfilter will let it in because of little history. Long enough to
74// build reputation based on DKIM/SPF signals. Users are best off to
75// configure accept rules for messages from mailing lists.
76//
77// The decision-making process looks at historic messages. The following properties
78// are checked until matching messages are found. If they are found, a decision is
79// returned, which may be inconclusive. The next property on the list is only
80// checked if a step did not match any messages.
81//
82// - Messages matching full "message from" address, either with strict/relaxed
83// dkim/spf-verification, or without.
84// - Messages the user sent to the "message from" address.
85// - Messages matching only the domain of the "message from" address (different
86// localpart), again with verification or without.
87// - Messages sent to an address in the domain of the "message from" address.
88// - The previous two checks again, but now checking against the organizational
89// domain instead of the exact domain.
90// - Matching DKIM domains and a matching SPF mailfrom, or mailfrom domain, or ehlo
91// domain.
92// - "Exact" IP, or nearby IPs.
93//
94// References:
95// ../rfc/5863
96// ../rfc/7960
97// ../rfc/6376:1915
98// ../rfc/6376:3716
99// ../rfc/7208:2167
100func reputation(tx *bstore.Tx, log mlog.Log, m *store.Message) (rjunk *bool, rconclusive bool, rmethod reputationMethod, rerr error) {
101 boolptr := func(v bool) *bool {
102 return &v
103 }
104 xfalse := boolptr(false)
105 xtrue := boolptr(true)
106
107 type queryError string
108
109 defer func() {
110 x := recover()
111 if x == nil {
112 return
113 }
114 if xerr, ok := x.(queryError); ok {
115 rerr = errors.New(string(xerr))
116 return
117 }
118 panic(x)
119 }()
120
121 now := time.Now()
122
123 // messageQuery returns a base query for historic seen messages to the same
124 // mailbox, at most maxAge old, and at most maxCount messages.
125 messageQuery := func(fm *store.Message, maxAge time.Duration, maxCount int) *bstore.Query[store.Message] {
126 q := bstore.QueryTx[store.Message](tx)
127 q.FilterEqual("MailboxOrigID", m.MailboxID)
128 q.FilterEqual("Expunged", false)
129 q.FilterFn(func(m store.Message) bool {
130 return m.Junk || m.Notjunk
131 })
132 if fm != nil {
133 q.FilterNonzero(*fm)
134 }
135 q.FilterGreaterEqual("Received", now.Add(-maxAge))
136 q.Limit(maxCount)
137 q.SortDesc("Received")
138 return q
139 }
140
141 // Execute the query, returning messages or returning error through panic.
142 xmessageList := func(q *bstore.Query[store.Message], descr string) []store.Message {
143 t0 := time.Now()
144 l, err := q.List()
145 log.Debugx("querying messages for reputation", err,
146 slog.Int("msgs", len(l)),
147 slog.String("descr", descr),
148 slog.Duration("queryduration", time.Since(t0)))
149 if err != nil {
150 panic(queryError(fmt.Sprintf("listing messages: %v", err)))
151 }
152 return l
153 }
154
155 xrecipientExists := func(q *bstore.Query[store.Recipient]) bool {
156 exists, err := q.Exists()
157 if err != nil {
158 panic(queryError(fmt.Sprintf("checking for recipient: %v", err)))
159 }
160 return exists
161 }
162
163 const year = 365 * 24 * time.Hour
164
165 // Look for historic messages with same "message from" address. We'll
166 // treat any validation (strict/dmarc/relaxed) the same, but "none"
167 // separately.
168 //
169 // We only need 1 message, and sometimes look at a second message. If
170 // the last message or the message before was an accept, we accept. If
171 // the single last or last two were a reject, we reject.
172 //
173 // If there was no validation, any signal is inconclusive.
174 if m.MsgFromDomain != "" {
175 q := messageQuery(&store.Message{MsgFromLocalpart: m.MsgFromLocalpart, MsgFromDomain: m.MsgFromDomain}, 3*year, 2)
176 q.FilterEqual("MsgFromValidated", m.MsgFromValidated)
177 msgs := xmessageList(q, "mgsfromfull")
178 if len(msgs) > 0 {
179 // todo: we may want to look at dkim/spf in this case.
180 spam := msgs[0].Junk && (len(msgs) == 1 || msgs[1].Junk)
181 conclusive := m.MsgFromValidated
182 return &spam, conclusive, methodMsgfromFull, nil
183 }
184 if !m.MsgFromValidated {
185 // Look for historic messages that were validated. If present, this is likely spam.
186 // Only return as conclusively spam if history also says this From-address sent
187 // spam.
188 q := messageQuery(&store.Message{MsgFromLocalpart: m.MsgFromLocalpart, MsgFromDomain: m.MsgFromDomain, MsgFromValidated: true}, 3*year, 2)
189 msgs = xmessageList(q, "msgfromfull-validated")
190 if len(msgs) > 0 {
191 spam := msgs[0].Junk && (len(msgs) == 1 || msgs[1].Junk)
192 return xtrue, spam, methodMsgfromFull, nil
193 }
194 }
195
196 // Look if we ever sent to this address. If so, we accept,
197 qr := bstore.QueryTx[store.Recipient](tx)
198 qr.FilterEqual("Localpart", m.MsgFromLocalpart)
199 qr.FilterEqual("Domain", m.MsgFromDomain)
200 qr.FilterGreaterEqual("Sent", now.Add(-3*year))
201 if xrecipientExists(qr) {
202 return xfalse, true, methodMsgtoFull, nil
203 }
204
205 // Look for domain match, then for organizational domain match.
206 for _, orgdomain := range []bool{false, true} {
207 qm := store.Message{}
208 var method reputationMethod
209 var descr string
210 if orgdomain {
211 qm.MsgFromOrgDomain = m.MsgFromOrgDomain
212 method = methodMsgfromOrgDomain
213 descr = "msgfromorgdomain"
214 } else {
215 qm.MsgFromDomain = m.MsgFromDomain
216 method = methodMsgfromDomain
217 descr = "msgfromdomain"
218 }
219
220 q := messageQuery(&qm, 2*year, 20)
221 q.FilterEqual("MsgFromValidated", m.MsgFromValidated)
222 msgs := xmessageList(q, descr)
223 if len(msgs) > 0 {
224 nonjunk := 0
225 for _, m := range msgs {
226 if !m.Junk {
227 nonjunk++
228 }
229 }
230 if 100*nonjunk/len(msgs) > 80 {
231 return xfalse, true, method, nil
232 }
233 if nonjunk == 0 {
234 // Only conclusive with at least 3 different localparts.
235 localparts := map[smtp.Localpart]struct{}{}
236 for _, m := range msgs {
237 localparts[m.MsgFromLocalpart] = struct{}{}
238 if len(localparts) == 3 {
239 return xtrue, true, method, nil
240 }
241 }
242 return xtrue, false, method, nil
243 }
244 // Mixed signals from domain. We don't want to block a new sender.
245 return nil, false, method, nil
246 }
247 if !m.MsgFromValidated {
248 // Look for historic messages that were validated. If present, this is likely spam.
249 // Only return as conclusively spam if history also says this From-address sent
250 // spam.
251 q := messageQuery(&qm, 2*year, 2)
252 q.FilterEqual("MsgFromValidated", true)
253 msgs = xmessageList(q, descr+"-validated")
254 if len(msgs) > 0 {
255 spam := msgs[0].Junk && (len(msgs) == 1 || msgs[1].Junk)
256 return xtrue, spam, method, nil
257 }
258 }
259
260 // Look if we ever sent to this address. If so, we accept,
261 qr := bstore.QueryTx[store.Recipient](tx)
262 if orgdomain {
263 qr.FilterEqual("OrgDomain", m.MsgFromOrgDomain)
264 method = methodMsgtoOrgDomain
265 } else {
266 qr.FilterEqual("Domain", m.MsgFromDomain)
267 method = methodMsgtoDomain
268 }
269 qr.FilterGreaterEqual("Sent", now.Add(-2*year))
270 if xrecipientExists(qr) {
271 return xfalse, true, method, nil
272 }
273 }
274 }
275
276 // DKIM and SPF.
277 // We only use identities that passed validation. Failed identities are ignored. ../rfc/6376:2447
278 // todo future: we could do something with the DKIM identity (i=) field if it is more specific than just the domain (d=).
279 dkimspfsignals := []float64{}
280 dkimspfmsgs := 0
281 for _, dom := range m.DKIMDomains {
282 q := messageQuery(nil, year/2, 50)
283 q.FilterIn("DKIMDomains", dom)
284 msgs := xmessageList(q, "dkimdomain")
285 if len(msgs) > 0 {
286 nspam := 0
287 for _, m := range msgs {
288 if m.Junk {
289 nspam++
290 }
291 }
292 pspam := float64(nspam) / float64(len(msgs))
293 dkimspfsignals = append(dkimspfsignals, pspam)
294 dkimspfmsgs = len(msgs)
295 }
296 }
297 if m.MailFromValidated || m.EHLOValidated {
298 var msgs []store.Message
299 if m.MailFromValidated && m.MailFromDomain != "" {
300 q := messageQuery(&store.Message{MailFromLocalpart: m.MailFromLocalpart, MailFromDomain: m.MailFromDomain}, year/2, 50)
301 msgs = xmessageList(q, "mailfrom")
302 if len(msgs) == 0 {
303 q := messageQuery(&store.Message{MailFromDomain: m.MailFromDomain}, year/2, 50)
304 msgs = xmessageList(q, "mailfromdomain")
305 }
306 }
307 if len(msgs) == 0 && m.EHLOValidated && m.EHLODomain != "" {
308 q := messageQuery(&store.Message{EHLODomain: m.EHLODomain}, year/2, 50)
309 msgs = xmessageList(q, "ehlodomain")
310 }
311 if len(msgs) > 0 {
312 nspam := 0
313 for _, m := range msgs {
314 if m.Junk {
315 nspam++
316 }
317 }
318 pspam := float64(nspam) / float64(len(msgs))
319 dkimspfsignals = append(dkimspfsignals, pspam)
320 if len(msgs) > dkimspfmsgs {
321 dkimspfmsgs = len(msgs)
322 }
323 }
324 }
325 if len(dkimspfsignals) > 0 {
326 var nham, nspam int
327 for _, p := range dkimspfsignals {
328 if p < .1 {
329 nham++
330 } else if p > .9 {
331 nspam++
332 }
333 }
334 if nham > 0 && nspam == 0 {
335 return xfalse, true, methodDKIMSPF, nil
336 }
337 if nspam > 0 && nham == 0 {
338 return xtrue, dkimspfmsgs > 1, methodDKIMSPF, nil
339 }
340 return nil, false, methodDKIMSPF, nil
341 }
342
343 // IP-based. A wider mask needs more messages to be conclusive.
344 // We require the resulting signal to be strong, i.e. likely ham or likely spam.
345 var msgs []store.Message
346 var need int
347 var method reputationMethod
348 if m.RemoteIPMasked1 != "" {
349 q := messageQuery(&store.Message{RemoteIPMasked1: m.RemoteIPMasked1}, year/4, 50)
350 msgs = xmessageList(q, "ip1")
351 need = 2
352 method = methodIP1
353 }
354 if len(msgs) == 0 && m.RemoteIPMasked2 != "" {
355 q := messageQuery(&store.Message{RemoteIPMasked2: m.RemoteIPMasked2}, year/4, 50)
356 msgs = xmessageList(q, "ip2")
357 need = 5
358 method = methodIP2
359 }
360 if len(msgs) == 0 && m.RemoteIPMasked3 != "" {
361 q := messageQuery(&store.Message{RemoteIPMasked3: m.RemoteIPMasked3}, year/4, 50)
362 msgs = xmessageList(q, "ip3")
363 need = 10
364 method = methodIP3
365 }
366 if len(msgs) > 0 {
367 nspam := 0
368 for _, m := range msgs {
369 if m.Junk {
370 nspam++
371 }
372 }
373 pspam := float64(nspam) / float64(len(msgs))
374 var spam *bool
375 if pspam < .25 {
376 spam = xfalse
377 } else if pspam > .75 {
378 spam = xtrue
379 }
380 conclusive := len(msgs) >= need && (pspam <= 0.1 || pspam >= 0.9)
381 return spam, conclusive, method, nil
382 }
383
384 return nil, false, methodNone, nil
385}
386