1package smtpserver
2
3import (
4 "errors"
5 "fmt"
6 "time"
7
8 "golang.org/x/exp/slog"
9
10 "github.com/mjl-/bstore"
11
12 "github.com/mjl-/mox/mlog"
13 "github.com/mjl-/mox/smtp"
14 "github.com/mjl-/mox/store"
15)
16
// reputationMethod names the kind of historic evidence on which reputation
// based its verdict. It is returned as the rmethod result of reputation.
type reputationMethod string

const (
	// History of messages with the same full "message from" address.
	methodMsgfromFull reputationMethod = "msgfromfull"
	// A recipient record exists for the full "message from" address.
	methodMsgtoFull reputationMethod = "msgtofull"
	// History of messages with the same "message from" domain.
	methodMsgfromDomain reputationMethod = "msgfromdomain"
	// History of messages with the same "message from" organizational domain.
	methodMsgfromOrgDomain reputationMethod = "msgfromorgdomain"
	// A recipient record exists in the "message from" domain.
	methodMsgtoDomain reputationMethod = "msgtodomain"
	// A recipient record exists in the "message from" organizational domain.
	methodMsgtoOrgDomain reputationMethod = "msgtoorgdomain"
	// History for the message's DKIM domains and/or validated SPF identities
	// (mailfrom address, mailfrom domain or EHLO domain).
	methodDKIMSPF reputationMethod = "dkimspf"
	// History for RemoteIPMasked1.
	methodIP1 reputationMethod = "ip1"
	// History for RemoteIPMasked2.
	methodIP2 reputationMethod = "ip2"
	// History for RemoteIPMasked3.
	methodIP3 reputationMethod = "ip3"
	// No matching history was found by any of the checks.
	methodNone reputationMethod = "none"
)
32
33// Reputation returns whether message m is likely junk.
34//
35// This function is called after checking for a manually configured spf mailfrom
36// allow (e.g. for mailing lists), and after checking for a dmarc reject policy.
37//
38// The decision is made based on historic messages delivered to the same
39// destination mailbox, MailboxOrigID. Because each mailbox may have a different
40// accept policy. We only use messages that have been marked as either junk or
41// non-junk. We help users by automatically marking them as non-junk when moving to
42// certain folders in the default config (e.g. the archive folder). We expect users
43// to mark junk messages as such when they read it. And to keep it in their inbox,
44// regular trash or archive if it is not.
45//
46// The basic idea is to keep accepting messages that were accepted in the past, and
47// keep rejecting those that were rejected. This is relatively easy to check if
48// mail passes SPF and/or DKIM with Message-From alignment. Regular email from
49// known people will be let in. But spammers are trickier. They will use new IPs,
50// (sub)domains, no or newly created SPF and/or DKIM identifiers, new localparts,
51// etc. This function likely ends up returning "inconclusive" for such emails. The
52// junkfilter will have to take care of a final decision.
53//
54// In case of doubt, it doesn't hurt much to accept another mail that a user has
55// communicated successfully with in the past. If the most recent message is marked
56// as junk that could have happened accidentally. If another message is let in, and
57// it is again junk, future messages will be rejected.
58//
59// Actual spammers will probably try to use identifiers, i.e. (sub)domain, dkim/spf
60// identifiers and ip addresses for which we have no history. We may only have
61// ip-based reputation, perhaps only an ip range, perhaps nothing.
62//
63// Some profiles of first-time senders:
64//
65// - Individuals. They can typically get past the junkfilter if needed.
66// - Transactional emails. They should get past the junkfilter. If they use one of
67// the larger email service providers, their reputation could help. If the
68// junkfilter rejects the message, users can recover the message from the Rejects
69// mailbox. The first message is typically initiated by a user, e.g. by registering.
70// - Desired commercial email will have to get past the junkfilter based on its
71// content. There will typically be earlier communication with the (organizational)
72// domain that would let the message through.
73// - Mailing list. May get past the junkfilter. If delivery is to a separate
74// mailbox, the junkfilter will let it in because of little history. Long enough to
75// build reputation based on DKIM/SPF signals. Users are best off to
76// configure accept rules for messages from mailing lists.
77//
78// The decision-making process looks at historic messages. The following properties
79// are checked until matching messages are found. If they are found, a decision is
80// returned, which may be inconclusive. The next property on the list is only
81// checked if a step did not match any messages.
82//
83// - Messages matching full "message from" address, either with strict/relaxed
84// dkim/spf-verification, or without.
85// - Messages the user sent to the "message from" address.
86// - Messages matching only the domain of the "message from" address (different
87// localpart), again with verification or without.
88// - Messages sent to an address in the domain of the "message from" address.
89// - The previous two checks again, but now checking against the organizational
90// domain instead of the exact domain.
91// - Matching DKIM domains and a matching SPF mailfrom, or mailfrom domain, or ehlo
92// domain.
93// - "Exact" IP, or nearby IPs.
94//
95// References:
96// ../rfc/5863
97// ../rfc/7960
98// ../rfc/6376:1915
99// ../rfc/6376:3716
100// ../rfc/7208:2167
101func reputation(tx *bstore.Tx, log mlog.Log, m *store.Message) (rjunk *bool, rconclusive bool, rmethod reputationMethod, rerr error) {
102 boolptr := func(v bool) *bool {
103 return &v
104 }
105 xfalse := boolptr(false)
106 xtrue := boolptr(true)
107
108 type queryError string
109
110 defer func() {
111 x := recover()
112 if x == nil {
113 return
114 }
115 if xerr, ok := x.(queryError); ok {
116 rerr = errors.New(string(xerr))
117 return
118 }
119 panic(x)
120 }()
121
122 now := time.Now()
123
124 // messageQuery returns a base query for historic seen messages to the same
125 // mailbox, at most maxAge old, and at most maxCount messages.
126 messageQuery := func(fm *store.Message, maxAge time.Duration, maxCount int) *bstore.Query[store.Message] {
127 q := bstore.QueryTx[store.Message](tx)
128 q.FilterEqual("MailboxOrigID", m.MailboxID)
129 q.FilterEqual("Expunged", false)
130 q.FilterFn(func(m store.Message) bool {
131 return m.Junk || m.Notjunk
132 })
133 if fm != nil {
134 q.FilterNonzero(*fm)
135 }
136 q.FilterGreaterEqual("Received", now.Add(-maxAge))
137 q.Limit(maxCount)
138 q.SortDesc("Received")
139 return q
140 }
141
142 // Execute the query, returning messages or returning error through panic.
143 xmessageList := func(q *bstore.Query[store.Message], descr string) []store.Message {
144 t0 := time.Now()
145 l, err := q.List()
146 log.Debugx("querying messages for reputation", err,
147 slog.Int("msgs", len(l)),
148 slog.String("descr", descr),
149 slog.Duration("queryduration", time.Since(t0)))
150 if err != nil {
151 panic(queryError(fmt.Sprintf("listing messages: %v", err)))
152 }
153 return l
154 }
155
156 xrecipientExists := func(q *bstore.Query[store.Recipient]) bool {
157 exists, err := q.Exists()
158 if err != nil {
159 panic(queryError(fmt.Sprintf("checking for recipient: %v", err)))
160 }
161 return exists
162 }
163
164 const year = 365 * 24 * time.Hour
165
166 // Look for historic messages with same "message from" address. We'll
167 // treat any validation (strict/dmarc/relaxed) the same, but "none"
168 // separately.
169 //
170 // We only need 1 message, and sometimes look at a second message. If
171 // the last message or the message before was an accept, we accept. If
172 // the single last or last two were a reject, we reject.
173 //
174 // If there was no validation, any signal is inconclusive.
175 if m.MsgFromDomain != "" {
176 q := messageQuery(&store.Message{MsgFromLocalpart: m.MsgFromLocalpart, MsgFromDomain: m.MsgFromDomain}, 3*year, 2)
177 q.FilterEqual("MsgFromValidated", m.MsgFromValidated)
178 msgs := xmessageList(q, "mgsfromfull")
179 if len(msgs) > 0 {
180 // todo: we may want to look at dkim/spf in this case.
181 spam := msgs[0].Junk && (len(msgs) == 1 || msgs[1].Junk)
182 conclusive := m.MsgFromValidated
183 return &spam, conclusive, methodMsgfromFull, nil
184 }
185 if !m.MsgFromValidated {
186 // Look for historic messages that were validated. If present, this is likely spam.
187 // Only return as conclusively spam if history also says this From-address sent
188 // spam.
189 q := messageQuery(&store.Message{MsgFromLocalpart: m.MsgFromLocalpart, MsgFromDomain: m.MsgFromDomain, MsgFromValidated: true}, 3*year, 2)
190 msgs = xmessageList(q, "msgfromfull-validated")
191 if len(msgs) > 0 {
192 spam := msgs[0].Junk && (len(msgs) == 1 || msgs[1].Junk)
193 return xtrue, spam, methodMsgfromFull, nil
194 }
195 }
196
197 // Look if we ever sent to this address. If so, we accept,
198 qr := bstore.QueryTx[store.Recipient](tx)
199 qr.FilterEqual("Localpart", m.MsgFromLocalpart)
200 qr.FilterEqual("Domain", m.MsgFromDomain)
201 qr.FilterGreaterEqual("Sent", now.Add(-3*year))
202 if xrecipientExists(qr) {
203 return xfalse, true, methodMsgtoFull, nil
204 }
205
206 // Look for domain match, then for organizational domain match.
207 for _, orgdomain := range []bool{false, true} {
208 qm := store.Message{}
209 var method reputationMethod
210 var descr string
211 if orgdomain {
212 qm.MsgFromOrgDomain = m.MsgFromOrgDomain
213 method = methodMsgfromOrgDomain
214 descr = "msgfromorgdomain"
215 } else {
216 qm.MsgFromDomain = m.MsgFromDomain
217 method = methodMsgfromDomain
218 descr = "msgfromdomain"
219 }
220
221 q := messageQuery(&qm, 2*year, 20)
222 q.FilterEqual("MsgFromValidated", m.MsgFromValidated)
223 msgs := xmessageList(q, descr)
224 if len(msgs) > 0 {
225 nonjunk := 0
226 for _, m := range msgs {
227 if !m.Junk {
228 nonjunk++
229 }
230 }
231 if 100*nonjunk/len(msgs) > 80 {
232 return xfalse, true, method, nil
233 }
234 if nonjunk == 0 {
235 // Only conclusive with at least 3 different localparts.
236 localparts := map[smtp.Localpart]struct{}{}
237 for _, m := range msgs {
238 localparts[m.MsgFromLocalpart] = struct{}{}
239 if len(localparts) == 3 {
240 return xtrue, true, method, nil
241 }
242 }
243 return xtrue, false, method, nil
244 }
245 // Mixed signals from domain. We don't want to block a new sender.
246 return nil, false, method, nil
247 }
248 if !m.MsgFromValidated {
249 // Look for historic messages that were validated. If present, this is likely spam.
250 // Only return as conclusively spam if history also says this From-address sent
251 // spam.
252 q := messageQuery(&qm, 2*year, 2)
253 q.FilterEqual("MsgFromValidated", true)
254 msgs = xmessageList(q, descr+"-validated")
255 if len(msgs) > 0 {
256 spam := msgs[0].Junk && (len(msgs) == 1 || msgs[1].Junk)
257 return xtrue, spam, method, nil
258 }
259 }
260
261 // Look if we ever sent to this address. If so, we accept,
262 qr := bstore.QueryTx[store.Recipient](tx)
263 if orgdomain {
264 qr.FilterEqual("OrgDomain", m.MsgFromOrgDomain)
265 method = methodMsgtoOrgDomain
266 } else {
267 qr.FilterEqual("Domain", m.MsgFromDomain)
268 method = methodMsgtoDomain
269 }
270 qr.FilterGreaterEqual("Sent", now.Add(-2*year))
271 if xrecipientExists(qr) {
272 return xfalse, true, method, nil
273 }
274 }
275 }
276
277 // DKIM and SPF.
278 // We only use identities that passed validation. Failed identities are ignored. ../rfc/6376:2447
279 // todo future: we could do something with the DKIM identity (i=) field if it is more specific than just the domain (d=).
280 dkimspfsignals := []float64{}
281 dkimspfmsgs := 0
282 for _, dom := range m.DKIMDomains {
283 q := messageQuery(nil, year/2, 50)
284 q.FilterIn("DKIMDomains", dom)
285 msgs := xmessageList(q, "dkimdomain")
286 if len(msgs) > 0 {
287 nspam := 0
288 for _, m := range msgs {
289 if m.Junk {
290 nspam++
291 }
292 }
293 pspam := float64(nspam) / float64(len(msgs))
294 dkimspfsignals = append(dkimspfsignals, pspam)
295 dkimspfmsgs = len(msgs)
296 }
297 }
298 if m.MailFromValidated || m.EHLOValidated {
299 var msgs []store.Message
300 if m.MailFromValidated && m.MailFromDomain != "" {
301 q := messageQuery(&store.Message{MailFromLocalpart: m.MailFromLocalpart, MailFromDomain: m.MailFromDomain}, year/2, 50)
302 msgs = xmessageList(q, "mailfrom")
303 if len(msgs) == 0 {
304 q := messageQuery(&store.Message{MailFromDomain: m.MailFromDomain}, year/2, 50)
305 msgs = xmessageList(q, "mailfromdomain")
306 }
307 }
308 if len(msgs) == 0 && m.EHLOValidated && m.EHLODomain != "" {
309 q := messageQuery(&store.Message{EHLODomain: m.EHLODomain}, year/2, 50)
310 msgs = xmessageList(q, "ehlodomain")
311 }
312 if len(msgs) > 0 {
313 nspam := 0
314 for _, m := range msgs {
315 if m.Junk {
316 nspam++
317 }
318 }
319 pspam := float64(nspam) / float64(len(msgs))
320 dkimspfsignals = append(dkimspfsignals, pspam)
321 if len(msgs) > dkimspfmsgs {
322 dkimspfmsgs = len(msgs)
323 }
324 }
325 }
326 if len(dkimspfsignals) > 0 {
327 var nham, nspam int
328 for _, p := range dkimspfsignals {
329 if p < .1 {
330 nham++
331 } else if p > .9 {
332 nspam++
333 }
334 }
335 if nham > 0 && nspam == 0 {
336 return xfalse, true, methodDKIMSPF, nil
337 }
338 if nspam > 0 && nham == 0 {
339 return xtrue, dkimspfmsgs > 1, methodDKIMSPF, nil
340 }
341 return nil, false, methodDKIMSPF, nil
342 }
343
344 // IP-based. A wider mask needs more messages to be conclusive.
345 // We require the resulting signal to be strong, i.e. likely ham or likely spam.
346 var msgs []store.Message
347 var need int
348 var method reputationMethod
349 if m.RemoteIPMasked1 != "" {
350 q := messageQuery(&store.Message{RemoteIPMasked1: m.RemoteIPMasked1}, year/4, 50)
351 msgs = xmessageList(q, "ip1")
352 need = 2
353 method = methodIP1
354 }
355 if len(msgs) == 0 && m.RemoteIPMasked2 != "" {
356 q := messageQuery(&store.Message{RemoteIPMasked2: m.RemoteIPMasked2}, year/4, 50)
357 msgs = xmessageList(q, "ip2")
358 need = 5
359 method = methodIP2
360 }
361 if len(msgs) == 0 && m.RemoteIPMasked3 != "" {
362 q := messageQuery(&store.Message{RemoteIPMasked3: m.RemoteIPMasked3}, year/4, 50)
363 msgs = xmessageList(q, "ip3")
364 need = 10
365 method = methodIP3
366 }
367 if len(msgs) > 0 {
368 nspam := 0
369 for _, m := range msgs {
370 if m.Junk {
371 nspam++
372 }
373 }
374 pspam := float64(nspam) / float64(len(msgs))
375 var spam *bool
376 if pspam < .25 {
377 spam = xfalse
378 } else if pspam > .75 {
379 spam = xtrue
380 }
381 conclusive := len(msgs) >= need && (pspam <= 0.1 || pspam >= 0.9)
382 return spam, conclusive, method, nil
383 }
384
385 return nil, false, methodNone, nil
386}
387