10	"github.com/mjl-/bstore"
 
12	"github.com/mjl-/mox/dns"
 
13	"github.com/mjl-/mox/mlog"
 
14	"github.com/mjl-/mox/smtp"
 
15	"github.com/mjl-/mox/store"
 
18type reputationMethod string
 
21	methodMsgfromFull      reputationMethod = "msgfromfull"
 
22	methodMsgtoFull        reputationMethod = "msgtofull"
 
23	methodMsgfromDomain    reputationMethod = "msgfromdomain"
 
24	methodMsgfromOrgDomain reputationMethod = "msgfromorgdomain"
 
25	methodMsgtoDomain      reputationMethod = "msgtodomain"
 
26	methodMsgtoOrgDomain   reputationMethod = "msgtoorgdomain"
 
27	methodDKIMSPF          reputationMethod = "dkimspf"
 
28	methodIP1              reputationMethod = "ip1"
 
29	methodIP2              reputationMethod = "ip2"
 
30	methodIP3              reputationMethod = "ip3"
 
31	methodNone             reputationMethod = "none"
 
34// Reputation returns whether message m is likely junk.
 
36// This function is called after checking for a manually configured spf mailfrom
 
37// allow (e.g. for mailing lists), and after checking for a dmarc reject policy.
 
39// The decision is made based on historic messages delivered to the same
 
40// destination mailbox, MailboxOrigID, because each mailbox may have a different
 
41// accept policy. We only use messages that have been marked as either junk or
 
42// non-junk. We help users by automatically marking them as non-junk when moving to
 
43// certain folders in the default config (e.g. the archive folder). We expect users
 
44// to mark junk messages as such when they read them, and to keep them in their inbox,

45// regular trash or archive if they are not.
 
47// The basic idea is to keep accepting messages that were accepted in the past, and
 
48// keep rejecting those that were rejected. This is relatively easy to check if
 
49// mail passes SPF and/or DKIM with Message-From alignment. Regular email from
 
50// known people will be let in. But spammers are trickier. They will use new IPs,
 
51// (sub)domains, no or newly created SPF and/or DKIM identifiers, new localparts,
 
52// etc. This function likely ends up returning "inconclusive" for such emails. The
 
53// junkfilter will have to take care of a final decision.
 
55// In case of doubt, it doesn't hurt much to accept another mail from a sender a user

56// has communicated successfully with in the past. If the most recent message is marked
 
57// as junk that could have happened accidentally. If another message is let in, and
 
58// it is again junk, future messages will be rejected.
 
60// Actual spammers will probably try to use identifiers, i.e. (sub)domain, dkim/spf
 
61// identifiers and ip addresses for which we have no history. We may only have
 
62// ip-based reputation, perhaps only an ip range, perhaps nothing.
 
64// Some profiles of first-time senders:
 
66//   - Individuals. They can typically get past the junkfilter if needed.
 
67//   - Transactional emails. They should get past the junkfilter. If they use one of
 
68//     the larger email service providers, their reputation could help. If the
 
69//     junkfilter rejects the message, users can recover the message from the Rejects
 
70//     mailbox. The first message is typically initiated by a user, e.g. by registering.
 
71//   - Desired commercial email will have to get past the junkfilter based on its
 
72//     content. There will typically be earlier communication with the (organizational)
 
73//     domain that would let the message through.
 
74//   - Mailing list. May get past the junkfilter. If delivery is to a separate
 
75//     mailbox, the junkfilter will let it in because of little history, long enough to
 
76//     build reputation based on DKIM/SPF signals. Users are best off to
 
77//     configure accept rules for messages from mailing lists.
 
79// The decision-making process looks at historic messages. The following properties
 
80// are checked until matching messages are found. If they are found, a decision is
 
81// returned, which may be inconclusive. The next property on the list is only
 
82// checked if a step did not match any messages.
 
84//   - Messages matching full "message from" address, either with strict/relaxed
 
85//     dkim/spf-verification, or without.
 
86//   - Messages the user sent to the "message from" address.
 
87//   - Messages matching only the domain of the "message from" address (different
 
88//     localpart), again with verification or without.
 
89//   - Messages sent to an address in the domain of the "message from" address.
 
90//   - The previous two checks again, but now checking against the organizational
 
91//     domain instead of the exact domain.
 
92//   - Matching DKIM domains and a matching SPF mailfrom, or mailfrom domain, or ehlo
 
94//   - "Exact" IP, or nearby IPs.
 
102func reputation(tx *bstore.Tx, log mlog.Log, m *store.Message, smtputf8 bool) (rjunk *bool, rconclusive bool, rmethod reputationMethod, reasonText string, rerr error) {
 
103	boolptr := func(v bool) *bool {
 
106	xfalse := boolptr(false)
 
107	xtrue := boolptr(true)
 
109	type queryError string
 
116		if xerr, ok := x.(queryError); ok {
 
117			rerr = errors.New(string(xerr))
 
125	// messageQuery returns a base query for historic seen messages to the same
 
126	// mailbox, at most maxAge old, and at most maxCount messages.
 
127	messageQuery := func(fm *store.Message, maxAge time.Duration, maxCount int) *bstore.Query[store.Message] {
 
128		q := bstore.QueryTx[store.Message](tx)
 
129		q.FilterEqual("MailboxOrigID", m.MailboxID)
 
130		q.FilterEqual("Expunged", false)
 
131		q.FilterFn(func(m store.Message) bool {
 
132			return m.Junk || m.Notjunk
 
137		q.FilterGreaterEqual("Received", now.Add(-maxAge))
 
139		q.SortDesc("Received")
 
143	// Execute the query, returning messages or returning error through panic.
 
144	xmessageList := func(q *bstore.Query[store.Message], descr string) []store.Message {
 
147		log.Debugx("querying messages for reputation", err,
 
148			slog.Int("msgs", len(l)),
 
149			slog.String("descr", descr),
 
150			slog.Duration("queryduration", time.Since(t0)))
 
152			panic(queryError(fmt.Sprintf("listing messages: %v", err)))
 
157	xrecipientExists := func(q *bstore.Query[store.Recipient]) bool {
 
158		exists, err := q.Exists()
 
160			panic(queryError(fmt.Sprintf("checking for recipient: %v", err)))
 
165	const year = 365 * 24 * time.Hour
 
167	// Look for historic messages with same "message from" address. We'll
 
168	// treat any validation (strict/dmarc/relaxed) the same, but "none"
 
171	// We only need 1 message, and sometimes look at a second message. If
 
172	// the last message or the message before was an accept, we accept. If
 
173	// the single last or last two were a reject, we reject.
 
175	// If there was no validation, any signal is inconclusive.
 
176	if m.MsgFromDomain != "" {
 
177		q := messageQuery(&store.Message{MsgFromLocalpart: m.MsgFromLocalpart, MsgFromDomain: m.MsgFromDomain}, 3*year, 2)
 
178		q.FilterEqual("MsgFromValidated", m.MsgFromValidated)
 
179		msgs := xmessageList(q, "mgsfromfull")
 
181			// todo: we may want to look at dkim/spf in this case.
 
182			spam := msgs[0].Junk && (len(msgs) == 1 || msgs[1].Junk)
 
183			conclusive := m.MsgFromValidated
 
184			return &spam, conclusive, methodMsgfromFull, "reputation of exact message-from address", nil
 
186		if !m.MsgFromValidated {
 
187			// Look for historic messages that were validated. If present, this is likely spam.
 
188			// Only return as conclusively spam if history also says this From-address sent
 
190			q := messageQuery(&store.Message{MsgFromLocalpart: m.MsgFromLocalpart, MsgFromDomain: m.MsgFromDomain, MsgFromValidated: true}, 3*year, 2)
 
191			msgs = xmessageList(q, "msgfromfull-validated")
 
193				spam := msgs[0].Junk && (len(msgs) == 1 || msgs[1].Junk)
 
194				return xtrue, spam, methodMsgfromFull, "unvalidated message with validated historic messages with exact message-from address", nil
 
198		// Look if we ever sent to this address. If so, we accept.
 
199		qr := bstore.QueryTx[store.Recipient](tx)
 
200		qr.FilterEqual("Localpart", m.MsgFromLocalpart)
 
201		qr.FilterEqual("Domain", m.MsgFromDomain)
 
202		qr.FilterGreaterEqual("Sent", now.Add(-3*year))
 
203		if xrecipientExists(qr) {
 
204			return xfalse, true, methodMsgtoFull, "exact message-from address was earlier message recipient", nil
 
207		// Look for domain match, then for organizational domain match.
 
208		for _, orgdomain := range []bool{false, true} {
 
209			qm := store.Message{}
 
210			var method reputationMethod
 
211			var source, descr string
 
213				qm.MsgFromOrgDomain = m.MsgFromOrgDomain
 
214				method = methodMsgfromOrgDomain
 
215				source = "organizational domain of message-from address"
 
216				descr = "msgfromorgdomain"
 
218				qm.MsgFromDomain = m.MsgFromDomain
 
219				method = methodMsgfromDomain
 
220				source = "exact domain of message-from address"
 
221				descr = "msgfromdomain"
 
224			q := messageQuery(&qm, 2*year, 20)
 
225			q.FilterEqual("MsgFromValidated", m.MsgFromValidated)
 
226			msgs := xmessageList(q, descr)
 
229				for _, m := range msgs {
 
234				if 100*nonjunk/len(msgs) > 80 {
 
235					reasonText = fmt.Sprintf("positive reputation with %s based on %d messages", source, len(msgs))
 
236					return xfalse, true, method, reasonText, nil
 
239					// Only conclusive with at least 3 different localparts.
 
240					localparts := map[smtp.Localpart]struct{}{}
 
241					for _, m := range msgs {
 
242						localparts[m.MsgFromLocalpart] = struct{}{}
 
243						if len(localparts) == 3 {
 
244							reasonText = fmt.Sprintf("negative reputation of at least 3 addresses with %s based on %d messages", source, len(msgs))
 
245							return xtrue, true, method, reasonText, nil
 
248					reasonText = fmt.Sprintf("negative reputation with %s based on %d messages", source, len(msgs))
 
249					return xtrue, false, method, reasonText, nil
 
251				// Mixed signals from domain. We don't want to block a new sender.
 
252				reasonText = fmt.Sprintf("mixed signals with %s based on %d messages", source, len(msgs))
 
253				return nil, false, method, reasonText, nil
 
255			if !m.MsgFromValidated {
 
256				// Look for historic messages that were validated. If present, this is likely spam.
 
257				// Only return as conclusively spam if history also says this From-address sent
 
259				q := messageQuery(&qm, 2*year, 2)
 
260				q.FilterEqual("MsgFromValidated", true)
 
261				msgs = xmessageList(q, descr+"-validated")
 
263					spam := msgs[0].Junk && (len(msgs) == 1 || msgs[1].Junk)
 
264					reasonText = fmt.Sprintf("unvalidated message with %s while we have validated messages from that source", source)
 
265					return xtrue, spam, method, reasonText, nil
 
269			// Look if we ever sent to an address in this (organizational) domain. If so, we accept.
 
270			qr := bstore.QueryTx[store.Recipient](tx)
 
272				qr.FilterEqual("OrgDomain", m.MsgFromOrgDomain)
 
273				method = methodMsgtoOrgDomain
 
274				source = "organizational domain of message-from address"
 
276				qr.FilterEqual("Domain", m.MsgFromDomain)
 
277				method = methodMsgtoDomain
 
278				source = "exact domain of message-from address"
 
280			qr.FilterGreaterEqual("Sent", now.Add(-2*year))
 
281			if xrecipientExists(qr) {
 
282				reasonText = fmt.Sprintf("%s was recipient address", source)
 
283				return xfalse, true, method, reasonText, nil
 
289	// We only use identities that passed validation. Failed identities are ignored. ../rfc/6376:2447

290	// todo future: we could do something with the DKIM identity (i=) field if it is more specific than just the domain (d=).
 
291	dkimspfsignals := []float64{}
 
292	dkimspfreasondoms := []string{}
 
294	for _, dom := range m.DKIMDomains {
 
295		q := messageQuery(nil, year/2, 50)
 
296		q.FilterIn("DKIMDomains", dom)
 
297		msgs := xmessageList(q, "dkimdomain")
 
300			for _, m := range msgs {
 
305			pspam := float64(nspam) / float64(len(msgs))
 
306			dkimspfsignals = append(dkimspfsignals, pspam)
 
307			dkimspfreasondoms = append(dkimspfreasondoms, dom)
 
308			dkimspfmsgs = len(msgs)
 
311	if m.MailFromValidated || m.EHLOValidated {
 
313		var msgs []store.Message
 
314		if m.MailFromValidated && m.MailFromDomain != "" {
 
315			dom = m.MailFromDomain
 
316			q := messageQuery(&store.Message{MailFromLocalpart: m.MailFromLocalpart, MailFromDomain: m.MailFromDomain}, year/2, 50)
 
317			msgs = xmessageList(q, "mailfrom")
 
319				q := messageQuery(&store.Message{MailFromDomain: m.MailFromDomain}, year/2, 50)
 
320				msgs = xmessageList(q, "mailfromdomain")
 
323		if len(msgs) == 0 && m.EHLOValidated && m.EHLODomain != "" {
 
325			q := messageQuery(&store.Message{EHLODomain: m.EHLODomain}, year/2, 50)
 
326			msgs = xmessageList(q, "ehlodomain")
 
330			for _, m := range msgs {
 
335			pspam := float64(nspam) / float64(len(msgs))
 
336			dkimspfsignals = append(dkimspfsignals, pspam)
 
337			dkimspfreasondoms = append(dkimspfreasondoms, dom)
 
338			if len(msgs) > dkimspfmsgs {
 
339				dkimspfmsgs = len(msgs)
 
343	if len(dkimspfsignals) > 0 {
 
345		var hamdoms, spamdoms []string
 
346		for i, p := range dkimspfsignals {
 
347			d, _ := dns.ParseDomain(dkimspfreasondoms[i])
 
350				hamdoms = append(hamdoms, d.XName(smtputf8))
 
353				spamdoms = append(spamdoms, d.XName(smtputf8))
 
356		if nham > 0 && nspam == 0 {
 
357			reasonText = fmt.Sprintf("positive dkim/spf reputation for domain(s) %s", strings.Join(hamdoms, ","))
 
358			return xfalse, true, methodDKIMSPF, reasonText, nil
 
360		if nspam > 0 && nham == 0 { // Only negative signals: report the domains with negative reputation.

361			reasonText = fmt.Sprintf("negative dkim/spf reputation for domain(s) %s", strings.Join(spamdoms, ","))

362			return xtrue, dkimspfmsgs > 1, methodDKIMSPF, reasonText, nil // Conclusive only when based on more than one historic message.
 
364		reasonText = fmt.Sprintf("mixed dkim/spf reputation, positive for %s, negative for %s", strings.Join(hamdoms, ","), strings.Join(spamdoms, ","))
 
365		return nil, false, methodDKIMSPF, reasonText, nil
 
368	// IP-based. A wider mask needs more messages to be conclusive.
 
369	// We require the resulting signal to be strong, i.e. likely ham or likely spam.
 
370	var msgs []store.Message
 
372	var method reputationMethod
 
374	if m.RemoteIPMasked1 != "" {
 
375		q := messageQuery(&store.Message{RemoteIPMasked1: m.RemoteIPMasked1}, year/4, 50)
 
376		msgs = xmessageList(q, "ip1")
 
379		ip = m.RemoteIPMasked1
 
381	if len(msgs) == 0 && m.RemoteIPMasked2 != "" {
 
382		q := messageQuery(&store.Message{RemoteIPMasked2: m.RemoteIPMasked2}, year/4, 50)
 
383		msgs = xmessageList(q, "ip2")
 
386		ip = m.RemoteIPMasked2
 
388	if len(msgs) == 0 && m.RemoteIPMasked3 != "" {
 
389		q := messageQuery(&store.Message{RemoteIPMasked3: m.RemoteIPMasked3}, year/4, 50)
 
390		msgs = xmessageList(q, "ip3")
 
393		ip = m.RemoteIPMasked3
 
397		for _, m := range msgs {
 
402		pspam := float64(nspam) / float64(len(msgs))
 
406		} else if pspam > .75 {
 
409		conclusive := len(msgs) >= need && (pspam <= 0.1 || pspam >= 0.9)
 
410		v6 := strings.Contains(m.RemoteIP, ":")
 
411		reasonText = fmt.Sprintf("reputation for ip %s%s, spam score %.2f", ip, maskclasses[classmask{v6, method}], pspam)
 
412		return spam, conclusive, method, reasonText, nil
 
415	return nil, false, methodNone, "no address/spf/dkim/ip reputation", nil
 
418type classmask struct {
 
420	method reputationMethod
 
423var maskclasses = map[classmask]string{
 
424	{false, methodIP1}: "/32",
 
425	{false, methodIP2}: "/26",
 
426	{false, methodIP3}: "/21",
 
427	{true, methodIP1}:  "/64",
 
428	{true, methodIP2}:  "/48",
 
429	{true, methodIP3}:  "/32",