1package message
2
3import (
4 "bufio"
5 "errors"
6 "fmt"
7 "io"
8 "regexp"
9 "slices"
10 "strings"
11
12 "golang.org/x/net/html"
13 "golang.org/x/net/html/atom"
14
15 "github.com/mjl-/mox/mlog"
16 "github.com/mjl-/mox/moxio"
17)
18
19// Preview returns a message preview, based on the first text/plain or text/html
20// part of the message that has textual content. Preview returns at most 256
21// characters (possibly more bytes). Callers may want to truncate and trim trailing
22// whitespace before using the preview.
23//
24// Preview logs at debug level for invalid messages. An error is only returned for
25// serious errors, like i/o errors.
26func (p Part) Preview(log mlog.Log) (string, error) {
27 // ../rfc/8970:190
28
29 // Don't use if Content-Disposition attachment.
30 disp, _, err := p.DispositionFilename()
31 if err != nil {
32 log.Debugx("parsing disposition/filename", err)
33 } else if strings.EqualFold(disp, "attachment") {
34 return "", nil
35 }
36
37 mt := p.MediaType + "/" + p.MediaSubType
38 switch mt {
39 case "TEXT/PLAIN", "/":
40 r := &moxio.LimitReader{R: p.ReaderUTF8OrBinary(), Limit: 1024 * 1024}
41 s, err := previewText(r)
42 if err != nil {
43 if errors.Is(err, moxio.ErrLimit) {
44 log.Debug("no preview in first mb of text message")
45 return "", nil
46 }
47 return "", fmt.Errorf("making preview from text part: %v", err)
48 }
49 return s, nil
50
51 case "TEXT/HTML":
52 r := &moxio.LimitReader{R: p.ReaderUTF8OrBinary(), Limit: 1024 * 1024}
53
54 // First turn the HTML into text.
55 s, err := previewHTML(r)
56 if err != nil {
57 log.Debugx("parsing html part for preview (ignored)", err)
58 return "", nil
59 }
60
61 // Turn text body into a preview text.
62 s, err = previewText(strings.NewReader(s))
63 if err != nil {
64 if errors.Is(err, moxio.ErrLimit) {
65 log.Debug("no preview in first mb of html message")
66 return "", nil
67 }
68 return "", fmt.Errorf("making preview from text from html: %v", err)
69 }
70 return s, nil
71
72 case "MULTIPART/ENCRYPTED":
73 return "", nil
74 }
75
76 for i, sp := range p.Parts {
77 if mt == "MULTIPART/SIGNED" && i >= 1 {
78 break
79 }
80 s, err := sp.Preview(log)
81 if err != nil || s != "" {
82 return s, err
83 }
84 }
85 return "", nil
86}
87
88// previewText returns a line the client can display next to the subject line
89// in a mailbox. It will replace quoted text, and any prefixing "On ... wrote:"
90// line with "[...]" so only new and useful information will be displayed.
91// Trailing signatures are not included.
92func previewText(r io.Reader) (string, error) {
93 // We look quite a bit of lines ahead for trailing signatures with trailing empty lines.
94 var lines []string
95 scanner := bufio.NewScanner(r)
96 ensureLines := func() {
97 for len(lines) < 10 && scanner.Scan() {
98 lines = append(lines, strings.TrimSpace(scanner.Text()))
99 }
100 }
101 ensureLines()
102
103 isSnipped := func(s string) bool {
104 return s == "[...]" || s == "[…]" || s == "..."
105 }
106
107 nextLineQuoted := func(i int) bool {
108 if i+1 < len(lines) && lines[i+1] == "" {
109 i++
110 }
111 return i+1 < len(lines) && (strings.HasPrefix(lines[i+1], ">") || isSnipped(lines[i+1]))
112 }
113
114 // Remainder is signature if we see a line with only and minimum 2 dashes, and
115 // there are no more empty lines, and there aren't more than 5 lines left.
116 isSignature := func() bool {
117 if len(lines) == 0 || !strings.HasPrefix(lines[0], "--") || strings.Trim(strings.TrimSpace(lines[0]), "-") != "" {
118 return false
119 }
120 l := lines[1:]
121 for len(l) > 0 && l[len(l)-1] == "" {
122 l = l[:len(l)-1]
123 }
124 if len(l) >= 5 {
125 return false
126 }
127 return !slices.Contains(l, "")
128 }
129
130 result := ""
131
132 resultSnipped := func() bool {
133 return strings.HasSuffix(result, "[...]\n") || strings.HasSuffix(result, "[…]")
134 }
135
136 // Quick check for initial wrapped "On ... wrote:" line.
137 if len(lines) > 3 && strings.HasPrefix(lines[0], "On ") && !strings.HasSuffix(lines[0], "wrote:") && strings.HasSuffix(lines[1], ":") && nextLineQuoted(1) {
138 result = "[...]\n"
139 lines = lines[3:]
140 ensureLines()
141 }
142
143 for ; len(lines) > 0 && !isSignature(); ensureLines() {
144 line := lines[0]
145 if strings.HasPrefix(line, ">") {
146 if !resultSnipped() {
147 result += "[...]\n"
148 }
149 lines = lines[1:]
150 continue
151 }
152 if line == "" {
153 lines = lines[1:]
154 continue
155 }
156 // Check for a "On <date>, <person> wrote:", we require digits before a quoted
157 // line, with an optional empty line in between. If we don't have any text yet, we
158 // don't require the digits.
159 if strings.HasSuffix(line, ":") && (strings.ContainsAny(line, "0123456789") || result == "") && nextLineQuoted(0) {
160 if !resultSnipped() {
161 result += "[...]\n"
162 }
163 lines = lines[1:]
164 continue
165 }
166 // Skip possibly duplicate snipping by author.
167 if !isSnipped(line) || !resultSnipped() {
168 result += line + "\n"
169 }
170 lines = lines[1:]
171 if len(result) > 250 {
172 break
173 }
174 }
175
176 // Limit number of characters (not bytes). ../rfc/8970:200
177 // To 256 characters. ../rfc/8970:211
178 var o, n int
179 for o = range result {
180 n++
181 if n > 256 {
182 result = result[:o]
183 break
184 }
185 }
186
187 return result, scanner.Err()
188}
189
// ignoreAtoms is the set of html elements of which all text is ignored for the
// preview, including text in elements nested inside them (recursively).
var ignoreAtoms = atomMap(
	atom.Dialog,
	atom.Head,
	atom.Map,
	atom.Math,
	atom.Script,
	atom.Style,
	atom.Svg,
	atom.Template,
)
201
// inlineAtoms is the set of html elements treated as inline: the end of such an
// element does not cause a newline to be added to the text being gathered, and
// whitespace-only text directly inside them is not dropped outright.
// https://developer.mozilla.org/en-US/docs/Web/HTML/Element#inline_text_semantics
var inlineAtoms = atomMap(
	atom.A,
	atom.Abbr,
	atom.B,
	atom.Bdi,
	atom.Bdo,
	atom.Cite,
	atom.Code,
	atom.Data,
	atom.Dfn,
	atom.Em,
	atom.I,
	atom.Kbd,
	atom.Mark,
	atom.Q,
	atom.Rp,
	atom.Rt,
	atom.Ruby,
	atom.S,
	atom.Samp,
	atom.Small,
	atom.Span,
	atom.Strong,
	atom.Sub,
	atom.Sup,
	atom.Time,
	atom.U,
	atom.Var,
	atom.Wbr,

	// Edits to the document, these don't start a new line either.
	atom.Del,
	atom.Ins,

	// We treat these specially, inserting a space after them instead of a newline.
	atom.Td,
	atom.Th,
)
241
242func atomMap(l ...atom.Atom) map[atom.Atom]bool {
243 m := map[atom.Atom]bool{}
244 for _, a := range l {
245 m[a] = true
246 }
247 return m
248}
249
// Patterns for cleaning up text taken from html.
var (
	// Run of spaces and/or tabs. Replaced with single space.
	regexpSpace = regexp.MustCompile(`[ \t]+`)

	// Three or more consecutive newlines. Replaced with single newline.
	regexpNewline = regexp.MustCompile(`\n\n\n+`)

	// Two or more consecutive non-breaking spaces and/or zero-width characters.
	// Removed, combinations don't make sense, generated.
	regexpZeroWidth = regexp.MustCompile("[\u00a0\u200b\u200c\u200d][\u00a0\u200b\u200c\u200d]+")
)
253
254func previewHTML(r io.Reader) (string, error) {
255 // Stack/state, based on elements.
256 var ignores []bool
257 var inlines []bool
258
259 var text string // Collecting text.
260 var err error // Set when walking DOM.
261 var quoteLevel int
262
263 // We'll walk the DOM nodes, keeping track of whether we are ignoring text, and
264 // whether we are in an inline or block element, and building up the text. We stop
265 // when we have enough data, returning false in that case.
266 var walk func(n *html.Node) bool
267 walk = func(n *html.Node) bool {
268 switch n.Type {
269 case html.ErrorNode:
270 err = fmt.Errorf("unexpected error node")
271 return false
272
273 case html.ElementNode:
274 ignores = append(ignores, ignoreAtoms[n.DataAtom])
275 inline := inlineAtoms[n.DataAtom]
276 inlines = append(inlines, inline)
277 if n.DataAtom == atom.Blockquote {
278 quoteLevel++
279 }
280 defer func() {
281 if n.DataAtom == atom.Blockquote {
282 quoteLevel--
283 }
284 if !inline && !strings.HasSuffix(text, "\n\n") {
285 text += "\n"
286 } else if (n.DataAtom == atom.Td || n.DataAtom == atom.Th) && !strings.HasSuffix(text, " ") {
287 text += " "
288 }
289
290 ignores = ignores[:len(ignores)-1]
291 inlines = inlines[:len(inlines)-1]
292 }()
293
294 case html.TextNode:
295 if slices.Contains(ignores, true) {
296 return true
297 }
298 // Collapse all kinds of weird whitespace-like characters into a space, except for newline and ignoring carriage return.
299 var s string
300 for _, c := range n.Data {
301 if c == '\r' {
302 continue
303 } else if c == '\t' {
304 s += " "
305 } else {
306 s += string(c)
307 }
308 }
309 s = regexpSpace.ReplaceAllString(s, " ")
310 s = regexpNewline.ReplaceAllString(s, "\n")
311 s = regexpZeroWidth.ReplaceAllString(s, "")
312
313 inline := len(inlines) > 0 && inlines[len(inlines)-1]
314 ts := strings.TrimSpace(s)
315 if !inline && ts == "" {
316 break
317 }
318 if ts != "" || !strings.HasSuffix(s, " ") && !strings.HasSuffix(s, "\n") {
319 if quoteLevel > 0 {
320 q := strings.Repeat("> ", quoteLevel)
321 var sb strings.Builder
322 for s != "" {
323 o := strings.IndexByte(s, '\n')
324 if o < 0 {
325 o = len(s)
326 } else {
327 o++
328 }
329 sb.WriteString(q)
330 sb.WriteString(s[:o])
331 s = s[o:]
332 }
333 s = sb.String()
334 }
335 text += s
336 }
337 // We need to generate at most 256 characters of preview. The text we're gathering
338 // will be cleaned up, with quoting removed, so we'll end up with less. Hopefully,
339 // 4k bytes is enough to read.
340 if len(text) >= 4*1024 {
341 return false
342 }
343 }
344 // Ignored: DocumentNode, CommentNode, DoctypeNode, RawNode
345
346 for cn := range n.ChildNodes() {
347 if !walk(cn) {
348 break
349 }
350 }
351
352 return true
353 }
354
355 node, err := html.Parse(r)
356 if err != nil {
357 return "", fmt.Errorf("parsing html: %v", err)
358 }
359
360 // Build text.
361 walk(node)
362
363 text = strings.TrimSpace(text)
364 text = regexpSpace.ReplaceAllString(text, " ")
365 return text, err
366}
367