1package message
2
3import (
4 "bufio"
5 "fmt"
6 "io"
7 "regexp"
8 "slices"
9 "strings"
10
11 "golang.org/x/net/html"
12 "golang.org/x/net/html/atom"
13
14 "github.com/mjl-/mox/mlog"
15 "github.com/mjl-/mox/moxio"
16)
17
18// Preview returns a message preview, based on the first text/plain or text/html
19// part of the message that has textual content. Preview returns at most 256
20// characters (possibly more bytes). Callers may want to truncate and trim trailing
21// whitespace before using the preview.
22//
23// Preview logs at debug level for invalid messages. An error is only returned for
24// serious errors, like i/o errors.
25func (p Part) Preview(log mlog.Log) (string, error) {
26 // ../rfc/8970:190
27
28 // Don't use if Content-Disposition attachment.
29 disp, _, err := p.DispositionFilename()
30 if err != nil {
31 log.Debugx("parsing disposition/filename", err)
32 } else if strings.EqualFold(disp, "attachment") {
33 return "", nil
34 }
35
36 mt := p.MediaType + "/" + p.MediaSubType
37 switch mt {
38 case "TEXT/PLAIN", "/":
39 r := &moxio.LimitReader{R: p.ReaderUTF8OrBinary(), Limit: 100 * 1024}
40 s, err := previewText(r)
41 if err != nil {
42 return "", fmt.Errorf("making preview from text part: %v", err)
43 }
44 return s, nil
45
46 case "TEXT/HTML":
47 r := &moxio.LimitReader{R: p.ReaderUTF8OrBinary(), Limit: 1024 * 1024}
48
49 // First turn the HTML into text.
50 s, err := previewHTML(r)
51 if err != nil {
52 log.Debugx("parsing html part for preview (ignored)", err)
53 return "", nil
54 }
55
56 // Turn text body into a preview text.
57 s, err = previewText(strings.NewReader(s))
58 if err != nil {
59 return "", fmt.Errorf("making preview from text from html: %v", err)
60 }
61 return s, nil
62
63 case "MULTIPART/ENCRYPTED":
64 return "", nil
65 }
66
67 for i, sp := range p.Parts {
68 if mt == "MULTIPART/SIGNED" && i >= 1 {
69 break
70 }
71 s, err := sp.Preview(log)
72 if err != nil || s != "" {
73 return s, err
74 }
75 }
76 return "", nil
77}
78
// previewText returns a line the client can display next to the subject line
// in a mailbox. It will replace quoted text, and any prefixing "On ... wrote:"
// line with "[...]" so only new and useful information will be displayed.
// Trailing signatures are not included.
//
// Input is read from r line by line, and only as far as needed. Each line kept
// for the preview is whitespace-trimmed and terminated with a newline. The
// result is truncated to at most 256 characters (not bytes).
//
// The returned error is the error of the underlying bufio.Scanner, and is
// returned along with the preview gathered so far. Besides read errors from r,
// this includes bufio.ErrTooLong for a line that does not fit in the default
// scanner buffer (bufio.MaxScanTokenSize).
func previewText(r io.Reader) (string, error) {
	// We look quite a bit of lines ahead for trailing signatures with trailing empty lines.
	var lines []string
	scanner := bufio.NewScanner(r)

	// ensureLines tops up the lookahead window to 10 whitespace-trimmed lines, or
	// fewer when the input has ended.
	ensureLines := func() {
		for len(lines) < 10 && scanner.Scan() {
			lines = append(lines, strings.TrimSpace(scanner.Text()))
		}
	}
	ensureLines()

	// isSnipped reports whether s is a marker with which an author indicates that
	// text was left out.
	isSnipped := func(s string) bool {
		return s == "[...]" || s == "[…]" || s == "..."
	}

	// nextLineQuoted reports whether the line after lines[i], skipping at most one
	// empty line, is quoted text or a snip marker.
	nextLineQuoted := func(i int) bool {
		if i+1 < len(lines) && lines[i+1] == "" {
			i++
		}
		return i+1 < len(lines) && (strings.HasPrefix(lines[i+1], ">") || isSnipped(lines[i+1]))
	}

	// Remainder is signature if we see a line with only and minimum 2 dashes, and
	// fewer than 5 lines follow (not counting trailing empty lines), none of them
	// empty. Lines are already trimmed, so "-- " shows up here as "--".
	isSignature := func() bool {
		if len(lines) == 0 || !strings.HasPrefix(lines[0], "--") || strings.Trim(lines[0], "-") != "" {
			return false
		}
		l := lines[1:]
		for len(l) > 0 && l[len(l)-1] == "" {
			l = l[:len(l)-1]
		}
		if len(l) >= 5 {
			return false
		}
		return !slices.Contains(l, "")
	}

	result := ""

	// resultSnipped reports whether the preview so far ends with a snip marker,
	// either one we added or a bracketed one copied from the author. Every line in
	// result ends with a newline, so both suffixes must include it.
	resultSnipped := func() bool {
		return strings.HasSuffix(result, "[...]\n") || strings.HasSuffix(result, "[…]\n")
	}

	// Quick check for initial wrapped "On ... wrote:" line.
	if len(lines) > 3 && strings.HasPrefix(lines[0], "On ") && !strings.HasSuffix(lines[0], "wrote:") && strings.HasSuffix(lines[1], ":") && nextLineQuoted(1) {
		result = "[...]\n"
		lines = lines[3:]
		ensureLines()
	}

	for ; len(lines) > 0 && !isSignature(); ensureLines() {
		line := lines[0]
		if strings.HasPrefix(line, ">") {
			// Quoted text, replaced by a single marker for consecutive quoted lines.
			if !resultSnipped() {
				result += "[...]\n"
			}
			lines = lines[1:]
			continue
		}
		if line == "" {
			lines = lines[1:]
			continue
		}
		// Check for a "On <date>, <person> wrote:", we require digits before a quoted
		// line, with an optional empty line in between. If we don't have any text yet, we
		// don't require the digits.
		if strings.HasSuffix(line, ":") && (strings.ContainsAny(line, "0123456789") || result == "") && nextLineQuoted(0) {
			if !resultSnipped() {
				result += "[...]\n"
			}
			lines = lines[1:]
			continue
		}
		// Skip possibly duplicate snipping by author.
		if !isSnipped(line) || !resultSnipped() {
			result += line + "\n"
		}
		lines = lines[1:]
		// We have about enough for the preview, no need to read more.
		if len(result) > 250 {
			break
		}
	}

	// Limit number of characters (not bytes). ../rfc/8970:200
	// To 256 characters. ../rfc/8970:211
	n := 0
	for o := range result {
		n++
		if n > 256 {
			// o is the byte offset of character 257, cut right before it.
			result = result[:o]
			break
		}
	}

	return result, scanner.Err()
}
180
181// Any text inside these html elements (recursively) is ignored.
182var ignoreAtoms = atomMap(
183 atom.Dialog,
184 atom.Head,
185 atom.Map,
186 atom.Math,
187 atom.Script,
188 atom.Style,
189 atom.Svg,
190 atom.Template,
191)
192
193// Inline elements don't force newlines at beginning & end of text in this element.
194// https://developer.mozilla.org/en-US/docs/Web/HTML/Element#inline_text_semantics
195var inlineAtoms = atomMap(
196 atom.A,
197 atom.Abbr,
198 atom.B,
199 atom.Bdi,
200 atom.Bdo,
201 atom.Cite,
202 atom.Code,
203 atom.Data,
204 atom.Dfn,
205 atom.Em,
206 atom.I,
207 atom.Kbd,
208 atom.Mark,
209 atom.Q,
210 atom.Rp,
211 atom.Rt,
212 atom.Ruby,
213 atom.S,
214 atom.Samp,
215 atom.Small,
216 atom.Span,
217 atom.Strong,
218 atom.Sub,
219 atom.Sup,
220 atom.Time,
221 atom.U,
222 atom.Var,
223 atom.Wbr,
224
225 atom.Del,
226 atom.Ins,
227
228 // We treat these specially, inserting a space after them instead of a newline.
229 atom.Td,
230 atom.Th,
231)
232
233func atomMap(l ...atom.Atom) map[atom.Atom]bool {
234 m := map[atom.Atom]bool{}
235 for _, a := range l {
236 m[a] = true
237 }
238 return m
239}
240
var (
	// regexpSpace matches a run of one or more spaces and/or tabs. Replaced with a
	// single space.
	regexpSpace = regexp.MustCompile(`[ \t]+`)

	// regexpNewline matches three or more consecutive newlines. Replaced with a
	// single newline.
	regexpNewline = regexp.MustCompile(`\n\n\n+`)

	// regexpZeroWidth matches two or more consecutive characters out of no-break
	// space (U+00A0), zero width space (U+200B), zero width non-joiner (U+200C) and
	// zero width joiner (U+200D). Removed: such combinations don't make sense, they
	// are generated.
	regexpZeroWidth = regexp.MustCompile("[\u00a0\u200b\u200c\u200d][\u00a0\u200b\u200c\u200d]+")
)
244
245func previewHTML(r io.Reader) (string, error) {
246 // Stack/state, based on elements.
247 var ignores []bool
248 var inlines []bool
249
250 var text string // Collecting text.
251 var err error // Set when walking DOM.
252 var quoteLevel int
253
254 // We'll walk the DOM nodes, keeping track of whether we are ignoring text, and
255 // whether we are in an inline or block element, and building up the text. We stop
256 // when we have enough data, returning false in that case.
257 var walk func(n *html.Node) bool
258 walk = func(n *html.Node) bool {
259 switch n.Type {
260 case html.ErrorNode:
261 err = fmt.Errorf("unexpected error node")
262 return false
263
264 case html.ElementNode:
265 ignores = append(ignores, ignoreAtoms[n.DataAtom])
266 inline := inlineAtoms[n.DataAtom]
267 inlines = append(inlines, inline)
268 if n.DataAtom == atom.Blockquote {
269 quoteLevel++
270 }
271 defer func() {
272 if n.DataAtom == atom.Blockquote {
273 quoteLevel--
274 }
275 if !inline && !strings.HasSuffix(text, "\n\n") {
276 text += "\n"
277 } else if (n.DataAtom == atom.Td || n.DataAtom == atom.Th) && !strings.HasSuffix(text, " ") {
278 text += " "
279 }
280
281 ignores = ignores[:len(ignores)-1]
282 inlines = inlines[:len(inlines)-1]
283 }()
284
285 case html.TextNode:
286 if slices.Contains(ignores, true) {
287 return true
288 }
289 // Collapse all kinds of weird whitespace-like characters into a space, except for newline and ignoring carriage return.
290 var s string
291 for _, c := range n.Data {
292 if c == '\r' {
293 continue
294 } else if c == '\t' {
295 s += " "
296 } else {
297 s += string(c)
298 }
299 }
300 s = regexpSpace.ReplaceAllString(s, " ")
301 s = regexpNewline.ReplaceAllString(s, "\n")
302 s = regexpZeroWidth.ReplaceAllString(s, "")
303
304 inline := len(inlines) > 0 && inlines[len(inlines)-1]
305 ts := strings.TrimSpace(s)
306 if !inline && ts == "" {
307 break
308 }
309 if ts != "" || !strings.HasSuffix(s, " ") && !strings.HasSuffix(s, "\n") {
310 if quoteLevel > 0 {
311 q := strings.Repeat("> ", quoteLevel)
312 var sb strings.Builder
313 for s != "" {
314 o := strings.IndexByte(s, '\n')
315 if o < 0 {
316 o = len(s)
317 } else {
318 o++
319 }
320 sb.WriteString(q)
321 sb.WriteString(s[:o])
322 s = s[o:]
323 }
324 s = sb.String()
325 }
326 text += s
327 }
328 // We need to generate at most 256 characters of preview. The text we're gathering
329 // will be cleaned up, with quoting removed, so we'll end up with less. Hopefully,
330 // 4k bytes is enough to read.
331 if len(text) >= 4*1024 {
332 return false
333 }
334 }
335 // Ignored: DocumentNode, CommentNode, DoctypeNode, RawNode
336
337 for cn := range n.ChildNodes() {
338 if !walk(cn) {
339 break
340 }
341 }
342
343 return true
344 }
345
346 node, err := html.Parse(r)
347 if err != nil {
348 return "", fmt.Errorf("parsing html: %v", err)
349 }
350
351 // Build text.
352 walk(node)
353
354 text = strings.TrimSpace(text)
355 text = regexpSpace.ReplaceAllString(text, " ")
356 return text, err
357}
358