12 "golang.org/x/net/html"
13 "golang.org/x/net/html/atom"
15 "github.com/mjl-/mox/mlog"
16 "github.com/mjl-/mox/moxio"
19// Preview returns a message preview, based on the first text/plain or text/html
20// part of the message that has textual content. Preview returns at most 256
21// characters (possibly more bytes). Callers may want to truncate and trim trailing
22// whitespace before using the preview.
24// Preview logs at debug level for invalid messages. An error is only returned for
25// serious errors, like i/o errors.
//
// For parts that are not handled as text themselves, the sub parts are tried in
// order by calling Preview on them recursively.
26func (p Part) Preview(log mlog.Log) (string, error) {
29 // Don't use if Content-Disposition attachment.
// A failure to parse the disposition is only logged, the part is still considered.
30 disp, _, err := p.DispositionFilename()
32 log.Debugx("parsing disposition/filename", err)
33 } else if strings.EqualFold(disp, "attachment") {
// "/" is the result when both MediaType and MediaSubType are empty. Such a part is
// handled in the same case as text/plain below.
37 mt := p.MediaType + "/" + p.MediaSubType
39 case "TEXT/PLAIN", "/":
// Read at most 1 MiB of the part while looking for preview text.
40 r := &moxio.LimitReader{R: p.ReaderUTF8OrBinary(), Limit: 1024 * 1024}
41 s, err := previewText(r)
// Hitting the read limit is logged at debug level, separately from other errors.
// NOTE(review): presumably an empty preview without error is returned in that case — confirm.
43 if errors.Is(err, moxio.ErrLimit) {
44 log.Debug("no preview in first mb of text message")
47 return "", fmt.Errorf("making preview from text part: %v", err)
// Same 1 MiB limit for the html part.
52 r := &moxio.LimitReader{R: p.ReaderUTF8OrBinary(), Limit: 1024 * 1024}
54 // First turn the HTML into text.
55 s, err := previewHTML(r)
// Errors from handling the HTML are logged at debug level and ignored.
57 log.Debugx("parsing html part for preview (ignored)", err)
61 // Turn text body into a preview text.
62 s, err = previewText(strings.NewReader(s))
64 if errors.Is(err, moxio.ErrLimit) {
65 log.Debug("no preview in first mb of html message")
68 return "", fmt.Errorf("making preview from text from html: %v", err)
// NOTE(review): presumably no preview is generated for encrypted messages — confirm.
72 case "MULTIPART/ENCRYPTED":
// Try the sub parts in order, looking for the first one that yields a preview or an error.
76 for i, sp := range p.Parts {
// NOTE(review): for multipart/signed only sub part 0 gets past this check;
// presumably the later part is the signature — confirm.
77 if mt == "MULTIPART/SIGNED" && i >= 1 {
80 s, err := sp.Preview(log)
81 if err != nil || s != "" {
// previewText returns a short text the client can display next to the subject
// line in a mailbox, built from the plain text read from r.
//
// Quoted lines (starting with ">") and a preceding "On ... wrote:" attribution
// line are replaced with a single "[...]" line, so only new and useful
// information is displayed. Empty lines are dropped, consecutive snip markers are
// collapsed into one, and a trailing signature is not included. Each line that is
// kept is trimmed of surrounding whitespace and terminated with a newline.
//
// The result holds at most 256 characters (possibly more bytes). The returned
// error is the error from reading r, if any. Text collected before the error is
// still returned.
func previewText(r io.Reader) (string, error) {
	// We look quite a few lines ahead, to recognize a trailing signature that is
	// followed by empty lines.
	var lines []string
	scanner := bufio.NewScanner(r)
	ensureLines := func() {
		for len(lines) < 10 && scanner.Scan() {
			lines = append(lines, strings.TrimSpace(scanner.Text()))
		}
	}
	ensureLines()

	// isSnipped reports whether s is a marker an author may have written for
	// text they removed.
	isSnipped := func(s string) bool {
		return s == "[...]" || s == "[…]" || s == "..."
	}

	// nextLineQuoted reports whether the line following lines[i], optionally after
	// one empty line, is a quoted line or a snip marker.
	nextLineQuoted := func(i int) bool {
		if i+1 < len(lines) && lines[i+1] == "" {
			i++
		}
		return i+1 < len(lines) && (strings.HasPrefix(lines[i+1], ">") || isSnipped(lines[i+1]))
	}

	// The remainder is a signature if the current line consists of only dashes, at
	// least 2 of them, and, ignoring trailing empty lines, fewer than 5 lines
	// follow, none of them empty.
	isSignature := func() bool {
		if len(lines) == 0 || !strings.HasPrefix(lines[0], "--") || strings.Trim(strings.TrimSpace(lines[0]), "-") != "" {
			return false
		}
		l := lines[1:]
		for len(l) > 0 && l[len(l)-1] == "" {
			l = l[:len(l)-1]
		}
		if len(l) >= 5 {
			return false
		}
		return !slices.Contains(l, "")
	}

	result := ""

	// resultSnipped reports whether result currently ends with a snip marker line.
	// Every line added to result is terminated with a newline, so the newline must
	// be part of both suffixes. Without it, an author-written "[…]" is never
	// recognized and a second marker is added for the quoted text that follows.
	resultSnipped := func() bool {
		return strings.HasSuffix(result, "[...]\n") || strings.HasSuffix(result, "[…]\n")
	}

	// Quick check for initial wrapped "On ... wrote:" line.
	if len(lines) > 3 && strings.HasPrefix(lines[0], "On ") && !strings.HasSuffix(lines[0], "wrote:") && strings.HasSuffix(lines[1], ":") && nextLineQuoted(1) {
		result = "[...]\n"
		lines = lines[3:]
		ensureLines()
	}

	for ; len(lines) > 0 && !isSignature(); ensureLines() {
		line := lines[0]
		if strings.HasPrefix(line, ">") {
			if !resultSnipped() {
				result += "[...]\n"
			}
			lines = lines[1:]
			continue
		}
		if line == "" {
			lines = lines[1:]
			continue
		}
		// Check for a "On <date>, <person> wrote:" line. We require digits in a line
		// before a quoted line, with an optional empty line in between. If we don't
		// have any text yet, we don't require the digits.
		if strings.HasSuffix(line, ":") && (strings.ContainsAny(line, "0123456789") || result == "") && nextLineQuoted(0) {
			if !resultSnipped() {
				result += "[...]\n"
			}
			lines = lines[1:]
			continue
		}
		// Skip possibly duplicate snipping by author.
		if !isSnipped(line) || !resultSnipped() {
			result += line + "\n"
		}
		lines = lines[1:]
		// Enough text for a preview, no need to read on.
		if len(result) > 250 {
			break
		}
	}

	// Limit the result to 256 characters, counting characters and not bytes, by
	// cutting at the byte offset of the 257th character.
	var o, n int
	for o = range result {
		n++
		if n > 256 {
			result = result[:o]
			break
		}
	}

	return result, scanner.Err()
}
190// ignoreAtoms holds the html elements for which any text inside them, including
// text in nested elements, is ignored.
191var ignoreAtoms = atomMap(
202// inlineAtoms holds the inline elements. Inline elements don't force newlines at
// the beginning and end of the text in the element.
203// https://developer.mozilla.org/en-US/docs/Web/HTML/Element#inline_text_semantics
204var inlineAtoms = atomMap(
237 // We treat these specially, inserting a space after them instead of a newline.
// atomMap builds a lookup map from the given atoms. ignoreAtoms and inlineAtoms
// are built with it and are indexed by the atom of an element.
// NOTE(review): presumably each given atom maps to true — confirm.
242func atomMap(l ...atom.Atom) map[atom.Atom]bool {
243 m := map[atom.Atom]bool{}
244 for _, a := range l {
250var regexpSpace = regexp.MustCompile(`[ \t]+`) // Run of spaces and/or tabs. Replaced with single space.
251var regexpNewline = regexp.MustCompile(`\n\n\n+`) // Three or more consecutive newlines. Replaced with single newline.
252var regexpZeroWidth = regexp.MustCompile("[\u00a0\u200b\u200c\u200d][\u00a0\u200b\u200c\u200d]+") // Two or more consecutive non-breaking spaces, zero-width spaces or zero-width (non-)joiners. Removed, combinations don't make sense, generated.
// previewHTML parses the HTML read from r and collects the text in it, for turning
// into a preview. Text inside ignored elements (see ignoreAtoms) is skipped.
// Collecting stops once at least 4k bytes of text have been gathered. An error is
// returned if the HTML cannot be parsed.
254func previewHTML(r io.Reader) (string, error) {
255 // Stack/state, based on elements.
259 var text string // Collecting text.
260 var err error // Set when walking DOM.
263 // We'll walk the DOM nodes, keeping track of whether we are ignoring text, and
264 // whether we are in an inline or block element, and building up the text. We stop
265 // when we have enough data, returning false in that case.
266 var walk func(n *html.Node) bool
267 walk = func(n *html.Node) bool {
270 err = fmt.Errorf("unexpected error node")
273 case html.ElementNode:
// Push the ignore and inline state for this element on the stacks. The entries
// are popped again below.
274 ignores = append(ignores, ignoreAtoms[n.DataAtom])
275 inline := inlineAtoms[n.DataAtom]
276 inlines = append(inlines, inline)
// NOTE(review): quoteLevel presumably tracks blockquote nesting, adjusted at the
// two blockquote checks below — confirm.
277 if n.DataAtom == atom.Blockquote {
281 if n.DataAtom == atom.Blockquote {
// Block elements are checked for text ending in an empty line, table cells for
// text ending in a space.
284 if !inline && !strings.HasSuffix(text, "\n\n") {
286 } else if (n.DataAtom == atom.Td || n.DataAtom == atom.Th) && !strings.HasSuffix(text, " ") {
290 ignores = ignores[:len(ignores)-1]
291 inlines = inlines[:len(inlines)-1]
// Text is ignored if any element on the stack is an ignored element.
295 if slices.Contains(ignores, true) {
298 // Collapse all kinds of weird whitespace-like characters into a space, except for newline and ignoring carriage return.
300 for _, c := range n.Data {
303 } else if c == '\t' {
309 s = regexpSpace.ReplaceAllString(s, " ")
310 s = regexpNewline.ReplaceAllString(s, "\n")
311 s = regexpZeroWidth.ReplaceAllString(s, "")
// Whether the innermost element we are in is an inline element.
313 inline := len(inlines) > 0 && inlines[len(inlines)-1]
314 ts := strings.TrimSpace(s)
315 if !inline && ts == "" {
318 if ts != "" || !strings.HasSuffix(s, " ") && !strings.HasSuffix(s, "\n") {
// Prefix for quoted text: one "> " for each level of quoting.
320 q := strings.Repeat("> ", quoteLevel)
321 var sb strings.Builder
323 o := strings.IndexByte(s, '\n')
330 sb.WriteString(s[:o])
337 // We need to generate at most 256 characters of preview. The text we're gathering
338 // will be cleaned up, with quoting removed, so we'll end up with less. Hopefully,
339 // 4k bytes is enough to read.
340 if len(text) >= 4*1024 {
344 // Ignored: DocumentNode, CommentNode, DoctypeNode, RawNode
346 for cn := range n.ChildNodes() {
355 node, err := html.Parse(r)
357 return "", fmt.Errorf("parsing html: %v", err)
// Final cleanup of the collected text: trim surrounding whitespace and collapse
// runs of spaces and tabs into a single space.
363 text = strings.TrimSpace(text)
364 text = regexpSpace.ReplaceAllString(text, " ")