11 "golang.org/x/net/html"
12 "golang.org/x/net/html/atom"
14 "github.com/mjl-/mox/mlog"
15 "github.com/mjl-/mox/moxio"
18// Preview returns a message preview, based on the first text/plain or text/html
19// part of the message that has textual content. Preview returns at most 256
20// characters (possibly more bytes). Callers may want to truncate and trim trailing
21// whitespace before using the preview.
23// Preview logs at debug level for invalid messages. An error is only returned for
24// serious errors, like i/o errors.
25func (p Part) Preview(log mlog.Log) (string, error) {
// The disposition is looked at first: the comparison with "attachment" is
// case-insensitive (EqualFold). A failure to parse the disposition is logged
// at debug level.
28 // Don't use if Content-Disposition attachment.
29 disp, _, err := p.DispositionFilename()
31 log.Debugx("parsing disposition/filename", err)
32 } else if strings.EqualFold(disp, "attachment") {
// Media type and subtype joined with a slash. The case labels below are
// written in upper case (presumably Part stores both fields upper-cased -
// TODO confirm). The "/" label matches a part where both fields are empty.
36 mt := p.MediaType + "/" + p.MediaSubType
38 case "TEXT/PLAIN", "/":
// The reader for a plain text part gets a Limit of 100 KiB; what happens when
// the limit is reached is defined by moxio.LimitReader.
39 r := &moxio.LimitReader{R: p.ReaderUTF8OrBinary(), Limit: 100 * 1024}
40 s, err := previewText(r)
42 return "", fmt.Errorf("making preview from text part: %v", err)
// HTML parts get a larger Limit (1 MiB) than plain text parts.
47 r := &moxio.LimitReader{R: p.ReaderUTF8OrBinary(), Limit: 1024 * 1024}
49 // First turn the HTML into text.
50 s, err := previewHTML(r)
// As the log message says, an HTML parse failure is logged and ignored rather
// than treated as a serious error.
52 log.Debugx("parsing html part for preview (ignored)", err)
// The text extracted from the HTML goes through the same previewText
// processing as a plain text part, so quoting and signatures are handled once.
56 // Turn text body into a preview text.
57 s, err = previewText(strings.NewReader(s))
59 return "", fmt.Errorf("making preview from text from html: %v", err)
63 case "MULTIPART/ENCRYPTED":
// For any other type, the sub parts are tried in order by calling Preview
// recursively on each of them.
67 for i, sp := range p.Parts {
// NOTE(review): for MULTIPART/SIGNED, sub parts from index 1 on are handled
// specially; the guarded statement is outside this view. Presumably the
// iteration stops so only the first sub part is considered - confirm.
68 if mt == "MULTIPART/SIGNED" && i >= 1 {
71 s, err := sp.Preview(log)
// An error or a non-empty preview from a sub part ends the search (presumably
// by returning s and err - the statement is outside this view).
72 if err != nil || s != "" {
79// previewText returns a line the client can display next to the subject line
80// in a mailbox. It will replace quoted text, and any prefixing "On ... wrote:"
81// line with "[...]" so only new and useful information will be displayed.
82// Trailing signatures are not included.
83func previewText(r io.Reader) (string, error) {
84 // We look quite a few lines ahead, to detect trailing signatures that are followed by empty lines.
// NOTE(review): bufio.Scanner has a default maximum token size of 64 KiB
// (bufio.MaxScanTokenSize). Unless scanner.Buffer is called outside this view,
// a longer line stops scanning and is reported as bufio.ErrTooLong by
// scanner.Err(), which is what this function returns as its error.
86 scanner := bufio.NewScanner(r)
// ensureLines tops up the lookahead buffer "lines" to at most 10 entries, as
// far as input is available. Each line is stored with surrounding whitespace
// removed.
87 ensureLines := func() {
88 for len(lines) < 10 && scanner.Scan() {
89 lines = append(lines, strings.TrimSpace(scanner.Text()))
// isSnipped reports whether s is one of the markers commonly used to indicate
// removed text: "[...]", "[…]" (single ellipsis character) or "...".
94 isSnipped := func(s string) bool {
95 return s == "[...]" || s == "[…]" || s == "..."
// nextLineQuoted reports whether the line after index i in the lookahead
// buffer starts with ">" or is a snip marker. An empty line directly after i
// is special-cased first (the guarded statement is outside this view;
// presumably the empty line is skipped - confirm).
98 nextLineQuoted := func(i int) bool {
99 if i+1 < len(lines) && lines[i+1] == "" {
102 return i+1 < len(lines) && (strings.HasPrefix(lines[i+1], ">") || isSnipped(lines[i+1]))
105 // Remainder is a signature if we see a line consisting only of dashes, at least
106 // 2, and there are no more empty lines, and there aren't more than 5 lines left.
107 isSignature := func() bool {
// The first buffered line must start with "--" and contain nothing but dashes.
// Lines are already trimmed when appended in ensureLines, so as far as this
// view shows the inner TrimSpace does not change the outcome.
108 if len(lines) == 0 || !strings.HasPrefix(lines[0], "--") || strings.Trim(strings.TrimSpace(lines[0]), "-") != "" {
// Trailing empty lines are not counted as part of the candidate signature.
112 for len(l) > 0 && l[len(l)-1] == "" {
// Any empty line left in the remainder means this is not a signature.
118 return !slices.Contains(l, "")
// resultSnipped reports whether the preview collected so far already ends in
// a snip marker, so that no second marker is added right after it.
// NOTE(review): the first suffix includes the trailing newline, the second
// ("[…]") does not. Lines are appended to result as line + "\n" further down,
// so confirm whether "[…]\n" was intended for the second check.
123 resultSnipped := func() bool {
124 return strings.HasSuffix(result, "[...]\n") || strings.HasSuffix(result, "[…]")
// This handles an attribution that was wrapped over two lines: the first line
// starts with "On " but does not end in "wrote:", the second line ends in ":",
// and what follows the second line is quoted text.
127 // Quick check for initial wrapped "On ... wrote:" line.
128 if len(lines) > 3 && strings.HasPrefix(lines[0], "On ") && !strings.HasSuffix(lines[0], "wrote:") && strings.HasSuffix(lines[1], ":") && nextLineQuoted(1) {
// Main loop: runs while there are buffered lines and the remainder is not a
// signature. The post statement refills the lookahead buffer after every
// iteration.
134 for ; len(lines) > 0 && !isSignature(); ensureLines() {
// Quoted lines (starting with ">") are not checked against the
// already-snipped state per line but once via resultSnipped, so a run of
// quoted lines yields at most one marker (the statement adding the marker is
// outside this view).
136 if strings.HasPrefix(line, ">") {
137 if !resultSnipped() {
147 // Check for a "On <date>, <person> wrote:", we require digits before a quoted
148 // line, with an optional empty line in between. If we don't have any text yet, we
149 // don't require the digits.
150 if strings.HasSuffix(line, ":") && (strings.ContainsAny(line, "0123456789") || result == "") && nextLineQuoted(0) {
151 if !resultSnipped() {
157 // Skip possibly duplicate snipping by author.
158 if !isSnipped(line) || !resultSnipped() {
159 result += line + "\n"
// len counts bytes here, not characters. The range loop below yields the byte
// offsets (o) at which runes start; presumably it is used to cut the result at
// a character boundary - the cut itself is outside this view.
162 if len(result) > 250 {
170 for o = range result {
// The scanner error is returned together with whatever result was collected.
178 return result, scanner.Err()
181// Any text inside these html elements (recursively) is ignored.
// previewHTML indexes this map with the element's DataAtom and pushes the
// resulting bool on its "ignores" stack; atoms not in the map yield false.
182var ignoreAtoms = atomMap(
193// Inline elements don't force newlines at beginning & end of text in this element.
194// https://developer.mozilla.org/en-US/docs/Web/HTML/Element#inline_text_semantics
// previewHTML indexes this map with the element's DataAtom; atoms not in the
// map yield false and are treated as non-inline.
195var inlineAtoms = atomMap(
// The atoms this comment refers to are outside this view. previewHTML has a
// branch for atom.Td and atom.Th that tests for a trailing space instead of a
// trailing blank line, so presumably those are the ones meant - confirm.
228 // We treat these specially, inserting a space after them instead of a newline.
// atomMap returns a map keyed by atom, built from the atoms in l. It is used
// to initialize the package-level lookup tables ignoreAtoms and inlineAtoms.
// The statement inside the loop is outside this view; presumably each atom is
// mapped to true, so that indexing with an absent atom yields false - confirm.
233func atomMap(l ...atom.Atom) map[atom.Atom]bool {
234 m := map[atom.Atom]bool{}
235 for _, a := range l {
// Patterns used by previewHTML when cleaning up text, compiled once at package
// initialization.
//
// regexpSpace matches a run of one or more spaces and/or tabs.
241var regexpSpace = regexp.MustCompile(`[ \t]+`) // Replaced with single space.
// regexpNewline matches three or more consecutive newlines; one or two
// newlines in a row are left as they are.
242var regexpNewline = regexp.MustCompile(`\n\n\n+`) // Replaced with single newline.
// regexpZeroWidth matches a run of two or more characters from the set U+00A0
// (no-break space), U+200B (zero width space), U+200C (zero width non-joiner)
// and U+200D (zero width joiner). A single such character does not match. The
// pattern is an interpreted string literal, so the \u escapes are resolved by
// the Go compiler and the regexp package sees the characters themselves.
243var regexpZeroWidth = regexp.MustCompile("[\u00a0\u200b\u200c\u200d][\u00a0\u200b\u200c\u200d]+") // Removed, combinations don't make sense, generated.
// previewHTML parses r as HTML and returns the text gathered from its text
// nodes, for further processing by previewText. Text inside elements listed in
// ignoreAtoms is left out. Parse failures are returned wrapped in a
// "parsing html" error.
245func previewHTML(r io.Reader) (string, error) {
// The stacks "ignores" and "inlines" get one entry pushed per element node
// (see the ElementNode case) and are popped again by reslicing further down.
246 // Stack/state, based on elements.
250 var text string // Collecting text.
251 var err error // Set when walking DOM.
254 // We'll walk the DOM nodes, keeping track of whether we are ignoring text, and
255 // whether we are in an inline or block element, and building up the text. We stop
256 // when we have enough data, returning false in that case.
// walk is declared before it is assigned so the closure can refer to itself.
257 var walk func(n *html.Node) bool
258 walk = func(n *html.Node) bool {
261 err = fmt.Errorf("unexpected error node")
264 case html.ElementNode:
// Push the state for this element. Both lookups yield false for atoms that are
// not in the respective map.
265 ignores = append(ignores, ignoreAtoms[n.DataAtom])
266 inline := inlineAtoms[n.DataAtom]
267 inlines = append(inlines, inline)
// Blockquote is tested twice. The guarded statements are outside this view;
// presumably quoteLevel (used for the "> " prefix below) is raised here and
// lowered again at the second test - confirm.
268 if n.DataAtom == atom.Blockquote {
272 if n.DataAtom == atom.Blockquote {
// For a non-inline element, the text is checked for already ending in a blank
// line ("\n\n"); for table cells (td/th) it is checked for ending in a space.
// The separators that get appended are outside this view.
275 if !inline && !strings.HasSuffix(text, "\n\n") {
277 } else if (n.DataAtom == atom.Td || n.DataAtom == atom.Th) && !strings.HasSuffix(text, " ") {
// Pop the state pushed for this element.
281 ignores = ignores[:len(ignores)-1]
282 inlines = inlines[:len(inlines)-1]
// The whole stack is searched, not only its top: one ignored ancestor anywhere
// up the tree is enough for this condition to hold.
286 if slices.Contains(ignores, true) {
289 // Collapse all kinds of weird whitespace-like characters into a space, except for newline and ignoring carriage return.
291 for _, c := range n.Data {
294 } else if c == '\t' {
// Normalize: runs of spaces/tabs become one space, three or more newlines
// become one newline, and runs of two or more no-break/zero-width characters
// are removed.
300 s = regexpSpace.ReplaceAllString(s, " ")
301 s = regexpNewline.ReplaceAllString(s, "\n")
302 s = regexpZeroWidth.ReplaceAllString(s, "")
// Whether the innermost enclosing element is inline. With an empty stack this
// is false.
304 inline := len(inlines) > 0 && inlines[len(inlines)-1]
305 ts := strings.TrimSpace(s)
// Text that is only whitespace and sits directly in a non-inline element is
// special-cased (the guarded statement is outside this view).
306 if !inline && ts == "" {
// In Go, && binds tighter than ||, so this reads as:
// ts != "" || (s does not end in a space && s does not end in a newline).
309 if ts != "" || !strings.HasSuffix(s, " ") && !strings.HasSuffix(s, "\n") {
// Prefix for quoted text: one "> " per quote level. The builder below
// assembles s piece by piece, split at newlines.
311 q := strings.Repeat("> ", quoteLevel)
312 var sb strings.Builder
314 o := strings.IndexByte(s, '\n')
321 sb.WriteString(s[:o])
328 // We need to generate at most 256 characters of preview. The text we're gathering
329 // will be cleaned up, with quoting removed, so we'll end up with less. Hopefully,
330 // 4k bytes is enough to read.
331 if len(text) >= 4*1024 {
335 // Ignored: DocumentNode, CommentNode, DoctypeNode, RawNode
// ChildNodes is ranged over directly, i.e. it is used as an iterator over the
// child nodes of n.
337 for cn := range n.ChildNodes() {
346 node, err := html.Parse(r)
348 return "", fmt.Errorf("parsing html: %v", err)
// Final cleanup of the gathered text: strip leading/trailing whitespace, then
// collapse remaining runs of spaces/tabs into a single space.
354 text = strings.TrimSpace(text)
355 text = regexpSpace.ReplaceAllString(text, " ")