1package message
2
// todo: allow more invalid content-type values, we now stop parsing on: empty media type (eg "content-type: ; name=..."), empty value for property (eg "charset="), missing quotes for characters that should be quoted (eg boundary containing "=" but without quotes), duplicate properties (two charsets), empty pairs (eg "text/html;;").
4// todo: should we be forgiving when closing boundary in multipart message is missing? seems like spam messages do this...
5// todo: should we allow base64 messages where a line starts with a space? and possibly more whitespace. is happening in messages. coreutils base64 accepts it, encoding/base64 does not.
6// todo: handle comments in headers?
7// todo: should we just always store messages with \n instead of \r\n? \r\n seems easier for use with imap.
8// todo: can use a cleanup
9
10import (
11 "bufio"
12 "bytes"
13 "encoding/base64"
14 "errors"
15 "fmt"
16 "io"
17 "log/slog"
18 "mime"
19 "mime/quotedprintable"
20 "net/mail"
21 "net/textproto"
22 "strings"
23 "time"
24 "unicode"
25
26 "golang.org/x/text/encoding/ianaindex"
27
28 "github.com/mjl-/mox/mlog"
29 "github.com/mjl-/mox/smtp"
30 "slices"
31)
32
// Pedantic enables stricter parsing. It is a package-wide switch that is
// consulted in addition to the per-message strict flag, e.g. for the maximum
// line length, detection of bare carriage returns and rejecting malformed
// content-type headers.
var Pedantic bool

// Errors that callers can recognize with errors.Is.
var (
	// ErrBadContentType is wrapped in the error returned for a content-type header
	// that cannot be parsed while parsing strictly or pedantically.
	ErrBadContentType = errors.New("bad content-type")
	// ErrHeader is wrapped in the error returned for a header with invalid syntax.
	ErrHeader = errors.New("bad message header")
)

// Internal parsing errors.
var (
	// Returned when the next part is requested from a part without a boundary.
	errNotMultipart = errors.New("not a multipart message")
	// Returned when the first boundary line after the preamble is a closing boundary.
	errFirstBoundCloses = errors.New("first boundary cannot be finishing boundary")
	// Returned when a line exceeds the maximum line length in effect.
	errLineTooLong = errors.New("line too long")
	// Returned for a multipart content-type without a usable boundary parameter.
	errMissingBoundaryParam   = errors.New("missing/empty boundary content-type parameter")
	errMissingClosingBoundary = errors.New("eof without closing boundary")
	// Returned when a \n is found that is not preceded by \r.
	errBareLF = errors.New("invalid bare line feed")
	// Returned when a \r is found that is not followed by \n, only when parsing strictly or pedantically.
	errBareCR = errors.New("invalid bare carriage return")
	// Returned when input ends where a crlf-terminated line was required.
	errUnexpectedEOF = errors.New("unexpected eof")
)

// If set, during tests, attempts to reparse a part will cause an error, because sequentially reading parts should not lead to reparsing.
var enforceSequential bool
54
// Part represents a whole mail message, or a part of a multipart message. It
// is designed to handle IMAP requirements efficiently.
//
// The exported fields describe the structure of the part: offsets into the
// message, sizes and the content-* headers. The unexported fields hold the
// reader and the state needed while parsing parts sequentially.
type Part struct {
	BoundaryOffset int64 // Offset in message where bound starts. -1 for top-level message.
	HeaderOffset   int64 // Offset in message file where header starts.
	BodyOffset     int64 // Offset in message file where body starts.
	EndOffset      int64 // Where body of part ends. Set when part is fully read.
	RawLineCount   int64 // Number of lines in raw, undecoded, body of part. Set when part is fully read.
	DecodedSize    int64 // Number of octets when decoded. If this is a text mediatype, lines ending only in LF are changed to end in CRLF and DecodedSize reflects that.

	MediaType               string            // From Content-Type, upper case. E.g. "TEXT". Can be empty because content-type may be absent. In this case, the part may be treated as TEXT/PLAIN.
	MediaSubType            string            // From Content-Type, upper case. E.g. "PLAIN".
	ContentTypeParams       map[string]string // E.g. holds "boundary" for multipart messages. Has lower-case keys, and original case values.
	ContentID               string            // From the Content-Id header, as found in the header.
	ContentDescription      string            // From the Content-Description header, as found in the header.
	ContentTransferEncoding string            // In upper case.
	Envelope                *Envelope         // Email message headers. Not for non-message parts.

	Parts []Part // Parts if this is a multipart.

	// Only for message/rfc822 and message/global. This part may have a buffer as
	// backing io.ReaderAt, because a message/global can have a non-identity
	// content-transfer-encoding. This part has a nil parent.
	Message *Part

	r               io.ReaderAt          // Source of the message data, all offsets are relative to this reader.
	header          textproto.MIMEHeader // Parsed header.
	nextBoundOffset int64                // If >= 0, the offset where the next part header starts. We can set this when a user fully reads each part.
	lastBoundOffset int64                // Start of header of last/previous part. Used to skip a part if ParseNextPart is called and nextBoundOffset is -1.
	parent          *Part                // Parent part, for getting bound from, and setting nextBoundOffset when a part has finished reading. Only for subparts, not top-level parts.
	bound           []byte               // Only set if valid multipart with boundary, includes leading --, excludes \r\n.
	strict          bool                 // If set, valid crlf line endings are verified when reading body.
}
88
89// todo: have all Content* fields in Part?
90// todo: make Address contain a type Localpart and dns.Domain?
91// todo: if we ever make a major change and reparse all parts, switch to lower-case values if not too troublesome.
92
// Envelope holds the basic/common message headers as used in IMAP4.
//
// The address lists are nil when the corresponding header is absent or cannot
// be parsed as an address list.
type Envelope struct {
	Date      time.Time // From the Date header. Zero if the year is beyond 9999; converted to UTC if the zone offset is 24 hours or more.
	Subject   string    // Q/B-word-decoded.
	From      []Address
	Sender    []Address
	ReplyTo   []Address
	To        []Address
	CC        []Address
	BCC       []Address
	InReplyTo string // From In-Reply-To header, includes <>.
	MessageID string // From Message-Id header, includes <>.
}
106
// Address as used in From and To headers.
//
// User and Host are both left empty when the address could not be parsed as an
// SMTP address; Name is still set in that case.
type Address struct {
	Name string // Free-form name for display in mail applications.
	User string // Localpart, encoded as string. Must be parsed before using as Localpart.
	Host string // Domain in ASCII.
}
113
114// Parse reads the headers of the mail message and returns a part.
115// A part provides access to decoded and raw contents of a message and its multiple parts.
116//
117// If strict is set, fewer attempts are made to continue parsing when errors are
118// encountered, such as with invalid content-type headers or bare carriage returns.
119func Parse(elog *slog.Logger, strict bool, r io.ReaderAt) (Part, error) {
120 log := mlog.New("message", elog)
121 return newPart(log, strict, r, 0, nil)
122}
123
124// EnsurePart parses a part as with Parse, but ensures a usable part is always
125// returned, even if error is non-nil. If a parse error occurs, the message is
126// returned as application/octet-stream, and headers can still be read if they
127// were valid.
128//
129// If strict is set, fewer attempts are made to continue parsing when errors are
130// encountered, such as with invalid content-type headers or bare carriage returns.
131func EnsurePart(elog *slog.Logger, strict bool, r io.ReaderAt, size int64) (Part, error) {
132 log := mlog.New("message", elog)
133 p, err := Parse(log.Logger, strict, r)
134 if err == nil {
135 err = p.Walk(log.Logger, nil)
136 }
137 if err != nil {
138 np, err2 := fallbackPart(p, r, size)
139 if err2 != nil {
140 err = err2
141 }
142 p = np
143 }
144 return p, err
145}
146
147func fallbackPart(p Part, r io.ReaderAt, size int64) (Part, error) {
148 np := Part{
149 HeaderOffset: p.HeaderOffset,
150 BodyOffset: p.BodyOffset,
151 EndOffset: size,
152 MediaType: "APPLICATION",
153 MediaSubType: "OCTET-STREAM",
154 ContentTypeParams: p.ContentTypeParams,
155 ContentID: p.ContentID,
156 ContentDescription: p.ContentDescription,
157 ContentTransferEncoding: p.ContentTransferEncoding,
158 Envelope: p.Envelope,
159 // We don't keep:
160 // - BoundaryOffset: irrelevant for top-level message.
161 // - RawLineCount and DecodedSize: set below.
162 // - Parts: we are not treating this as a multipart message.
163 }
164 np.SetReaderAt(r)
165 // By reading body, the number of lines and decoded size will be set.
166 _, err := io.Copy(io.Discard, np.Reader())
167 return np, err
168}
169
170// SetReaderAt sets r as reader for this part and all its sub parts, recursively.
171// No reader is set for any Message subpart, see SetMessageReaderAt.
172func (p *Part) SetReaderAt(r io.ReaderAt) {
173 if r == nil {
174 panic("nil reader")
175 }
176 p.r = r
177 for i := range p.Parts {
178 pp := &p.Parts[i]
179 pp.SetReaderAt(r)
180 }
181}
182
183// SetMessageReaderAt sets a reader on p.Message, which must be non-nil.
184func (p *Part) SetMessageReaderAt() error {
185 // todo: if p.Message does not contain any non-identity content-transfer-encoding, we should set an offsetReader of p.Message, recursively.
186 buf, err := io.ReadAll(p.Reader())
187 if err != nil {
188 return err
189 }
190 p.Message.SetReaderAt(bytes.NewReader(buf))
191 return nil
192}
193
194// Walk through message, decoding along the way, and collecting mime part offsets and sizes, and line counts.
195func (p *Part) Walk(elog *slog.Logger, parent *Part) error {
196 log := mlog.New("message", elog)
197
198 if len(p.bound) == 0 {
199 if p.MediaType == "MESSAGE" && (p.MediaSubType == "RFC822" || p.MediaSubType == "GLOBAL") {
200 // todo: don't read whole submessage in memory...
201 buf, err := io.ReadAll(p.Reader())
202 if err != nil {
203 return err
204 }
205 br := bytes.NewReader(buf)
206 mp, err := Parse(log.Logger, p.strict, br)
207 if err != nil {
208 return fmt.Errorf("parsing embedded message: %w", err)
209 }
210 if err := mp.Walk(log.Logger, nil); err != nil {
211 // If this is a DSN and we are not in pedantic mode, accept unexpected end of
212 // message. This is quite common because MTA's sometimes just truncate the original
213 // message in a place that makes the message invalid.
214 if errors.Is(err, errUnexpectedEOF) && !Pedantic && parent != nil && len(parent.Parts) >= 3 && p == &parent.Parts[2] && parent.MediaType == "MULTIPART" && parent.MediaSubType == "REPORT" {
215 mp, err = fallbackPart(mp, br, int64(len(buf)))
216 if err != nil {
217 return fmt.Errorf("parsing invalid embedded message: %w", err)
218 }
219 } else {
220 return fmt.Errorf("parsing parts of embedded message: %w", err)
221 }
222 }
223 // todo: if mp does not contain any non-identity content-transfer-encoding, we should set an offsetReader of p.r on mp, recursively.
224 p.Message = &mp
225 return nil
226 }
227 _, err := io.Copy(io.Discard, p.Reader())
228 return err
229 }
230
231 for {
232 pp, err := p.ParseNextPart(log.Logger)
233 if err == io.EOF {
234 return nil
235 }
236 if err != nil {
237 return err
238 }
239 if err := pp.Walk(log.Logger, p); err != nil {
240 return err
241 }
242 }
243}
244
245// String returns a debugging representation of the part.
246func (p *Part) String() string {
247 return fmt.Sprintf("&Part{%s/%s offsets %d/%d/%d/%d lines %d decodedsize %d next %d last %d bound %q parts %v}", p.MediaType, p.MediaSubType, p.BoundaryOffset, p.HeaderOffset, p.BodyOffset, p.EndOffset, p.RawLineCount, p.DecodedSize, p.nextBoundOffset, p.lastBoundOffset, p.bound, p.Parts)
248}
249
// newPart parses a new part, which can be the top-level message.
//
// offset is the bound offset for parts, and the start of message for top-level
// messages. parent indicates if this is a top-level message (nil) or a sub-part
// (non-nil).
//
// For a sub-part, the boundary line at offset is read and verified first. Then
// the header lines are collected and parsed, the media type and other content-*
// fields are set, and for a top-level message the envelope is parsed. For a
// multipart, the preamble is skipped, leaving nextBoundOffset and lastBoundOffset
// at the first boundary line.
//
// If an error occurs, p's exported values can still be relevant. EnsurePart uses these values.
func newPart(log mlog.Log, strict bool, r io.ReaderAt, offset int64, parent *Part) (p Part, rerr error) {
	if r == nil {
		panic("nil reader")
	}
	p = Part{
		BoundaryOffset: -1,
		EndOffset:      -1,
		r:              r,
		parent:         parent,
		strict:         strict,
	}

	b := &bufAt{strict: strict, r: r, offset: offset}

	// A sub-part must start with a non-closing boundary line of its parent.
	if parent != nil {
		p.BoundaryOffset = offset
		if line, _, err := b.ReadLine(true); err != nil {
			return p, err
		} else if match, finish := checkBound(line, parent.bound); !match {
			return p, fmt.Errorf("missing bound")
		} else if finish {
			return p, fmt.Errorf("new part for closing boundary")
		}
	}

	// Collect header.
	p.HeaderOffset = b.offset
	p.BodyOffset = b.offset
	hb := &bytes.Buffer{}
	for {
		line, _, err := b.ReadLine(true)
		if err == io.EOF {
			// No body is valid.
			break
		}
		if err != nil {
			return p, fmt.Errorf("reading header line: %w", err)
		}
		hb.Write(line)
		if len(line) == 2 {
			break // crlf
		}
	}
	// The body starts after the header lines, including the empty separator line if present.
	p.BodyOffset = b.offset

	// Don't attempt to parse empty header, mail.ReadMessage doesn't like it.
	if p.HeaderOffset == p.BodyOffset {
		p.header = textproto.MIMEHeader{}
	} else {
		h, err := parseHeader(hb)
		if err != nil {
			return p, fmt.Errorf("parsing header: %w", err)
		}
		p.header = h
	}

	ct := p.header.Get("Content-Type")
	mt, params, err := mime.ParseMediaType(ct)
	if err != nil && ct != "" {
		if Pedantic || strict {
			return p, fmt.Errorf("%w: %s: %q", ErrBadContentType, err, ct)
		}

		// Try parsing just a content-type, ignoring parameters.
		// ../rfc/2045:628
		ct = strings.TrimSpace(strings.SplitN(ct, ";", 2)[0])
		t := strings.SplitN(ct, "/", 2)
		// isToken returns whether s is non-empty and has no control, non-ascii or
		// separator characters.
		isToken := func(s string) bool {
			const separators = `()<>@,;:\\"/[]?= ` // ../rfc/2045:663
			for _, c := range s {
				if c < 0x20 || c >= 0x80 || strings.ContainsRune(separators, c) {
					return false
				}
			}
			return len(s) > 0
		}
		// We cannot recover content-type of multipart, we won't have a boundary.
		if len(t) == 2 && isToken(t[0]) && !strings.EqualFold(t[0], "multipart") && isToken(t[1]) {
			p.MediaType = strings.ToUpper(t[0])
			p.MediaSubType = strings.ToUpper(t[1])
		} else {
			p.MediaType = "APPLICATION"
			p.MediaSubType = "OCTET-STREAM"
		}
		log.Debugx("malformed content-type, attempting to recover and continuing", err,
			slog.String("contenttype", p.header.Get("Content-Type")),
			slog.String("mediatype", p.MediaType),
			slog.String("mediasubtype", p.MediaSubType))
	} else if mt != "" {
		t := strings.SplitN(strings.ToUpper(mt), "/", 2)
		if len(t) != 2 {
			// A media type without a subtype.
			if Pedantic || strict {
				return p, fmt.Errorf("bad content-type: %q (content-type %q)", mt, ct)
			}
			log.Debug("malformed media-type, ignoring and continuing", slog.String("type", mt))
			p.MediaType = "APPLICATION"
			p.MediaSubType = "OCTET-STREAM"
		} else {
			p.MediaType = t[0]
			p.MediaSubType = t[1]
			p.ContentTypeParams = params
		}
	}

	p.ContentID = p.header.Get("Content-Id")
	p.ContentDescription = p.header.Get("Content-Description")
	p.ContentTransferEncoding = strings.ToUpper(p.header.Get("Content-Transfer-Encoding"))

	// Only the top-level message gets an envelope.
	if parent == nil {
		p.Envelope, err = parseEnvelope(log, mail.Header(p.header))
		if err != nil {
			return p, err
		}
	}

	if p.MediaType == "MULTIPART" {
		s := params["boundary"]
		if s == "" {
			return p, errMissingBoundaryParam
		}
		p.bound = append([]byte("--"), s...)

		// Discard preamble, before first boundary.
		for {
			line, _, err := b.PeekLine(true)
			if err != nil {
				return p, fmt.Errorf("parsing line for part preamble: %w", err)
			}
			// Line only needs boundary prefix, not exact match. ../rfc/2046:1103
			// Well, for compatibility, we require whitespace after the boundary. Because some
			// software use the same boundary but with text appended for sub parts.
			if match, finish := checkBound(line, p.bound); match {
				if finish {
					return p, errFirstBoundCloses
				}
				break
			}
			// Consume the preamble line that was just peeked at.
			b.ReadLine(true)
		}
		// The boundary line itself was only peeked at, so b.offset is at its start.
		p.nextBoundOffset = b.offset
		p.lastBoundOffset = b.offset
	}

	return p, nil
}
398
399// Header returns the parsed header of this part.
400//
401// Returns a ErrHeader for messages with invalid header syntax.
402func (p *Part) Header() (textproto.MIMEHeader, error) {
403 if p.header != nil {
404 return p.header, nil
405 }
406 if p.HeaderOffset == p.BodyOffset {
407 p.header = textproto.MIMEHeader{}
408 return p.header, nil
409 }
410 h, err := parseHeader(p.HeaderReader())
411 p.header = h
412 return h, err
413}
414
415// HeaderReader returns a reader for the header section of this part, including ending bare CRLF.
416func (p *Part) HeaderReader() io.Reader {
417 return io.NewSectionReader(p.r, p.HeaderOffset, p.BodyOffset-p.HeaderOffset)
418}
419
420// parse a header, only call this on non-empty input (even though that is a valid header).
421func parseHeader(r io.Reader) (textproto.MIMEHeader, error) {
422 // We read using mail.ReadMessage instead of textproto.ReadMIMEHeaders because the
423 // first handles email messages properly, while the second only works for HTTP
424 // headers.
425 var zero textproto.MIMEHeader
426
427 // We read the header and add the optional \r\n header/body separator. If the \r\n
428 // is missing, parsing with Go <1.21 results in an EOF error.
429 // todo: directly parse from reader r when Go 1.20 is no longer supported.
430 buf, err := io.ReadAll(r)
431 if err != nil {
432 return zero, err
433 }
434 if bytes.HasSuffix(buf, []byte("\r\n")) && !bytes.HasSuffix(buf, []byte("\r\n\r\n")) {
435 buf = append(buf, "\r\n"...)
436 }
437 msg, err := mail.ReadMessage(bytes.NewReader(buf))
438 if err != nil {
439 // Recognize parsing errors from net/mail.ReadMessage.
440 // todo: replace with own message parsing code that returns proper error types.
441 errstr := err.Error()
442 if strings.HasPrefix(errstr, "malformed initial line:") || strings.HasPrefix(errstr, "malformed header line:") {
443 err = fmt.Errorf("%w: %v", ErrHeader, err)
444 }
445 return zero, err
446 }
447 return textproto.MIMEHeader(msg.Header), nil
448}
449
450var wordDecoder = mime.WordDecoder{
451 CharsetReader: func(charset string, r io.Reader) (io.Reader, error) {
452 switch strings.ToLower(charset) {
453 case "", "us-ascii", "utf-8":
454 return r, nil
455 }
456 enc, _ := ianaindex.MIME.Encoding(charset)
457 if enc == nil {
458 enc, _ = ianaindex.IANA.Encoding(charset)
459 }
460 if enc == nil {
461 return r, fmt.Errorf("unknown charset %q", charset)
462 }
463 return enc.NewDecoder().Reader(r), nil
464 },
465}
466
467func parseEnvelope(log mlog.Log, h mail.Header) (*Envelope, error) {
468 date, _ := h.Date()
469
470 // We currently marshal this field to JSON. But JSON cannot represent all
471 // time.Time. Time zone of 24:00 was seen in the wild. We won't try for extreme
472 // years, but we can readjust timezones.
473 // todo: remove this once we no longer store using json.
474 _, offset := date.Zone()
475 if date.Year() > 9999 {
476 date = time.Time{}
477 } else if offset <= -24*3600 || offset >= 24*3600 {
478 date = time.Unix(date.Unix(), 0).UTC()
479 }
480
481 subject := h.Get("Subject")
482 if s, err := wordDecoder.DecodeHeader(subject); err == nil {
483 subject = s
484 }
485
486 env := &Envelope{
487 date,
488 subject,
489 parseAddressList(log, h, "from"),
490 parseAddressList(log, h, "sender"),
491 parseAddressList(log, h, "reply-to"),
492 parseAddressList(log, h, "to"),
493 parseAddressList(log, h, "cc"),
494 parseAddressList(log, h, "bcc"),
495 h.Get("In-Reply-To"),
496 h.Get("Message-Id"),
497 }
498 return env, nil
499}
500
501func parseAddressList(log mlog.Log, h mail.Header, k string) []Address {
502 // todo: possibly work around ios mail generating incorrect q-encoded "phrases" with unencoded double quotes? ../rfc/2047:382
503 v := h.Get(k)
504 if v == "" {
505 return nil
506 }
507 parser := mail.AddressParser{WordDecoder: &wordDecoder}
508 l, err := parser.ParseList(v)
509 if err != nil {
510 return nil
511 }
512 var r []Address
513 for _, a := range l {
514 // todo: parse more fully according to ../rfc/5322:959
515 var user, host string
516 addr, err := smtp.ParseNetMailAddress(a.Address)
517 if err != nil {
518 log.Infox("parsing address (continuing)", err, slog.Any("netmailaddress", a.Address))
519 } else {
520 user = addr.Localpart.String()
521 host = addr.Domain.ASCII
522 }
523 r = append(r, Address{a.Name, user, host})
524 }
525 return r
526}
527
// ParseNextPart parses the next (sub)part of this multipart message.
// ParseNextPart returns io.EOF and a nil part when there are no more parts.
// Only used for initial parsing of message. Once parsed, use p.Parts.
//
// If p is not a multipart with a boundary, an error is returned. The new part
// is appended to p.Parts and the returned pointer refers to that element.
func (p *Part) ParseNextPart(elog *slog.Logger) (*Part, error) {
	log := mlog.New("message", elog)

	if len(p.bound) == 0 {
		return nil, errNotMultipart
	}
	if p.nextBoundOffset == -1 {
		// The previous part was not read to its end, so we don't know yet where the
		// next boundary is.
		if enforceSequential {
			panic("access not sequential")
		}
		// Set nextBoundOffset by fully reading the last part.
		last, err := newPart(log, p.strict, p.r, p.lastBoundOffset, p)
		if err != nil {
			return nil, err
		}
		if _, err := io.Copy(io.Discard, last.RawReader()); err != nil {
			return nil, err
		}
		if p.nextBoundOffset == -1 {
			return nil, fmt.Errorf("internal error: reading part did not set nextBoundOffset")
		}
	}
	b := &bufAt{strict: p.strict, r: p.r, offset: p.nextBoundOffset}
	// todo: should we require a crlf on final closing bound? we don't require it because some message/rfc822 don't have a crlf after their closing boundary, so those messages don't end in crlf.
	line, crlf, err := b.ReadLine(false)
	if err != nil {
		return nil, err
	}
	if match, finish := checkBound(line, p.bound); !match {
		return nil, fmt.Errorf("expected bound, got %q", line)
	} else if finish {
		// Read any trailing data.
		if p.parent != nil {
			// Skip lines until the end of input or a boundary line of the parent.
			for {
				line, _, err := b.PeekLine(false)
				if err != nil {
					break
				}
				if match, _ := checkBound(line, p.parent.bound); match {
					break
				}
				b.ReadLine(false)
			}
			// If p is the part the parent parsed most recently, tell the parent where its
			// next boundary starts.
			if p.parent.lastBoundOffset == p.BoundaryOffset {
				p.parent.nextBoundOffset = b.offset
			}
		}
		p.EndOffset = b.offset
		return nil, io.EOF
	} else if !crlf {
		return nil, fmt.Errorf("non-finishing bound without crlf: %w", errUnexpectedEOF)
	}
	// Remember where this part starts, and mark the next boundary as unknown until
	// this new part has been read.
	boundOffset := p.nextBoundOffset
	p.lastBoundOffset = boundOffset
	p.nextBoundOffset = -1
	np, err := newPart(log, p.strict, p.r, boundOffset, p)
	if err != nil {
		return nil, err
	}
	p.Parts = append(p.Parts, np)
	return &p.Parts[len(p.Parts)-1], nil
}
593
594// IsDSN returns whether the MIME structure of the part is a DSN.
595func (p *Part) IsDSN() bool {
596 return p.MediaType == "MULTIPART" &&
597 p.MediaSubType == "REPORT" &&
598 len(p.Parts) >= 2 &&
599 p.Parts[1].MediaType == "MESSAGE" &&
600 (p.Parts[1].MediaSubType == "DELIVERY-STATUS" || p.Parts[1].MediaSubType == "GLOBAL-DELIVERY-STATUS")
601}
602
// hasNonASCII reads from r until it finds a byte outside the ASCII range, in
// which case it returns true. It returns false if the end of r is reached
// without such a byte, and an error if reading fails.
func hasNonASCII(r io.Reader) (bool, error) {
	br := bufio.NewReader(r)
	for {
		switch c, err := br.ReadByte(); {
		case err == io.EOF:
			return false, nil
		case err != nil:
			return false, err
		case c > unicode.MaxASCII:
			return true, nil
		}
	}
}
618
619// NeedsSMTPUTF8 returns whether the part needs the SMTPUTF8 extension to be
620// transported, due to non-ascii in message headers.
621func (p *Part) NeedsSMTPUTF8() (bool, error) {
622 if has, err := hasNonASCII(p.HeaderReader()); err != nil {
623 return false, fmt.Errorf("reading header: %w", err)
624 } else if has {
625 return true, nil
626 }
627 for _, pp := range p.Parts {
628 if has, err := pp.NeedsSMTPUTF8(); err != nil || has {
629 return has, err
630 }
631 }
632 return false, nil
633}
634
// ErrParamEncoding is wrapped in errors about header parameters that cannot be
// parsed or decoded, such as the filename in a content-disposition header.
var ErrParamEncoding = errors.New("bad header parameter encoding")
636
637// DispositionFilename tries to parse the disposition header and the "filename"
638// parameter. If the filename parameter is absent or can't be parsed, the "name"
639// parameter from the Content-Type header is used for the filename. The returned
640// filename is decoded according to RFC 2231 or RFC 2047. This is a best-effort
641// attempt to find a filename for a part. If no Content-Disposition header, or
642// filename was found, empty values without error are returned.
643//
644// If the returned error is an ErrParamEncoding, it can be treated as a diagnostic
645// and a filename may still be returned.
646func (p *Part) DispositionFilename() (disposition string, filename string, err error) {
647 h, err := p.Header()
648 if err != nil {
649 return "", "", fmt.Errorf("parsing header: %w", err)
650 }
651 var disp string
652 var params map[string]string
653 cd := h.Get("Content-Disposition")
654 if cd != "" {
655 disp, params, err = mime.ParseMediaType(cd)
656 }
657 if err != nil {
658 return "", "", fmt.Errorf("%w: parsing disposition header: %v", ErrParamEncoding, err)
659 }
660 filename, err = tryDecodeParam(params["filename"])
661 if filename == "" {
662 s, err2 := tryDecodeParam(p.ContentTypeParams["name"])
663 filename = s
664 if err == nil {
665 err = err2
666 }
667 }
668 return disp, filename, err
669}
670
671// Attempt q/b-word-decode name, coming from Content-Type "name" field or
672// Content-Disposition "filename" field.
673//
674// RFC 2231 specifies an encoding for non-ascii values in mime header parameters. But
675// it appears common practice to instead just q/b-word encode the values.
676// Thunderbird and gmail.com do this for the Content-Type "name" parameter.
677// gmail.com also does that for the Content-Disposition "filename" parameter, where
678// Thunderbird uses the RFC 2231-defined encoding. Go's mime.ParseMediaType parses
679// the mechanism specified in RFC 2231 only. The value for "name" we get here would
680// already be decoded properly for standards-compliant headers, like
681// "filename*0*=UTF-8”%...; filename*1*=%.... We'll look for Q/B-word encoding
682// markers ("=?"-prefix or "?="-suffix) and try to decode if present. This would
683// only cause trouble for filenames having this prefix/suffix.
684func tryDecodeParam(name string) (string, error) {
685 if name == "" || !strings.HasPrefix(name, "=?") && !strings.HasSuffix(name, "?=") {
686 return name, nil
687 }
688 // todo: find where this is allowed. it seems quite common. perhaps we should remove the pedantic check?
689 if Pedantic {
690 return name, fmt.Errorf("%w: attachment contains rfc2047 q/b-word-encoded mime parameter instead of rfc2231-encoded", ErrParamEncoding)
691 }
692 s, err := wordDecoder.DecodeHeader(name)
693 if err != nil {
694 return name, fmt.Errorf("%w: q/b-word decoding mime parameter: %v", ErrParamEncoding, err)
695 }
696 return s, nil
697}
698
699// Reader returns a reader for the decoded body content.
700func (p *Part) Reader() io.Reader {
701 return p.bodyReader(p.RawReader())
702}
703
704// ReaderUTF8OrBinary returns a reader for the decoded body content, transformed to
705// utf-8 for known mime/iana encodings (only if they aren't us-ascii or utf-8
706// already). For unknown or missing character sets/encodings, the original reader
707// is returned.
708func (p *Part) ReaderUTF8OrBinary() io.Reader {
709 return DecodeReader(p.ContentTypeParams["charset"], p.Reader())
710}
711
712func (p *Part) bodyReader(r io.Reader) io.Reader {
713 r = newDecoder(p.ContentTransferEncoding, r)
714 if p.MediaType == "TEXT" {
715 return &textReader{p, bufio.NewReader(r), 0, false}
716 }
717 return &countReader{p, r, 0}
718}
719
720// countReader is an io.Reader that passes Reads to the underlying reader.
721// when eof is read, it sets p.DecodedSize to the number of bytes returned.
722type countReader struct {
723 p *Part
724 r io.Reader
725 count int64
726}
727
728func (cr *countReader) Read(buf []byte) (int, error) {
729 n, err := cr.r.Read(buf)
730 if n >= 0 {
731 cr.count += int64(n)
732 }
733 if err == io.EOF {
734 cr.p.DecodedSize = cr.count
735 }
736 return n, err
737}
738
739// textReader is an io.Reader that ensures all lines return end in CRLF.
740// when eof is read from the underlying reader, it sets p.DecodedSize.
741type textReader struct {
742 p *Part
743 r *bufio.Reader
744 count int64
745 prevcr bool // If previous byte returned was a CR.
746}
747
748func (tr *textReader) Read(buf []byte) (int, error) {
749 o := 0
750 for o < len(buf) {
751 c, err := tr.r.ReadByte()
752 if err != nil {
753 tr.count += int64(o)
754 tr.p.DecodedSize = tr.count
755 return o, err
756 }
757 if c == '\n' && !tr.prevcr {
758 if err := tr.r.UnreadByte(); err != nil {
759 return o, err
760 }
761 buf[o] = '\r'
762 o++
763 tr.prevcr = true
764 continue
765 }
766 buf[o] = c
767 tr.prevcr = c == '\r'
768 o++
769 }
770 tr.count += int64(o)
771 return o, nil
772}
773
// newDecoder returns a reader that decodes r according to content-transfer-encoding
// cte, which must be in upper case. For encodings other than base64 and
// quoted-printable, r is returned unchanged.
func newDecoder(cte string, r io.Reader) io.Reader {
	// ../rfc/2045:775
	if cte == "BASE64" {
		return base64.NewDecoder(base64.StdEncoding, r)
	}
	if cte == "QUOTED-PRINTABLE" {
		return quotedprintable.NewReader(r)
	}
	return r
}
784
785// RawReader returns a reader for the raw, undecoded body content. E.g. with
786// quoted-printable or base64 content intact.
787// Fully reading a part helps its parent part find its next part efficiently.
788func (p *Part) RawReader() io.Reader {
789 if p.r == nil {
790 panic("missing reader")
791 }
792 if p.EndOffset >= 0 {
793 return &crlfReader{strict: p.strict, r: io.NewSectionReader(p.r, p.BodyOffset, p.EndOffset-p.BodyOffset)}
794 }
795 p.RawLineCount = 0
796 if p.parent == nil {
797 return &offsetReader{p, p.BodyOffset, p.strict, true, false, 0}
798 }
799 return &boundReader{p: p, b: &bufAt{strict: p.strict, r: p.r, offset: p.BodyOffset}, prevlf: true}
800}
801
802// crlfReader verifies there are no bare newlines and optionally no bare carriage returns.
803type crlfReader struct {
804 r io.Reader
805 strict bool
806 prevcr bool
807}
808
809func (r *crlfReader) Read(buf []byte) (int, error) {
810 n, err := r.r.Read(buf)
811 if err == nil || err == io.EOF {
812 for _, b := range buf[:n] {
813 if b == '\n' && !r.prevcr {
814 err = errBareLF
815 break
816 } else if b != '\n' && r.prevcr && (r.strict || Pedantic) {
817 err = errBareCR
818 break
819 }
820 r.prevcr = b == '\r'
821 }
822 }
823 return n, err
824}
825
// bufAt is a buffered reader on an underlying ReaderAt.
// bufAt verifies that lines end with crlf.
//
// Lines returned by its methods are backed by the scratch buffer, so they are
// overwritten by the next line that is read or peeked.
type bufAt struct {
	offset int64 // Offset in r currently consumed, i.e. not including any buffered data.

	strict  bool        // If set, bare carriage returns are errors and the lower line length limit applies.
	r       io.ReaderAt // Underlying data.
	buf     []byte      // Buffered data.
	nbuf    int         // Valid bytes in buf.
	scratch []byte      // Holds a copy of the line most recently returned.
}
837
838// Messages should not have lines longer than 78+2 bytes, and must not have
839// lines longer than 998+2 bytes. But in practice they have longer lines. We
840// have a higher limit, but for when parsing with strict we check for the 1000
841// bytes limit.
842// ../rfc/5321:3512
843const maxLineLength = 8 * 1024
844
845func (b *bufAt) maxLineLength() int {
846 if b.strict || Pedantic {
847 return 1000
848 }
849 return maxLineLength
850}
851
852// ensure makes sure b.nbuf is up to maxLineLength, unless eof is encountered.
853func (b *bufAt) ensure() error {
854 if slices.Contains(b.buf[:b.nbuf], '\n') {
855 return nil
856 }
857 if b.scratch == nil {
858 b.scratch = make([]byte, b.maxLineLength())
859 }
860 if b.buf == nil {
861 b.buf = make([]byte, b.maxLineLength())
862 }
863 for b.nbuf < b.maxLineLength() {
864 n, err := b.r.ReadAt(b.buf[b.nbuf:], b.offset+int64(b.nbuf))
865 if n > 0 {
866 b.nbuf += n
867 }
868 if err != nil && err != io.EOF || err == io.EOF && b.nbuf+n == 0 {
869 return err
870 }
871 if n == 0 || err == io.EOF {
872 break
873 }
874 }
875 return nil
876}
877
878// ReadLine reads a line until \r\n is found, returning the line including \r\n.
879// If not found, or a bare \n is encountered, or a bare \r is enountered in pedantic mode, ReadLine returns an error.
880func (b *bufAt) ReadLine(requirecrlf bool) (buf []byte, crlf bool, err error) {
881 return b.line(true, requirecrlf)
882}
883
884func (b *bufAt) PeekLine(requirecrlf bool) (buf []byte, crlf bool, err error) {
885 return b.line(false, requirecrlf)
886}
887
// line returns the next line from the buffer, including its \r\n, with crlf set
// to true. If consume is set, the line is removed from the buffer and the offset
// is advanced past it.
//
// If the buffered data has no \r\n: if the buffer holds a maximum length line,
// errLineTooLong is returned. Otherwise, if requirecrlf is set, errUnexpectedEOF
// is returned. Otherwise, all buffered data is returned with crlf set to false.
//
// A bare \n results in errBareLF. A bare \r results in errBareCR in strict or
// pedantic mode, and is otherwise treated as regular data.
//
// The returned buf is backed by b.scratch, and is only valid until the next call.
func (b *bufAt) line(consume, requirecrlf bool) (buf []byte, crlf bool, err error) {
	if err := b.ensure(); err != nil {
		return nil, false, err
	}
	for i, c := range b.buf[:b.nbuf] {
		if c == '\n' {
			// Should have seen a \r, which should have been handled below.
			return nil, false, errBareLF
		}
		if c != '\r' {
			continue
		}
		// From here on, i is the index of the byte following the \r.
		i++
		if i >= b.nbuf || b.buf[i] != '\n' {
			if b.strict || Pedantic {
				return nil, false, errBareCR
			}
			continue
		}
		// Found \r\n, the line is the first i+1 bytes of the buffer.
		b.scratch = b.scratch[:i+1]
		copy(b.scratch, b.buf[:i+1])
		if consume {
			// Move the remaining data to the front of the buffer.
			copy(b.buf, b.buf[i+1:])
			b.offset += int64(i + 1)
			b.nbuf -= i + 1
		}
		return b.scratch, true, nil
	}
	if b.nbuf >= b.maxLineLength() {
		return nil, false, errLineTooLong
	}
	if requirecrlf {
		return nil, false, errUnexpectedEOF
	}
	// Return the remaining data, that does not end in \r\n.
	b.scratch = b.scratch[:b.nbuf]
	copy(b.scratch, b.buf[:b.nbuf])
	if consume {
		b.offset += int64(b.nbuf)
		b.nbuf = 0
	}
	return b.scratch, false, nil
}
930
931// PeekByte returns the next unread byte, or an error.
932func (b *bufAt) PeekByte() (byte, error) {
933 if err := b.ensure(); err != nil {
934 return 0, err
935 }
936 if b.nbuf == 0 {
937 return 0, io.EOF
938 }
939 return b.buf[0], nil
940}
941
// offsetReader reads from p.r starting from offset, and updates RawLineCount
// on p while reading, and EndOffset on p when EOF is reached.
// offsetReader validates lines end with \r\n: a bare \n is always an error, a
// bare \r is an error in strict or pedantic mode. Lines that are too long
// result in an error too.
type offsetReader struct {
	// Part being read: data comes from p.r, and p.RawLineCount and p.EndOffset are updated.
	p *Part
	// Position in p.r of the next read.
	offset int64
	// If set (or if Pedantic is set), bare \r is rejected and the maximum line
	// length is 1000 instead of maxLineLength.
	strict bool
	// If the previous byte read was \n. The line is counted when the next byte is seen.
	prevlf bool
	// If the previous byte read was \r. For detecting bare \n and bare \r.
	prevcr bool
	// Number of bytes seen on the current line, reset to 0 after \n.
	linelength int
}
952
953func (r *offsetReader) Read(buf []byte) (int, error) {
954 n, err := r.p.r.ReadAt(buf, r.offset)
955 if n > 0 {
956 r.offset += int64(n)
957 max := maxLineLength
958 if r.strict || Pedantic {
959 max = 1000
960 }
961
962 for _, c := range buf[:n] {
963 if r.prevlf {
964 r.p.RawLineCount++
965 }
966 if err == nil || err == io.EOF {
967 if c == '\n' && !r.prevcr {
968 err = errBareLF
969 } else if c != '\n' && r.prevcr && (r.strict || Pedantic) {
970 err = errBareCR
971 }
972 }
973 r.prevlf = c == '\n'
974 r.prevcr = c == '\r'
975 r.linelength++
976 if c == '\n' {
977 r.linelength = 0
978 } else if r.linelength > max && err == nil {
979 err = errLineTooLong
980 }
981 }
982 }
983 if err == io.EOF {
984 r.p.EndOffset = r.offset
985 }
986 return n, err
987}
988
// crlf is the \r\n line ending. boundReader.Read sets it as the pending line
// ending after reading a line, and must not modify its contents.
var crlf = []byte("\r\n")
990
// boundReader is a reader that stops at a closing multipart boundary.
// boundReader ensures lines end with crlf through its use of bufAt.
// The line ending directly before a boundary is not returned as data.
type boundReader struct {
	p      *Part  // Part being read. Lines are matched against p.parent.bound; p.EndOffset and p.RawLineCount are updated.
	b      *bufAt // Source of the lines.
	buf    []byte // Data from previous line, to be served first.
	nbuf   int    // Number of valid bytes in buf.
	crlf   []byte // Possible crlf, to be returned if we do not yet encounter a boundary.
	prevlf bool   // If last char returned was a newline. For counting lines.
}
1001
1002func (b *boundReader) Read(buf []byte) (count int, rerr error) {
1003 origBuf := buf
1004 defer func() {
1005 if count > 0 {
1006 for _, c := range origBuf[:count] {
1007 if b.prevlf {
1008 b.p.RawLineCount++
1009 }
1010 b.prevlf = c == '\n'
1011 }
1012 }
1013 }()
1014
1015 for {
1016 // Read data from earlier line.
1017 if b.nbuf > 0 {
1018 n := min(b.nbuf, len(buf))
1019 copy(buf, b.buf[:n])
1020 copy(b.buf, b.buf[n:])
1021 buf = buf[n:]
1022 b.nbuf -= n
1023 count += n
1024 if b.nbuf > 0 {
1025 break
1026 }
1027 }
1028
1029 // Look at next line. If it is a boundary, we are done and won't serve the crlf from the last line.
1030 line, _, err := b.b.PeekLine(false)
1031 if match, _ := checkBound(line, b.p.parent.bound); match {
1032 b.p.EndOffset = b.b.offset - int64(len(b.crlf))
1033 if b.p.parent.lastBoundOffset == b.p.BoundaryOffset {
1034 b.p.parent.nextBoundOffset = b.b.offset
1035 } else if enforceSequential {
1036 panic("access not sequential")
1037 }
1038 return count, io.EOF
1039 }
1040 if err == io.EOF {
1041 err = errMissingClosingBoundary
1042 }
1043 if err != nil && err != io.EOF {
1044 return count, err
1045 }
1046 if len(b.crlf) > 0 {
1047 n := min(len(b.crlf), len(buf))
1048 copy(buf, b.crlf[:n])
1049 count += n
1050 buf = buf[n:]
1051 b.crlf = b.crlf[n:]
1052 }
1053 if len(buf) == 0 {
1054 break
1055 }
1056 line, _, err = b.b.ReadLine(true)
1057 if err != nil {
1058 // Could be an unexpected end of the part.
1059 return 0, err
1060 }
1061 b.crlf = crlf // crlf will be read next time, but not if a boundary follows.
1062 n := len(line) - 2
1063 line = line[:n]
1064 if n > len(buf) {
1065 n = len(buf)
1066 }
1067 copy(buf, line[:n])
1068 count += n
1069 buf = buf[n:]
1070 line = line[n:]
1071 if len(line) > 0 {
1072 if b.buf == nil {
1073 b.buf = make([]byte, b.b.maxLineLength())
1074 }
1075 copy(b.buf, line)
1076 b.nbuf = len(line)
1077 }
1078 }
1079 return count, nil
1080}
1081
// checkBound checks whether line is a boundary line for bound. The first
// result is whether line starts with a matching boundary, the second whether
// it is a closing boundary, i.e. bound directly followed by "--".
//
// For a non-closing boundary, bound must be followed by the end of line, or by
// a space, tab, \r or \n, so a longer boundary that merely has bound as prefix
// does not match.
func checkBound(line, bound []byte) (bool, bool) {
	rest, ok := bytes.CutPrefix(line, bound)
	if !ok {
		return false, false
	}

	switch {
	case len(rest) >= 2 && rest[0] == '-' && rest[1] == '-':
		return true, true
	case len(rest) == 0:
		return true, false
	}

	next := rest[0]
	match := next == ' ' || next == '\t' || next == '\r' || next == '\n'
	return match, false
}
1100