1package message
2
// todo: allow more invalid content-type values, we now stop parsing on: empty media type (eg "content-type: ; name=..."), empty value for property (eg "charset="), missing quotes for characters that should be quoted (eg boundary containing "=" but without quotes), duplicate properties (two charsets), empty pairs (eg "text/html;;").
4// todo: should we be forgiving when closing boundary in multipart message is missing? seems like spam messages do this...
5// todo: should we allow base64 messages where a line starts with a space? and possibly more whitespace. is happening in messages. coreutils base64 accepts it, encoding/base64 does not.
6// todo: handle comments in headers?
7// todo: should we just always store messages with \n instead of \r\n? \r\n seems easier for use with imap.
8// todo: can use a cleanup
9
10import (
11 "bufio"
12 "bytes"
13 "encoding/base64"
14 "errors"
15 "fmt"
16 "io"
17 "log/slog"
18 "mime"
19 "mime/quotedprintable"
20 "net/mail"
21 "net/textproto"
22 "strings"
23 "time"
24 "unicode"
25
26 "golang.org/x/text/encoding/ianaindex"
27
28 "github.com/mjl-/mox/mlog"
29 "github.com/mjl-/mox/smtp"
30)
31
// Pedantic enables stricter parsing for the whole package. Where the code in
// this file checks a part's strict flag (malformed content-type, bare carriage
// returns, the 1000-byte line limit), Pedantic has the same effect. It also
// turns off some leniencies, such as accepting truncated embedded messages in
// DSNs and q/b-word-encoded mime parameters.
var Pedantic bool
34
// Exported sentinel errors. They are wrapped with additional context before
// being returned, so compare using errors.Is.
var (
	ErrBadContentType = errors.New("bad content-type")   // Content-Type header could not be parsed while parsing strictly.
	ErrHeader         = errors.New("bad message header") // Header section has invalid syntax.
)
39
// Internal sentinel errors for problems encountered while reading lines, parts
// and multipart boundaries.
var (
	errNotMultipart           = errors.New("not a multipart message")                       // ParseNextPart called on a part without boundary.
	errFirstBoundCloses       = errors.New("first boundary cannot be finishing boundary")   // First boundary line found is a closing boundary.
	errLineTooLong            = errors.New("line too long")                                 // Line exceeds the applicable maximum line length.
	errMissingBoundaryParam   = errors.New("missing/empty boundary content-type parameter") // Multipart content-type without usable boundary.
	errMissingClosingBoundary = errors.New("eof without closing boundary")
	errBareLF                 = errors.New("invalid bare line feed")       // LF not preceded by CR.
	errBareCR                 = errors.New("invalid bare carriage return") // CR not followed by LF, only reported when parsing strictly.
	errUnexpectedEOF          = errors.New("unexpected eof")               // Data ended where a CRLF-terminated line was required.
)
50
// enforceSequential is meant to be set during tests. If set, an attempt to
// reparse a part causes a panic in ParseNextPart, because sequentially reading
// parts should not lead to reparsing.
var enforceSequential bool
53
// Part represents a whole mail message, or a part of a multipart message. It
// is designed to handle IMAP requirements efficiently.
//
// The exported fields describe the structure of the part (offsets, sizes,
// content headers, sub parts). The unexported fields hold the reader and the
// state needed while parsing.
type Part struct {
	BoundaryOffset int64 // Offset in message where bound starts. -1 for top-level message.
	HeaderOffset   int64 // Offset in message file where header starts.
	BodyOffset     int64 // Offset in message file where body starts.
	EndOffset      int64 // Where body of part ends. Set when part is fully read.
	RawLineCount   int64 // Number of lines in raw, undecoded, body of part. Set when part is fully read.
	DecodedSize    int64 // Number of octets when decoded. If this is a text mediatype, lines ending only in LF are changed to end in CRLF and DecodedSize reflects that.

	MediaType               string            // From Content-Type, upper case. E.g. "TEXT". Can be empty because content-type may be absent. In this case, the part may be treated as TEXT/PLAIN.
	MediaSubType            string            // From Content-Type, upper case. E.g. "PLAIN".
	ContentTypeParams       map[string]string // E.g. holds "boundary" for multipart messages. Has lower-case keys, and original case values.
	ContentID               string
	ContentDescription      string
	ContentTransferEncoding string    // In upper case.
	Envelope                *Envelope // Email message headers. Not for non-message parts.

	Parts []Part // Parts if this is a multipart.

	// Only for message/rfc822 and message/global. This part may have a buffer as
	// backing io.ReaderAt, because a message/global can have a non-identity
	// content-transfer-encoding. This part has a nil parent.
	Message *Part

	r               io.ReaderAt
	header          textproto.MIMEHeader // Parsed header.
	nextBoundOffset int64                // If >= 0, the offset where the next part header starts. We can set this when a user fully reads each part.
	lastBoundOffset int64                // Start of header of last/previous part. Used to skip a part if ParseNextPart is called and nextBoundOffset is -1.
	parent          *Part                // Parent part, for getting bound from, and setting nextBoundOffset when a part has finished reading. Only for subparts, not top-level parts.
	bound           []byte               // Only set if valid multipart with boundary, includes leading --, excludes \r\n.
	strict          bool                 // If set, valid crlf line endings are verified when reading body.
}
87
88// todo: have all Content* fields in Part?
89// todo: make Address contain a type Localpart and dns.Domain?
90// todo: if we ever make a major change and reparse all parts, switch to lower-case values if not too troublesome.
91
// Envelope holds the basic/common message headers as used in IMAP4.
//
// The order of the fields must not be changed without also updating
// parseEnvelope, which fills the struct with an unkeyed composite literal.
type Envelope struct {
	Date      time.Time
	Subject   string // Q/B-word-decoded.
	From      []Address
	Sender    []Address
	ReplyTo   []Address
	To        []Address
	CC        []Address
	BCC       []Address
	InReplyTo string // From In-Reply-To header, includes <>.
	MessageID string // From Message-Id header, includes <>.
}
105
// Address as used in From and To headers.
//
// User and Host are left empty when the address could not be parsed, see
// parseAddressList.
type Address struct {
	Name string // Free-form name for display in mail applications.
	User string // Localpart, encoded as string. Must be parsed before using as Localpart.
	Host string // Domain in ASCII.
}
112
113// Parse reads the headers of the mail message and returns a part.
114// A part provides access to decoded and raw contents of a message and its multiple parts.
115//
116// If strict is set, fewer attempts are made to continue parsing when errors are
117// encountered, such as with invalid content-type headers or bare carriage returns.
118func Parse(elog *slog.Logger, strict bool, r io.ReaderAt) (Part, error) {
119 log := mlog.New("message", elog)
120 return newPart(log, strict, r, 0, nil)
121}
122
123// EnsurePart parses a part as with Parse, but ensures a usable part is always
124// returned, even if error is non-nil. If a parse error occurs, the message is
125// returned as application/octet-stream, and headers can still be read if they
126// were valid.
127//
128// If strict is set, fewer attempts are made to continue parsing when errors are
129// encountered, such as with invalid content-type headers or bare carriage returns.
130func EnsurePart(elog *slog.Logger, strict bool, r io.ReaderAt, size int64) (Part, error) {
131 log := mlog.New("message", elog)
132 p, err := Parse(log.Logger, strict, r)
133 if err == nil {
134 err = p.Walk(log.Logger, nil)
135 }
136 if err != nil {
137 np, err2 := fallbackPart(p, r, size)
138 if err2 != nil {
139 err = err2
140 }
141 p = np
142 }
143 return p, err
144}
145
146func fallbackPart(p Part, r io.ReaderAt, size int64) (Part, error) {
147 np := Part{
148 HeaderOffset: p.HeaderOffset,
149 BodyOffset: p.BodyOffset,
150 EndOffset: size,
151 MediaType: "APPLICATION",
152 MediaSubType: "OCTET-STREAM",
153 ContentTypeParams: p.ContentTypeParams,
154 ContentID: p.ContentID,
155 ContentDescription: p.ContentDescription,
156 ContentTransferEncoding: p.ContentTransferEncoding,
157 Envelope: p.Envelope,
158 // We don't keep:
159 // - BoundaryOffset: irrelevant for top-level message.
160 // - RawLineCount and DecodedSize: set below.
161 // - Parts: we are not treating this as a multipart message.
162 }
163 np.SetReaderAt(r)
164 // By reading body, the number of lines and decoded size will be set.
165 _, err := io.Copy(io.Discard, np.Reader())
166 return np, err
167}
168
169// SetReaderAt sets r as reader for this part and all its sub parts, recursively.
170// No reader is set for any Message subpart, see SetMessageReaderAt.
171func (p *Part) SetReaderAt(r io.ReaderAt) {
172 if r == nil {
173 panic("nil reader")
174 }
175 p.r = r
176 for i := range p.Parts {
177 pp := &p.Parts[i]
178 pp.SetReaderAt(r)
179 }
180}
181
182// SetMessageReaderAt sets a reader on p.Message, which must be non-nil.
183func (p *Part) SetMessageReaderAt() error {
184 // todo: if p.Message does not contain any non-identity content-transfer-encoding, we should set an offsetReader of p.Message, recursively.
185 buf, err := io.ReadAll(p.Reader())
186 if err != nil {
187 return err
188 }
189 p.Message.SetReaderAt(bytes.NewReader(buf))
190 return nil
191}
192
193// Walk through message, decoding along the way, and collecting mime part offsets and sizes, and line counts.
194func (p *Part) Walk(elog *slog.Logger, parent *Part) error {
195 log := mlog.New("message", elog)
196
197 if len(p.bound) == 0 {
198 if p.MediaType == "MESSAGE" && (p.MediaSubType == "RFC822" || p.MediaSubType == "GLOBAL") {
199 // todo: don't read whole submessage in memory...
200 buf, err := io.ReadAll(p.Reader())
201 if err != nil {
202 return err
203 }
204 br := bytes.NewReader(buf)
205 mp, err := Parse(log.Logger, p.strict, br)
206 if err != nil {
207 return fmt.Errorf("parsing embedded message: %w", err)
208 }
209 if err := mp.Walk(log.Logger, nil); err != nil {
210 // If this is a DSN and we are not in pedantic mode, accept unexpected end of
211 // message. This is quite common because MTA's sometimes just truncate the original
212 // message in a place that makes the message invalid.
213 if errors.Is(err, errUnexpectedEOF) && !Pedantic && parent != nil && len(parent.Parts) >= 3 && p == &parent.Parts[2] && parent.MediaType == "MULTIPART" && parent.MediaSubType == "REPORT" {
214 mp, err = fallbackPart(mp, br, int64(len(buf)))
215 if err != nil {
216 return fmt.Errorf("parsing invalid embedded message: %w", err)
217 }
218 } else {
219 return fmt.Errorf("parsing parts of embedded message: %w", err)
220 }
221 }
222 // todo: if mp does not contain any non-identity content-transfer-encoding, we should set an offsetReader of p.r on mp, recursively.
223 p.Message = &mp
224 return nil
225 }
226 _, err := io.Copy(io.Discard, p.Reader())
227 return err
228 }
229
230 for {
231 pp, err := p.ParseNextPart(log.Logger)
232 if err == io.EOF {
233 return nil
234 }
235 if err != nil {
236 return err
237 }
238 if err := pp.Walk(log.Logger, p); err != nil {
239 return err
240 }
241 }
242}
243
244// String returns a debugging representation of the part.
245func (p *Part) String() string {
246 return fmt.Sprintf("&Part{%s/%s offsets %d/%d/%d/%d lines %d decodedsize %d next %d last %d bound %q parts %v}", p.MediaType, p.MediaSubType, p.BoundaryOffset, p.HeaderOffset, p.BodyOffset, p.EndOffset, p.RawLineCount, p.DecodedSize, p.nextBoundOffset, p.lastBoundOffset, p.bound, p.Parts)
247}
248
// newPart parses a new part, which can be the top-level message.
//
// offset is the bound offset for parts, and the start of message for top-level
// messages. parent indicates if this is a top-level message (nil) or a sub-part
// (non-nil, the boundary line at offset is verified against parent.bound).
//
// newPart reads the header section, derives the media type and other content
// header fields from it, parses the envelope for top-level messages, and for
// multiparts skips the preamble up to the first boundary.
//
// If an error occurs, p's exported values can still be relevant. EnsurePart uses these values.
func newPart(log mlog.Log, strict bool, r io.ReaderAt, offset int64, parent *Part) (p Part, rerr error) {
	if r == nil {
		panic("nil reader")
	}
	p = Part{
		BoundaryOffset: -1,
		EndOffset:      -1,
		r:              r,
		parent:         parent,
		strict:         strict,
	}

	b := &bufAt{strict: strict, r: r, offset: offset}

	// For sub-parts, the line at offset must be a non-closing boundary of the parent.
	if parent != nil {
		p.BoundaryOffset = offset
		if line, _, err := b.ReadLine(true); err != nil {
			return p, err
		} else if match, finish := checkBound(line, parent.bound); !match {
			return p, fmt.Errorf("missing bound")
		} else if finish {
			return p, fmt.Errorf("new part for closing boundary")
		}
	}

	// Collect header.
	p.HeaderOffset = b.offset
	p.BodyOffset = b.offset
	hb := &bytes.Buffer{}
	for {
		line, _, err := b.ReadLine(true)
		if err == io.EOF {
			// No body is valid.
			break
		}
		if err != nil {
			return p, fmt.Errorf("reading header line: %w", err)
		}
		hb.Write(line)
		if len(line) == 2 {
			break // crlf
		}
	}
	p.BodyOffset = b.offset

	// Don't attempt to parse empty header, mail.ReadMessage doesn't like it.
	if p.HeaderOffset == p.BodyOffset {
		p.header = textproto.MIMEHeader{}
	} else {
		h, err := parseHeader(hb)
		if err != nil {
			return p, fmt.Errorf("parsing header: %w", err)
		}
		p.header = h
	}

	ct := p.header.Get("Content-Type")
	mt, params, err := mime.ParseMediaType(ct)
	if err != nil && ct != "" {
		if Pedantic || strict {
			return p, fmt.Errorf("%w: %s: %q", ErrBadContentType, err, ct)
		}

		// Try parsing just a content-type, ignoring parameters.
		// ../rfc/2045:628
		ct = strings.TrimSpace(strings.SplitN(ct, ";", 2)[0])
		t := strings.SplitN(ct, "/", 2)
		// isToken reports whether s is non-empty and consists only of printable ASCII
		// characters that are not separators.
		isToken := func(s string) bool {
			const separators = `()<>@,;:\\"/[]?= ` // ../rfc/2045:663
			for _, c := range s {
				if c < 0x20 || c >= 0x80 || strings.ContainsRune(separators, c) {
					return false
				}
			}
			return len(s) > 0
		}
		// We cannot recover content-type of multipart, we won't have a boundary.
		if len(t) == 2 && isToken(t[0]) && !strings.EqualFold(t[0], "multipart") && isToken(t[1]) {
			p.MediaType = strings.ToUpper(t[0])
			p.MediaSubType = strings.ToUpper(t[1])
		} else {
			p.MediaType = "APPLICATION"
			p.MediaSubType = "OCTET-STREAM"
		}
		log.Debugx("malformed content-type, attempting to recover and continuing", err,
			slog.String("contenttype", p.header.Get("Content-Type")),
			slog.String("mediatype", p.MediaType),
			slog.String("mediasubtype", p.MediaSubType))
	} else if mt != "" {
		t := strings.SplitN(strings.ToUpper(mt), "/", 2)
		if len(t) != 2 {
			if Pedantic || strict {
				return p, fmt.Errorf("bad content-type: %q (content-type %q)", mt, ct)
			}
			log.Debug("malformed media-type, ignoring and continuing", slog.String("type", mt))
			p.MediaType = "APPLICATION"
			p.MediaSubType = "OCTET-STREAM"
		} else {
			p.MediaType = t[0]
			p.MediaSubType = t[1]
			p.ContentTypeParams = params
		}
	}

	p.ContentID = p.header.Get("Content-Id")
	p.ContentDescription = p.header.Get("Content-Description")
	p.ContentTransferEncoding = strings.ToUpper(p.header.Get("Content-Transfer-Encoding"))

	// Only top-level messages get an envelope.
	if parent == nil {
		p.Envelope, err = parseEnvelope(log, mail.Header(p.header))
		if err != nil {
			return p, err
		}
	}

	if p.MediaType == "MULTIPART" {
		s := params["boundary"]
		if s == "" {
			return p, errMissingBoundaryParam
		}
		p.bound = append([]byte("--"), s...)

		// Discard preamble, before first boundary.
		for {
			line, _, err := b.PeekLine(true)
			if err != nil {
				return p, fmt.Errorf("parsing line for part preamble: %w", err)
			}
			// Line only needs boundary prefix, not exact match. ../rfc/2046:1103
			// Well, for compatibility, we require whitespace after the boundary. Because some
			// software use the same boundary but with text appended for sub parts.
			if match, finish := checkBound(line, p.bound); match {
				if finish {
					return p, errFirstBoundCloses
				}
				break
			}
			// Consume the preamble line that was just peeked.
			b.ReadLine(true)
		}
		// The boundary line itself was only peeked, so b.offset is where the first
		// boundary starts.
		p.nextBoundOffset = b.offset
		p.lastBoundOffset = b.offset
	}

	return p, nil
}
397
398// Header returns the parsed header of this part.
399//
400// Returns a ErrHeader for messages with invalid header syntax.
401func (p *Part) Header() (textproto.MIMEHeader, error) {
402 if p.header != nil {
403 return p.header, nil
404 }
405 if p.HeaderOffset == p.BodyOffset {
406 p.header = textproto.MIMEHeader{}
407 return p.header, nil
408 }
409 h, err := parseHeader(p.HeaderReader())
410 p.header = h
411 return h, err
412}
413
414// HeaderReader returns a reader for the header section of this part, including ending bare CRLF.
415func (p *Part) HeaderReader() io.Reader {
416 return io.NewSectionReader(p.r, p.HeaderOffset, p.BodyOffset-p.HeaderOffset)
417}
418
419// parse a header, only call this on non-empty input (even though that is a valid header).
420func parseHeader(r io.Reader) (textproto.MIMEHeader, error) {
421 // We read using mail.ReadMessage instead of textproto.ReadMIMEHeaders because the
422 // first handles email messages properly, while the second only works for HTTP
423 // headers.
424 var zero textproto.MIMEHeader
425
426 // We read the header and add the optional \r\n header/body separator. If the \r\n
427 // is missing, parsing with Go <1.21 results in an EOF error.
428 // todo: directly parse from reader r when Go 1.20 is no longer supported.
429 buf, err := io.ReadAll(r)
430 if err != nil {
431 return zero, err
432 }
433 if bytes.HasSuffix(buf, []byte("\r\n")) && !bytes.HasSuffix(buf, []byte("\r\n\r\n")) {
434 buf = append(buf, "\r\n"...)
435 }
436 msg, err := mail.ReadMessage(bytes.NewReader(buf))
437 if err != nil {
438 // Recognize parsing errors from net/mail.ReadMessage.
439 // todo: replace with own message parsing code that returns proper error types.
440 errstr := err.Error()
441 if strings.HasPrefix(errstr, "malformed initial line:") || strings.HasPrefix(errstr, "malformed header line:") {
442 err = fmt.Errorf("%w: %v", ErrHeader, err)
443 }
444 return zero, err
445 }
446 return textproto.MIMEHeader(msg.Header), nil
447}
448
449var wordDecoder = mime.WordDecoder{
450 CharsetReader: func(charset string, r io.Reader) (io.Reader, error) {
451 switch strings.ToLower(charset) {
452 case "", "us-ascii", "utf-8":
453 return r, nil
454 }
455 enc, _ := ianaindex.MIME.Encoding(charset)
456 if enc == nil {
457 enc, _ = ianaindex.IANA.Encoding(charset)
458 }
459 if enc == nil {
460 return r, fmt.Errorf("unknown charset %q", charset)
461 }
462 return enc.NewDecoder().Reader(r), nil
463 },
464}
465
466func parseEnvelope(log mlog.Log, h mail.Header) (*Envelope, error) {
467 date, _ := h.Date()
468
469 // We currently marshal this field to JSON. But JSON cannot represent all
470 // time.Time. Time zone of 24:00 was seen in the wild. We won't try for extreme
471 // years, but we can readjust timezones.
472 // todo: remove this once we no longer store using json.
473 _, offset := date.Zone()
474 if date.Year() > 9999 {
475 date = time.Time{}
476 } else if offset <= -24*3600 || offset >= 24*3600 {
477 date = time.Unix(date.Unix(), 0).UTC()
478 }
479
480 subject := h.Get("Subject")
481 if s, err := wordDecoder.DecodeHeader(subject); err == nil {
482 subject = s
483 }
484
485 env := &Envelope{
486 date,
487 subject,
488 parseAddressList(log, h, "from"),
489 parseAddressList(log, h, "sender"),
490 parseAddressList(log, h, "reply-to"),
491 parseAddressList(log, h, "to"),
492 parseAddressList(log, h, "cc"),
493 parseAddressList(log, h, "bcc"),
494 h.Get("In-Reply-To"),
495 h.Get("Message-Id"),
496 }
497 return env, nil
498}
499
500func parseAddressList(log mlog.Log, h mail.Header, k string) []Address {
501 // todo: possibly work around ios mail generating incorrect q-encoded "phrases" with unencoded double quotes? ../rfc/2047:382
502 v := h.Get(k)
503 if v == "" {
504 return nil
505 }
506 parser := mail.AddressParser{WordDecoder: &wordDecoder}
507 l, err := parser.ParseList(v)
508 if err != nil {
509 return nil
510 }
511 var r []Address
512 for _, a := range l {
513 // todo: parse more fully according to ../rfc/5322:959
514 var user, host string
515 addr, err := smtp.ParseNetMailAddress(a.Address)
516 if err != nil {
517 log.Infox("parsing address (continuing)", err, slog.Any("netmailaddress", a.Address))
518 } else {
519 user = addr.Localpart.String()
520 host = addr.Domain.ASCII
521 }
522 r = append(r, Address{a.Name, user, host})
523 }
524 return r
525}
526
// ParseNextPart parses the next (sub)part of this multipart message.
// ParseNextPart returns io.EOF and a nil part when there are no more parts.
// Only used for initial parsing of message. Once parsed, use p.Parts.
//
// The returned part points into p.Parts. If the previously returned part was
// not fully read, it is parsed and read again here to find where the next
// boundary starts.
func (p *Part) ParseNextPart(elog *slog.Logger) (*Part, error) {
	log := mlog.New("message", elog)

	if len(p.bound) == 0 {
		return nil, errNotMultipart
	}
	if p.nextBoundOffset == -1 {
		if enforceSequential {
			panic("access not sequential")
		}
		// Set nextBoundOffset by fully reading the last part.
		last, err := newPart(log, p.strict, p.r, p.lastBoundOffset, p)
		if err != nil {
			return nil, err
		}
		if _, err := io.Copy(io.Discard, last.RawReader()); err != nil {
			return nil, err
		}
		if p.nextBoundOffset == -1 {
			return nil, fmt.Errorf("internal error: reading part did not set nextBoundOffset")
		}
	}
	b := &bufAt{strict: p.strict, r: p.r, offset: p.nextBoundOffset}
	// todo: should we require a crlf on final closing bound? we don't require it because some message/rfc822 don't have a crlf after their closing boundary, so those messages don't end in crlf.
	line, crlf, err := b.ReadLine(false)
	if err != nil {
		return nil, err
	}
	if match, finish := checkBound(line, p.bound); !match {
		return nil, fmt.Errorf("expected bound, got %q", line)
	} else if finish {
		// Read any trailing data.
		if p.parent != nil {
			// Skip lines after the closing boundary until a boundary of the parent, or
			// until no more lines can be read.
			for {
				line, _, err := b.PeekLine(false)
				if err != nil {
					break
				}
				if match, _ := checkBound(line, p.parent.bound); match {
					break
				}
				b.ReadLine(false)
			}
			// Only tell the parent where its next boundary is if this part is the part the
			// parent parsed last.
			if p.parent.lastBoundOffset == p.BoundaryOffset {
				p.parent.nextBoundOffset = b.offset
			}
		}
		p.EndOffset = b.offset
		return nil, io.EOF
	} else if !crlf {
		return nil, fmt.Errorf("non-finishing bound without crlf: %w", errUnexpectedEOF)
	}
	// Remember where this part starts, and mark the offset of the boundary after it
	// as unknown.
	boundOffset := p.nextBoundOffset
	p.lastBoundOffset = boundOffset
	p.nextBoundOffset = -1
	np, err := newPart(log, p.strict, p.r, boundOffset, p)
	if err != nil {
		return nil, err
	}
	p.Parts = append(p.Parts, np)
	return &p.Parts[len(p.Parts)-1], nil
}
592
593// IsDSN returns whether the MIME structure of the part is a DSN.
594func (p *Part) IsDSN() bool {
595 return p.MediaType == "MULTIPART" &&
596 p.MediaSubType == "REPORT" &&
597 len(p.Parts) >= 2 &&
598 p.Parts[1].MediaType == "MESSAGE" &&
599 (p.Parts[1].MediaSubType == "DELIVERY-STATUS" || p.Parts[1].MediaSubType == "GLOBAL-DELIVERY-STATUS")
600}
601
// hasNonASCII reads r until a byte outside the ASCII range is found, and
// reports whether such a byte was seen. Reading stops at the first non-ASCII
// byte. A read error other than io.EOF is returned.
func hasNonASCII(r io.Reader) (bool, error) {
	br := bufio.NewReader(r)
	for {
		c, err := br.ReadByte()
		switch {
		case err == io.EOF:
			return false, nil
		case err != nil:
			return false, err
		case c > unicode.MaxASCII:
			return true, nil
		}
	}
}
617
618// NeedsSMTPUTF8 returns whether the part needs the SMTPUTF8 extension to be
619// transported, due to non-ascii in message headers.
620func (p *Part) NeedsSMTPUTF8() (bool, error) {
621 if has, err := hasNonASCII(p.HeaderReader()); err != nil {
622 return false, fmt.Errorf("reading header: %w", err)
623 } else if has {
624 return true, nil
625 }
626 for _, pp := range p.Parts {
627 if has, err := pp.NeedsSMTPUTF8(); err != nil || has {
628 return has, err
629 }
630 }
631 return false, nil
632}
633
// ErrParamEncoding is returned (wrapped) for problems with the encoding of a
// mime header parameter, such as a filename. See DispositionFilename.
var ErrParamEncoding = errors.New("bad header parameter encoding")
635
636// DispositionFilename tries to parse the disposition header and the "filename"
637// parameter. If the filename parameter is absent or can't be parsed, the "name"
638// parameter from the Content-Type header is used for the filename. The returned
639// filename is decoded according to RFC 2231 or RFC 2047. This is a best-effort
640// attempt to find a filename for a part. If no Content-Disposition header, or
641// filename was found, empty values without error are returned.
642//
643// If the returned error is an ErrParamEncoding, it can be treated as a diagnostic
644// and a filename may still be returned.
645func (p *Part) DispositionFilename() (disposition string, filename string, err error) {
646 h, err := p.Header()
647 if err != nil {
648 return "", "", fmt.Errorf("parsing header: %v", err)
649 }
650 var disp string
651 var params map[string]string
652 cd := h.Get("Content-Disposition")
653 if cd != "" {
654 disp, params, err = mime.ParseMediaType(cd)
655 }
656 if err != nil {
657 return "", "", fmt.Errorf("%w: parsing disposition header: %v", ErrParamEncoding, err)
658 }
659 filename, err = tryDecodeParam(params["filename"])
660 if filename == "" {
661 s, err2 := tryDecodeParam(p.ContentTypeParams["name"])
662 filename = s
663 if err == nil {
664 err = err2
665 }
666 }
667 return disp, filename, err
668}
669
670// Attempt q/b-word-decode name, coming from Content-Type "name" field or
671// Content-Disposition "filename" field.
672//
673// RFC 2231 specifies an encoding for non-ascii values in mime header parameters. But
674// it appears common practice to instead just q/b-word encode the values.
675// Thunderbird and gmail.com do this for the Content-Type "name" parameter.
676// gmail.com also does that for the Content-Disposition "filename" parameter, where
677// Thunderbird uses the RFC 2231-defined encoding. Go's mime.ParseMediaType parses
678// the mechanism specified in RFC 2231 only. The value for "name" we get here would
679// already be decoded properly for standards-compliant headers, like
680// "filename*0*=UTF-8”%...; filename*1*=%.... We'll look for Q/B-word encoding
681// markers ("=?"-prefix or "?="-suffix) and try to decode if present. This would
682// only cause trouble for filenames having this prefix/suffix.
683func tryDecodeParam(name string) (string, error) {
684 if name == "" || !strings.HasPrefix(name, "=?") && !strings.HasSuffix(name, "?=") {
685 return name, nil
686 }
687 // todo: find where this is allowed. it seems quite common. perhaps we should remove the pedantic check?
688 if Pedantic {
689 return name, fmt.Errorf("%w: attachment contains rfc2047 q/b-word-encoded mime parameter instead of rfc2231-encoded", ErrParamEncoding)
690 }
691 s, err := wordDecoder.DecodeHeader(name)
692 if err != nil {
693 return name, fmt.Errorf("%w: q/b-word decoding mime parameter: %v", ErrParamEncoding, err)
694 }
695 return s, nil
696}
697
698// Reader returns a reader for the decoded body content.
699func (p *Part) Reader() io.Reader {
700 return p.bodyReader(p.RawReader())
701}
702
703// ReaderUTF8OrBinary returns a reader for the decoded body content, transformed to
704// utf-8 for known mime/iana encodings (only if they aren't us-ascii or utf-8
705// already). For unknown or missing character sets/encodings, the original reader
706// is returned.
707func (p *Part) ReaderUTF8OrBinary() io.Reader {
708 return DecodeReader(p.ContentTypeParams["charset"], p.Reader())
709}
710
711func (p *Part) bodyReader(r io.Reader) io.Reader {
712 r = newDecoder(p.ContentTransferEncoding, r)
713 if p.MediaType == "TEXT" {
714 return &textReader{p, bufio.NewReader(r), 0, false}
715 }
716 return &countReader{p, r, 0}
717}
718
719// countReader is an io.Reader that passes Reads to the underlying reader.
720// when eof is read, it sets p.DecodedSize to the number of bytes returned.
721type countReader struct {
722 p *Part
723 r io.Reader
724 count int64
725}
726
727func (cr *countReader) Read(buf []byte) (int, error) {
728 n, err := cr.r.Read(buf)
729 if n >= 0 {
730 cr.count += int64(n)
731 }
732 if err == io.EOF {
733 cr.p.DecodedSize = cr.count
734 }
735 return n, err
736}
737
738// textReader is an io.Reader that ensures all lines return end in CRLF.
739// when eof is read from the underlying reader, it sets p.DecodedSize.
740type textReader struct {
741 p *Part
742 r *bufio.Reader
743 count int64
744 prevcr bool // If previous byte returned was a CR.
745}
746
747func (tr *textReader) Read(buf []byte) (int, error) {
748 o := 0
749 for o < len(buf) {
750 c, err := tr.r.ReadByte()
751 if err != nil {
752 tr.count += int64(o)
753 tr.p.DecodedSize = tr.count
754 return o, err
755 }
756 if c == '\n' && !tr.prevcr {
757 buf[o] = '\r'
758 o++
759 tr.prevcr = true
760 tr.r.UnreadByte()
761 continue
762 }
763 buf[o] = c
764 tr.prevcr = c == '\r'
765 o++
766 }
767 tr.count += int64(o)
768 return o, nil
769}
770
// newDecoder returns a reader that decodes r according to the
// content-transfer-encoding cte, which must be in upper case. For encodings
// other than BASE64 and QUOTED-PRINTABLE, r itself is returned.
func newDecoder(cte string, r io.Reader) io.Reader {
	// ../rfc/2045:775
	if cte == "BASE64" {
		return base64.NewDecoder(base64.StdEncoding, r)
	}
	if cte == "QUOTED-PRINTABLE" {
		return quotedprintable.NewReader(r)
	}
	return r
}
781
782// RawReader returns a reader for the raw, undecoded body content. E.g. with
783// quoted-printable or base64 content intact.
784// Fully reading a part helps its parent part find its next part efficiently.
785func (p *Part) RawReader() io.Reader {
786 if p.r == nil {
787 panic("missing reader")
788 }
789 if p.EndOffset >= 0 {
790 return &crlfReader{strict: p.strict, r: io.NewSectionReader(p.r, p.BodyOffset, p.EndOffset-p.BodyOffset)}
791 }
792 p.RawLineCount = 0
793 if p.parent == nil {
794 return &offsetReader{p, p.BodyOffset, p.strict, true, false, 0}
795 }
796 return &boundReader{p: p, b: &bufAt{strict: p.strict, r: p.r, offset: p.BodyOffset}, prevlf: true}
797}
798
799// crlfReader verifies there are no bare newlines and optionally no bare carriage returns.
800type crlfReader struct {
801 r io.Reader
802 strict bool
803 prevcr bool
804}
805
806func (r *crlfReader) Read(buf []byte) (int, error) {
807 n, err := r.r.Read(buf)
808 if err == nil || err == io.EOF {
809 for _, b := range buf[:n] {
810 if b == '\n' && !r.prevcr {
811 err = errBareLF
812 break
813 } else if b != '\n' && r.prevcr && (r.strict || Pedantic) {
814 err = errBareCR
815 break
816 }
817 r.prevcr = b == '\r'
818 }
819 }
820 return n, err
821}
822
// bufAt is a buffered reader on an underlying ReaderAt.
// bufAt verifies that lines end with crlf.
//
// Lines returned by ReadLine and PeekLine are copied into scratch, so they are
// only valid until the next call that returns a line.
type bufAt struct {
	offset int64 // Offset in r currently consumed, i.e. not including any buffered data.

	strict  bool        // If set, bare carriage returns are errors and the 1000-byte line limit applies.
	r       io.ReaderAt // Underlying data.
	buf     []byte      // Buffered data.
	nbuf    int         // Valid bytes in buf.
	scratch []byte      // Holds the line most recently returned.
}
834
// maxLineLength is the maximum length of a line, including line ending, when
// not parsing strictly.
//
// Messages should not have lines longer than 78+2 bytes, and must not have
// lines longer than 998+2 bytes. But in practice they have longer lines. We
// have a higher limit, but for when parsing with strict we check for the 1000
// bytes limit.
// ../rfc/5321:3512
const maxLineLength = 8 * 1024
841
842func (b *bufAt) maxLineLength() int {
843 if b.strict || Pedantic {
844 return 1000
845 }
846 return maxLineLength
847}
848
849// ensure makes sure b.nbuf is up to maxLineLength, unless eof is encountered.
850func (b *bufAt) ensure() error {
851 for _, c := range b.buf[:b.nbuf] {
852 if c == '\n' {
853 return nil
854 }
855 }
856 if b.scratch == nil {
857 b.scratch = make([]byte, b.maxLineLength())
858 }
859 if b.buf == nil {
860 b.buf = make([]byte, b.maxLineLength())
861 }
862 for b.nbuf < b.maxLineLength() {
863 n, err := b.r.ReadAt(b.buf[b.nbuf:], b.offset+int64(b.nbuf))
864 if n > 0 {
865 b.nbuf += n
866 }
867 if err != nil && err != io.EOF || err == io.EOF && b.nbuf+n == 0 {
868 return err
869 }
870 if n == 0 || err == io.EOF {
871 break
872 }
873 }
874 return nil
875}
876
// ReadLine reads and consumes a line until \r\n is found, returning the line
// including \r\n, with crlf set to true.
// If a bare \n is encountered, or a bare \r is encountered in strict or pedantic
// mode, ReadLine returns an error. If no \r\n is found, an error is returned if
// requirecrlf is set; otherwise the remaining data is returned with crlf set to
// false.
func (b *bufAt) ReadLine(requirecrlf bool) (buf []byte, crlf bool, err error) {
	return b.line(true, requirecrlf)
}
882
// PeekLine returns the next line like ReadLine does, but without consuming it:
// the offset and the buffered data are left unchanged.
func (b *bufAt) PeekLine(requirecrlf bool) (buf []byte, crlf bool, err error) {
	return b.line(false, requirecrlf)
}
886
// line returns the next line from the buffer, including its \r\n ending, with
// crlf set to true. The returned slice is b.scratch, so it is overwritten by
// the next call.
//
// If consume is set, the returned data is removed from the buffer and b.offset
// is advanced. If no \r\n is found in the buffered data, errLineTooLong is
// returned when the buffer is filled to the maximum line length, and
// errUnexpectedEOF when requirecrlf is set. Otherwise all buffered data is
// returned with crlf set to false.
func (b *bufAt) line(consume, requirecrlf bool) (buf []byte, crlf bool, err error) {
	if err := b.ensure(); err != nil {
		return nil, false, err
	}
	for i, c := range b.buf[:b.nbuf] {
		if c == '\n' {
			// Should have seen a \r, which should have been handled below.
			return nil, false, errBareLF
		}
		if c != '\r' {
			continue
		}
		// Look at the byte following the \r. From here on, i is the index of that byte.
		i++
		if i >= b.nbuf || b.buf[i] != '\n' {
			if b.strict || Pedantic {
				return nil, false, errBareCR
			}
			continue
		}
		// Found \r\n. The line is copied to scratch, because consuming shifts the
		// contents of buf.
		b.scratch = b.scratch[:i+1]
		copy(b.scratch, b.buf[:i+1])
		if consume {
			copy(b.buf, b.buf[i+1:])
			b.offset += int64(i + 1)
			b.nbuf -= i + 1
		}
		return b.scratch, true, nil
	}
	if b.nbuf >= b.maxLineLength() {
		return nil, false, errLineTooLong
	}
	if requirecrlf {
		return nil, false, errUnexpectedEOF
	}
	// No line ending in the buffered data: return all of it.
	b.scratch = b.scratch[:b.nbuf]
	copy(b.scratch, b.buf[:b.nbuf])
	if consume {
		b.offset += int64(b.nbuf)
		b.nbuf = 0
	}
	return b.scratch, false, nil
}
929
930// PeekByte returns the next unread byte, or an error.
931func (b *bufAt) PeekByte() (byte, error) {
932 if err := b.ensure(); err != nil {
933 return 0, err
934 }
935 if b.nbuf == 0 {
936 return 0, io.EOF
937 }
938 return b.buf[0], nil
939}
940
// offsetReader reads from p.r starting at offset. While reading, it updates
// RawLineCount on p, and it sets EndOffset on p when the end of p.r is reached.
// offsetReader validates that lines end with \r\n and are not too long.
type offsetReader struct {
	p          *Part
	offset     int64 // Offset in p.r at which the next read starts.
	strict     bool  // If set, bare \r is an error and lines are limited to 1000 bytes, as with Pedantic.
	prevlf     bool  // Whether the last byte read was \n. For counting lines.
	prevcr     bool  // Whether the last byte read was \r. For detecting bare \n and bare \r.
	linelength int   // Number of bytes read so far of the current line.
}
951
952func (r *offsetReader) Read(buf []byte) (int, error) {
953 n, err := r.p.r.ReadAt(buf, r.offset)
954 if n > 0 {
955 r.offset += int64(n)
956 max := maxLineLength
957 if r.strict || Pedantic {
958 max = 1000
959 }
960
961 for _, c := range buf[:n] {
962 if r.prevlf {
963 r.p.RawLineCount++
964 }
965 if err == nil || err == io.EOF {
966 if c == '\n' && !r.prevcr {
967 err = errBareLF
968 } else if c != '\n' && r.prevcr && (r.strict || Pedantic) {
969 err = errBareCR
970 }
971 }
972 r.prevlf = c == '\n'
973 r.prevcr = c == '\r'
974 r.linelength++
975 if c == '\n' {
976 r.linelength = 0
977 } else if r.linelength > max && err == nil {
978 err = errLineTooLong
979 }
980 }
981 }
982 if err == io.EOF {
983 r.p.EndOffset = r.offset
984 }
985 return n, err
986}
987
// crlf is the line ending "\r\n". boundReader holds it back after each line it
// reads, so it can be left out when a boundary follows. The slice is shared, so
// its contents must not be modified.
var crlf = []byte("\r\n")
989
// boundReader is a reader that returns the data of a part and stops at the
// next line that matches the boundary of the parent part, whether that is a
// closing boundary or not.
// boundReader ensures lines end with crlf through its use of bufAt.
type boundReader struct {
	p      *Part  // Part being read. Its RawLineCount and EndOffset are updated while reading.
	b      *bufAt // Source of lines.
	buf    []byte // Data from previous line, to be served first.
	nbuf   int    // Number of valid bytes in buf.
	crlf   []byte // Possible crlf, to be returned if we do not yet encounter a boundary.
	prevlf bool   // If last char returned was a newline. For counting lines.
}
1000
1001func (b *boundReader) Read(buf []byte) (count int, rerr error) {
1002 origBuf := buf
1003 defer func() {
1004 if count > 0 {
1005 for _, c := range origBuf[:count] {
1006 if b.prevlf {
1007 b.p.RawLineCount++
1008 }
1009 b.prevlf = c == '\n'
1010 }
1011 }
1012 }()
1013
1014 for {
1015 // Read data from earlier line.
1016 if b.nbuf > 0 {
1017 n := b.nbuf
1018 if n > len(buf) {
1019 n = len(buf)
1020 }
1021 copy(buf, b.buf[:n])
1022 copy(b.buf, b.buf[n:])
1023 buf = buf[n:]
1024 b.nbuf -= n
1025 count += n
1026 if b.nbuf > 0 {
1027 break
1028 }
1029 }
1030
1031 // Look at next line. If it is a boundary, we are done and won't serve the crlf from the last line.
1032 line, _, err := b.b.PeekLine(false)
1033 if match, _ := checkBound(line, b.p.parent.bound); match {
1034 b.p.EndOffset = b.b.offset - int64(len(b.crlf))
1035 if b.p.parent.lastBoundOffset == b.p.BoundaryOffset {
1036 b.p.parent.nextBoundOffset = b.b.offset
1037 } else if enforceSequential {
1038 panic("access not sequential")
1039 }
1040 return count, io.EOF
1041 }
1042 if err == io.EOF {
1043 err = errMissingClosingBoundary
1044 }
1045 if err != nil && err != io.EOF {
1046 return count, err
1047 }
1048 if len(b.crlf) > 0 {
1049 n := len(b.crlf)
1050 if n > len(buf) {
1051 n = len(buf)
1052 }
1053 copy(buf, b.crlf[:n])
1054 count += n
1055 buf = buf[n:]
1056 b.crlf = b.crlf[n:]
1057 }
1058 if len(buf) == 0 {
1059 break
1060 }
1061 line, _, err = b.b.ReadLine(true)
1062 if err != nil {
1063 // Could be an unexpected end of the part.
1064 return 0, err
1065 }
1066 b.crlf = crlf // crlf will be read next time, but not if a boundary follows.
1067 n := len(line) - 2
1068 line = line[:n]
1069 if n > len(buf) {
1070 n = len(buf)
1071 }
1072 copy(buf, line[:n])
1073 count += n
1074 buf = buf[n:]
1075 line = line[n:]
1076 if len(line) > 0 {
1077 if b.buf == nil {
1078 b.buf = make([]byte, b.b.maxLineLength())
1079 }
1080 copy(b.buf, line)
1081 b.nbuf = len(line)
1082 }
1083 }
1084 return count, nil
1085}
1086
// checkBound reports whether line is a boundary line for bound. The first
// result is true if line starts with bound and bound is followed by either
// "--", the end of the line, or a space, tab, \r or \n. The second result is
// true only if bound is directly followed by "--", as for a closing boundary.
func checkBound(line, bound []byte) (bool, bool) {
	if !bytes.HasPrefix(line, bound) {
		return false, false
	}
	rest := line[len(bound):]
	switch {
	case len(rest) >= 2 && rest[0] == '-' && rest[1] == '-':
		return true, true
	case len(rest) == 0:
		return true, false
	}
	// Only whitespace or a line ending may directly follow a regular boundary.
	isSep := rest[0] == ' ' || rest[0] == '\t' || rest[0] == '\r' || rest[0] == '\n'
	return isSep, false
}
1105