1package message
2
// todo: allow more invalid content-type values, we now stop parsing on: empty media type (eg "content-type: ; name=..."), empty value for property (eg "charset="), missing quotes for characters that should be quoted (eg boundary containing "=" but without quotes), duplicate properties (two charsets), empty pairs (eg "text/html;;").
4// todo: should we be forgiving when closing boundary in multipart message is missing? seems like spam messages do this...
5// todo: should we allow base64 messages where a line starts with a space? and possibly more whitespace. is happening in messages. coreutils base64 accepts it, encoding/base64 does not.
6// todo: handle comments in headers?
7// todo: should we just always store messages with \n instead of \r\n? \r\n seems easier for use with imap.
8// todo: can use a cleanup
9
10import (
11 "bufio"
12 "bytes"
13 "encoding/base64"
14 "errors"
15 "fmt"
16 "io"
17 "log/slog"
18 "mime"
19 "mime/quotedprintable"
20 "net/mail"
21 "net/textproto"
22 "strings"
23 "time"
24 "unicode"
25
26 "golang.org/x/text/encoding/ianaindex"
27
28 "github.com/mjl-/mox/mlog"
29 "github.com/mjl-/mox/smtp"
30 "slices"
31)
32
// Pedantic enables stricter parsing. When set, code in this file rejects
// malformed content-type headers and bare carriage returns, applies the
// 1000-byte line length limit, and refuses RFC 2047 q/b-word-encoded MIME
// parameters.
var Pedantic bool
35
// Exported sentinel errors. They are wrapped with %w where returned, so test
// for them with errors.Is.
var (
	// ErrBadContentType is wrapped by newPart when the Content-Type header
	// cannot be parsed and strict or Pedantic parsing is in effect.
	ErrBadContentType = errors.New("bad content-type")
	// ErrHeader is wrapped by parseHeader for malformed header lines.
	ErrHeader = errors.New("bad message header")
)
40
// Internal sentinel errors returned while parsing parts and reading lines.
var (
	errNotMultipart           = errors.New("not a multipart message")                       // ParseNextPart called on a part without boundary.
	errFirstBoundCloses       = errors.New("first boundary cannot be finishing boundary")   // First boundary line found is a closing boundary.
	errLineTooLong            = errors.New("line too long")                                 // No line ending found within the maximum line length.
	errMissingBoundaryParam   = errors.New("missing/empty boundary content-type parameter") // Multipart content-type without usable boundary.
	errMissingClosingBoundary = errors.New("eof without closing boundary")
	errBareLF                 = errors.New("invalid bare line feed")
	errBareCR                 = errors.New("invalid bare carriage return")
	errUnexpectedEOF          = errors.New("unexpected eof") // Data ended where a crlf-terminated line was required.
)
51
// If set, during tests, attempts to reparse a part will cause an error, because
// sequentially reading parts should not lead to reparsing. ParseNextPart panics
// instead of re-reading the previous part when this is set.
var enforceSequential bool
54
// Part represents a whole mail message, or a part of a multipart message. It
// is designed to handle IMAP requirements efficiently.
//
// The exported fields describe the structure of the part (offsets, sizes, media
// type and content headers). The unexported fields hold parsing state and the
// reader for the underlying message; see SetReaderAt for setting the reader.
type Part struct {
	BoundaryOffset int64 // Offset in message where bound starts. -1 for top-level message.
	HeaderOffset   int64 // Offset in message file where header starts.
	BodyOffset     int64 // Offset in message file where body starts.
	EndOffset      int64 // Where body of part ends. Set when part is fully read.
	RawLineCount   int64 // Number of lines in raw, undecoded, body of part. Set when part is fully read.
	DecodedSize    int64 // Number of octets when decoded. If this is a text mediatype, lines ending only in LF are changed to end in CRLF and DecodedSize reflects that.

	MediaType               string            // From Content-Type, upper case. E.g. "TEXT". Can be empty because content-type may be absent. In this case, the part may be treated as TEXT/PLAIN.
	MediaSubType            string            // From Content-Type, upper case. E.g. "PLAIN".
	ContentTypeParams       map[string]string // E.g. holds "boundary" for multipart messages. Has lower-case keys, and original case values.
	ContentID               *string           `json:",omitempty"`
	ContentDescription      *string           `json:",omitempty"`
	ContentTransferEncoding *string           `json:",omitempty"` // In upper case.
	ContentDisposition      *string           `json:",omitempty"`
	ContentMD5              *string           `json:",omitempty"`
	ContentLanguage         *string           `json:",omitempty"`
	ContentLocation         *string           `json:",omitempty"`
	Envelope                *Envelope         `json:",omitempty"` // Email message headers. Not for non-message parts.

	Parts []Part // Parts if this is a multipart.

	// Only for message/rfc822 and message/global. This part may have a buffer as
	// backing io.ReaderAt, because a message/global can have a non-identity
	// content-transfer-encoding. This part has a nil parent.
	Message *Part

	r               io.ReaderAt          // Source of the message data, set by newPart or SetReaderAt.
	header          textproto.MIMEHeader // Parsed header.
	nextBoundOffset int64                // If >= 0, the offset where the next part header starts. We can set this when a user fully reads each part.
	lastBoundOffset int64                // Start of header of last/previous part. Used to skip a part if ParseNextPart is called and nextBoundOffset is -1.
	parent          *Part                // Parent part, for getting bound from, and setting nextBoundOffset when a part has finished reading. Only for subparts, not top-level parts.
	bound           []byte               // Only set if valid multipart with boundary, includes leading --, excludes \r\n.
	strict          bool                 // If set, valid crlf line endings are verified when reading body.
}
92
93// todo: have all Content* fields in Part?
94// todo: make Address contain a type Localpart and dns.Domain?
95// todo: if we ever make a major change and reparse all parts, switch to lower-case values if not too troublesome.
96
// Envelope holds the basic/common message headers as used in IMAP4.
//
// parseEnvelope fills this struct with an unkeyed composite literal, so the
// order of the fields must stay in sync with that function.
type Envelope struct {
	Date      time.Time
	Subject   string // Q/B-word-decoded.
	From      []Address
	Sender    []Address
	ReplyTo   []Address
	To        []Address
	CC        []Address
	BCC       []Address
	InReplyTo string // From In-Reply-To header, includes <>.
	MessageID string // From Message-Id header, includes <>.
}
110
// Address as used in From and To headers.
//
// parseAddressList builds values with an unkeyed composite literal, so the
// field order (Name, User, Host) must be kept. User and Host are left empty
// when the address could not be parsed.
type Address struct {
	Name string // Free-form name for display in mail applications.
	User string // Localpart, encoded as string. Must be parsed before using as Localpart.
	Host string // Domain in ASCII.
}
117
118// Parse reads the headers of the mail message and returns a part.
119// A part provides access to decoded and raw contents of a message and its multiple parts.
120//
121// If strict is set, fewer attempts are made to continue parsing when errors are
122// encountered, such as with invalid content-type headers or bare carriage returns.
123func Parse(elog *slog.Logger, strict bool, r io.ReaderAt) (Part, error) {
124 log := mlog.New("message", elog)
125 return newPart(log, strict, r, 0, nil)
126}
127
128// EnsurePart parses a part as with Parse, but ensures a usable part is always
129// returned, even if error is non-nil. If a parse error occurs, the message is
130// returned as application/octet-stream, and headers can still be read if they
131// were valid.
132//
133// If strict is set, fewer attempts are made to continue parsing when errors are
134// encountered, such as with invalid content-type headers or bare carriage returns.
135func EnsurePart(elog *slog.Logger, strict bool, r io.ReaderAt, size int64) (Part, error) {
136 log := mlog.New("message", elog)
137 p, err := Parse(log.Logger, strict, r)
138 if err == nil {
139 err = p.Walk(log.Logger, nil)
140 }
141 if err != nil {
142 np, err2 := fallbackPart(p, r, size)
143 if err2 != nil {
144 err = err2
145 }
146 p = np
147 }
148 return p, err
149}
150
151func fallbackPart(p Part, r io.ReaderAt, size int64) (Part, error) {
152 np := Part{
153 HeaderOffset: p.HeaderOffset,
154 BodyOffset: p.BodyOffset,
155 EndOffset: size,
156 MediaType: "APPLICATION",
157 MediaSubType: "OCTET-STREAM",
158 ContentTypeParams: p.ContentTypeParams,
159 ContentID: p.ContentID,
160 ContentDescription: p.ContentDescription,
161 ContentTransferEncoding: p.ContentTransferEncoding,
162 ContentDisposition: p.ContentDisposition,
163 ContentMD5: p.ContentMD5,
164 ContentLanguage: p.ContentLanguage,
165 ContentLocation: p.ContentLocation,
166 Envelope: p.Envelope,
167 // We don't keep:
168 // - BoundaryOffset: irrelevant for top-level message.
169 // - RawLineCount and DecodedSize: set below.
170 // - Parts: we are not treating this as a multipart message.
171 }
172 np.SetReaderAt(r)
173 // By reading body, the number of lines and decoded size will be set.
174 _, err := io.Copy(io.Discard, np.Reader())
175 return np, err
176}
177
178// SetReaderAt sets r as reader for this part and all its sub parts, recursively.
179// No reader is set for any Message subpart, see SetMessageReaderAt.
180func (p *Part) SetReaderAt(r io.ReaderAt) {
181 if r == nil {
182 panic("nil reader")
183 }
184 p.r = r
185 for i := range p.Parts {
186 pp := &p.Parts[i]
187 pp.SetReaderAt(r)
188 }
189}
190
191// SetMessageReaderAt sets a reader on p.Message, which must be non-nil.
192func (p *Part) SetMessageReaderAt() error {
193 // todo: if p.Message does not contain any non-identity content-transfer-encoding, we should set an offsetReader of p.Message, recursively.
194 buf, err := io.ReadAll(p.Reader())
195 if err != nil {
196 return err
197 }
198 p.Message.SetReaderAt(bytes.NewReader(buf))
199 return nil
200}
201
202// Walk through message, decoding along the way, and collecting mime part offsets and sizes, and line counts.
203func (p *Part) Walk(elog *slog.Logger, parent *Part) error {
204 log := mlog.New("message", elog)
205
206 if len(p.bound) == 0 {
207 if p.MediaType == "MESSAGE" && (p.MediaSubType == "RFC822" || p.MediaSubType == "GLOBAL") {
208 // todo: don't read whole submessage in memory...
209 buf, err := io.ReadAll(p.Reader())
210 if err != nil {
211 return err
212 }
213 br := bytes.NewReader(buf)
214 mp, err := Parse(log.Logger, p.strict, br)
215 if err != nil {
216 return fmt.Errorf("parsing embedded message: %w", err)
217 }
218 if err := mp.Walk(log.Logger, nil); err != nil {
219 // If this is a DSN and we are not in pedantic mode, accept unexpected end of
220 // message. This is quite common because MTA's sometimes just truncate the original
221 // message in a place that makes the message invalid.
222 if errors.Is(err, errUnexpectedEOF) && !Pedantic && parent != nil && len(parent.Parts) >= 3 && p == &parent.Parts[2] && parent.MediaType == "MULTIPART" && parent.MediaSubType == "REPORT" {
223 mp, err = fallbackPart(mp, br, int64(len(buf)))
224 if err != nil {
225 return fmt.Errorf("parsing invalid embedded message: %w", err)
226 }
227 } else {
228 return fmt.Errorf("parsing parts of embedded message: %w", err)
229 }
230 }
231 // todo: if mp does not contain any non-identity content-transfer-encoding, we should set an offsetReader of p.r on mp, recursively.
232 p.Message = &mp
233 return nil
234 }
235 _, err := io.Copy(io.Discard, p.Reader())
236 return err
237 }
238
239 for {
240 pp, err := p.ParseNextPart(log.Logger)
241 if err == io.EOF {
242 return nil
243 }
244 if err != nil {
245 return err
246 }
247 if err := pp.Walk(log.Logger, p); err != nil {
248 return err
249 }
250 }
251}
252
253// String returns a debugging representation of the part.
254func (p *Part) String() string {
255 return fmt.Sprintf("&Part{%s/%s offsets %d/%d/%d/%d lines %d decodedsize %d next %d last %d bound %q parts %v}", p.MediaType, p.MediaSubType, p.BoundaryOffset, p.HeaderOffset, p.BodyOffset, p.EndOffset, p.RawLineCount, p.DecodedSize, p.nextBoundOffset, p.lastBoundOffset, p.bound, p.Parts)
256}
257
258// newPart parses a new part, which can be the top-level message.
259// offset is the bound offset for parts, and the start of message for top-level messages. parent indicates if this is a top-level message or sub-part.
260// If an error occurs, p's exported values can still be relevant. EnsurePart uses these values.
261func newPart(log mlog.Log, strict bool, r io.ReaderAt, offset int64, parent *Part) (p Part, rerr error) {
262 if r == nil {
263 panic("nil reader")
264 }
265 p = Part{
266 BoundaryOffset: -1,
267 EndOffset: -1,
268 r: r,
269 parent: parent,
270 strict: strict,
271 }
272
273 b := &bufAt{strict: strict, r: r, offset: offset}
274
275 if parent != nil {
276 p.BoundaryOffset = offset
277 if line, _, err := b.ReadLine(true); err != nil {
278 return p, err
279 } else if match, finish := checkBound(line, parent.bound); !match {
280 return p, fmt.Errorf("missing bound")
281 } else if finish {
282 return p, fmt.Errorf("new part for closing boundary")
283 }
284 }
285
286 // Collect header.
287 p.HeaderOffset = b.offset
288 p.BodyOffset = b.offset
289 hb := &bytes.Buffer{}
290 for {
291 line, _, err := b.ReadLine(true)
292 if err == io.EOF {
293 // No body is valid.
294 break
295 }
296 if err != nil {
297 return p, fmt.Errorf("reading header line: %w", err)
298 }
299 hb.Write(line)
300 if len(line) == 2 {
301 break // crlf
302 }
303 }
304 p.BodyOffset = b.offset
305
306 // Don't attempt to parse empty header, mail.ReadMessage doesn't like it.
307 if p.HeaderOffset == p.BodyOffset {
308 p.header = textproto.MIMEHeader{}
309 } else {
310 h, err := parseHeader(hb)
311 if err != nil {
312 return p, fmt.Errorf("parsing header: %w", err)
313 }
314 p.header = h
315 }
316
317 ct := p.header.Get("Content-Type")
318 mt, params, err := mime.ParseMediaType(ct)
319 if err != nil && ct != "" {
320 if Pedantic || strict {
321 return p, fmt.Errorf("%w: %s: %q", ErrBadContentType, err, ct)
322 }
323
324 // Try parsing just a content-type, ignoring parameters.
325 // ../rfc/2045:628
326 ct = strings.TrimSpace(strings.SplitN(ct, ";", 2)[0])
327 t := strings.SplitN(ct, "/", 2)
328 isToken := func(s string) bool {
329 const separators = `()<>@,;:\\"/[]?= ` // ../rfc/2045:663
330 for _, c := range s {
331 if c < 0x20 || c >= 0x80 || strings.ContainsRune(separators, c) {
332 return false
333 }
334 }
335 return len(s) > 0
336 }
337 // We cannot recover content-type of multipart, we won't have a boundary.
338 if len(t) == 2 && isToken(t[0]) && !strings.EqualFold(t[0], "multipart") && isToken(t[1]) {
339 p.MediaType = strings.ToUpper(t[0])
340 p.MediaSubType = strings.ToUpper(t[1])
341 } else {
342 p.MediaType = "APPLICATION"
343 p.MediaSubType = "OCTET-STREAM"
344 }
345 log.Debugx("malformed content-type, attempting to recover and continuing", err,
346 slog.String("contenttype", p.header.Get("Content-Type")),
347 slog.String("mediatype", p.MediaType),
348 slog.String("mediasubtype", p.MediaSubType))
349 } else if mt != "" {
350 t := strings.SplitN(strings.ToUpper(mt), "/", 2)
351 if len(t) != 2 {
352 if Pedantic || strict {
353 return p, fmt.Errorf("bad content-type: %q (content-type %q)", mt, ct)
354 }
355 log.Debug("malformed media-type, ignoring and continuing", slog.String("type", mt))
356 p.MediaType = "APPLICATION"
357 p.MediaSubType = "OCTET-STREAM"
358 } else {
359 p.MediaType = t[0]
360 p.MediaSubType = t[1]
361 p.ContentTypeParams = params
362 }
363 }
364
365 p.ContentID = p.headerGet("Content-Id")
366 p.ContentDescription = p.headerGet("Content-Description")
367 cte := p.headerGet("Content-Transfer-Encoding")
368 if cte != nil {
369 s := strings.ToUpper(*cte)
370 cte = &s
371 }
372 p.ContentTransferEncoding = cte
373 p.ContentDisposition = p.headerGet("Content-Disposition")
374 p.ContentMD5 = p.headerGet("Content-Md5")
375 p.ContentLanguage = p.headerGet("Content-Language")
376 p.ContentLocation = p.headerGet("Content-Location")
377
378 if parent == nil {
379 p.Envelope, err = parseEnvelope(log, mail.Header(p.header))
380 if err != nil {
381 return p, err
382 }
383 }
384
385 if p.MediaType == "MULTIPART" {
386 s := params["boundary"]
387 if s == "" {
388 return p, errMissingBoundaryParam
389 }
390 p.bound = append([]byte("--"), s...)
391
392 // Discard preamble, before first boundary.
393 for {
394 line, _, err := b.PeekLine(true)
395 if err != nil {
396 return p, fmt.Errorf("parsing line for part preamble: %w", err)
397 }
398 // Line only needs boundary prefix, not exact match. ../rfc/2046:1103
399 // Well, for compatibility, we require whitespace after the boundary. Because some
400 // software use the same boundary but with text appended for sub parts.
401 if match, finish := checkBound(line, p.bound); match {
402 if finish {
403 return p, errFirstBoundCloses
404 }
405 break
406 }
407 b.ReadLine(true)
408 }
409 p.nextBoundOffset = b.offset
410 p.lastBoundOffset = b.offset
411 }
412
413 return p, nil
414}
415
416// Header returns the parsed header of this part.
417//
418// Returns a ErrHeader for messages with invalid header syntax.
419func (p *Part) Header() (textproto.MIMEHeader, error) {
420 if p.header != nil {
421 return p.header, nil
422 }
423 if p.HeaderOffset == p.BodyOffset {
424 p.header = textproto.MIMEHeader{}
425 return p.header, nil
426 }
427 h, err := parseHeader(p.HeaderReader())
428 p.header = h
429 return h, err
430}
431
432func (p *Part) headerGet(k string) *string {
433 l := p.header.Values(k)
434 if len(l) == 0 {
435 return nil
436 }
437 s := l[0]
438 return &s
439}
440
441// HeaderReader returns a reader for the header section of this part, including ending bare CRLF.
442func (p *Part) HeaderReader() io.Reader {
443 return io.NewSectionReader(p.r, p.HeaderOffset, p.BodyOffset-p.HeaderOffset)
444}
445
446// parse a header, only call this on non-empty input (even though that is a valid header).
447func parseHeader(r io.Reader) (textproto.MIMEHeader, error) {
448 // We read using mail.ReadMessage instead of textproto.ReadMIMEHeaders because the
449 // first handles email messages properly, while the second only works for HTTP
450 // headers.
451 var zero textproto.MIMEHeader
452
453 // We read the header and add the optional \r\n header/body separator. If the \r\n
454 // is missing, parsing with Go <1.21 results in an EOF error.
455 // todo: directly parse from reader r when Go 1.20 is no longer supported.
456 buf, err := io.ReadAll(r)
457 if err != nil {
458 return zero, err
459 }
460 if bytes.HasSuffix(buf, []byte("\r\n")) && !bytes.HasSuffix(buf, []byte("\r\n\r\n")) {
461 buf = append(buf, "\r\n"...)
462 }
463 msg, err := mail.ReadMessage(bytes.NewReader(buf))
464 if err != nil {
465 // Recognize parsing errors from net/mail.ReadMessage.
466 // todo: replace with own message parsing code that returns proper error types.
467 errstr := err.Error()
468 if strings.HasPrefix(errstr, "malformed initial line:") || strings.HasPrefix(errstr, "malformed header line:") {
469 err = fmt.Errorf("%w: %v", ErrHeader, err)
470 }
471 return zero, err
472 }
473 return textproto.MIMEHeader(msg.Header), nil
474}
475
476var wordDecoder = mime.WordDecoder{
477 CharsetReader: func(charset string, r io.Reader) (io.Reader, error) {
478 switch strings.ToLower(charset) {
479 case "", "us-ascii", "utf-8":
480 return r, nil
481 }
482 enc, _ := ianaindex.MIME.Encoding(charset)
483 if enc == nil {
484 enc, _ = ianaindex.IANA.Encoding(charset)
485 }
486 if enc == nil {
487 return r, fmt.Errorf("unknown charset %q", charset)
488 }
489 return enc.NewDecoder().Reader(r), nil
490 },
491}
492
493func parseEnvelope(log mlog.Log, h mail.Header) (*Envelope, error) {
494 date, _ := h.Date()
495
496 // We currently marshal this field to JSON. But JSON cannot represent all
497 // time.Time. Time zone of 24:00 was seen in the wild. We won't try for extreme
498 // years, but we can readjust timezones.
499 // todo: remove this once we no longer store using json.
500 _, offset := date.Zone()
501 if date.Year() > 9999 {
502 date = time.Time{}
503 } else if offset <= -24*3600 || offset >= 24*3600 {
504 date = time.Unix(date.Unix(), 0).UTC()
505 }
506
507 subject := h.Get("Subject")
508 if s, err := wordDecoder.DecodeHeader(subject); err == nil {
509 subject = s
510 }
511
512 env := &Envelope{
513 date,
514 subject,
515 parseAddressList(log, h, "from"),
516 parseAddressList(log, h, "sender"),
517 parseAddressList(log, h, "reply-to"),
518 parseAddressList(log, h, "to"),
519 parseAddressList(log, h, "cc"),
520 parseAddressList(log, h, "bcc"),
521 h.Get("In-Reply-To"),
522 h.Get("Message-Id"),
523 }
524 return env, nil
525}
526
527func parseAddressList(log mlog.Log, h mail.Header, k string) []Address {
528 // todo: possibly work around ios mail generating incorrect q-encoded "phrases" with unencoded double quotes? ../rfc/2047:382
529 v := h.Get(k)
530 if v == "" {
531 return nil
532 }
533 parser := mail.AddressParser{WordDecoder: &wordDecoder}
534 l, err := parser.ParseList(v)
535 if err != nil {
536 return nil
537 }
538 var r []Address
539 for _, a := range l {
540 // todo: parse more fully according to ../rfc/5322:959
541 var user, host string
542 addr, err := smtp.ParseNetMailAddress(a.Address)
543 if err != nil {
544 log.Infox("parsing address (continuing)", err, slog.Any("netmailaddress", a.Address))
545 } else {
546 user = addr.Localpart.String()
547 host = addr.Domain.ASCII
548 }
549 r = append(r, Address{a.Name, user, host})
550 }
551 return r
552}
553
554// ParseNextPart parses the next (sub)part of this multipart message.
555// ParseNextPart returns io.EOF and a nil part when there are no more parts.
556// Only used for initial parsing of message. Once parsed, use p.Parts.
557func (p *Part) ParseNextPart(elog *slog.Logger) (*Part, error) {
558 log := mlog.New("message", elog)
559
560 if len(p.bound) == 0 {
561 return nil, errNotMultipart
562 }
563 if p.nextBoundOffset == -1 {
564 if enforceSequential {
565 panic("access not sequential")
566 }
567 // Set nextBoundOffset by fully reading the last part.
568 last, err := newPart(log, p.strict, p.r, p.lastBoundOffset, p)
569 if err != nil {
570 return nil, err
571 }
572 if _, err := io.Copy(io.Discard, last.RawReader()); err != nil {
573 return nil, err
574 }
575 if p.nextBoundOffset == -1 {
576 return nil, fmt.Errorf("internal error: reading part did not set nextBoundOffset")
577 }
578 }
579 b := &bufAt{strict: p.strict, r: p.r, offset: p.nextBoundOffset}
580 // todo: should we require a crlf on final closing bound? we don't require it because some message/rfc822 don't have a crlf after their closing boundary, so those messages don't end in crlf.
581 line, crlf, err := b.ReadLine(false)
582 if err != nil {
583 return nil, err
584 }
585 if match, finish := checkBound(line, p.bound); !match {
586 return nil, fmt.Errorf("expected bound, got %q", line)
587 } else if finish {
588 // Read any trailing data.
589 if p.parent != nil {
590 for {
591 line, _, err := b.PeekLine(false)
592 if err != nil {
593 break
594 }
595 if match, _ := checkBound(line, p.parent.bound); match {
596 break
597 }
598 b.ReadLine(false)
599 }
600 if p.parent.lastBoundOffset == p.BoundaryOffset {
601 p.parent.nextBoundOffset = b.offset
602 }
603 }
604 p.EndOffset = b.offset
605 return nil, io.EOF
606 } else if !crlf {
607 return nil, fmt.Errorf("non-finishing bound without crlf: %w", errUnexpectedEOF)
608 }
609 boundOffset := p.nextBoundOffset
610 p.lastBoundOffset = boundOffset
611 p.nextBoundOffset = -1
612 np, err := newPart(log, p.strict, p.r, boundOffset, p)
613 if err != nil {
614 return nil, err
615 }
616 p.Parts = append(p.Parts, np)
617 return &p.Parts[len(p.Parts)-1], nil
618}
619
620// IsDSN returns whether the MIME structure of the part is a DSN.
621func (p *Part) IsDSN() bool {
622 return p.MediaType == "MULTIPART" &&
623 p.MediaSubType == "REPORT" &&
624 len(p.Parts) >= 2 &&
625 p.Parts[1].MediaType == "MESSAGE" &&
626 (p.Parts[1].MediaSubType == "DELIVERY-STATUS" || p.Parts[1].MediaSubType == "GLOBAL-DELIVERY-STATUS")
627}
628
// hasNonASCII reads r until a byte above the ASCII range is found, and returns
// whether there was one. Reading stops at the first such byte. A read error
// other than io.EOF is returned.
func hasNonASCII(r io.Reader) (bool, error) {
	br := bufio.NewReader(r)
	for {
		c, err := br.ReadByte()
		switch {
		case err == io.EOF:
			return false, nil
		case err != nil:
			return false, err
		case c > unicode.MaxASCII:
			return true, nil
		}
	}
}
644
645// NeedsSMTPUTF8 returns whether the part needs the SMTPUTF8 extension to be
646// transported, due to non-ascii in message headers.
647func (p *Part) NeedsSMTPUTF8() (bool, error) {
648 if has, err := hasNonASCII(p.HeaderReader()); err != nil {
649 return false, fmt.Errorf("reading header: %w", err)
650 } else if has {
651 return true, nil
652 }
653 for _, pp := range p.Parts {
654 if has, err := pp.NeedsSMTPUTF8(); err != nil || has {
655 return has, err
656 }
657 }
658 return false, nil
659}
660
// ErrParamEncoding is wrapped in errors returned for a Content-Disposition
// header that cannot be parsed and for MIME parameters with unexpected or
// invalid q/b-word encoding. See DispositionFilename.
var ErrParamEncoding = errors.New("bad header parameter encoding")
662
663// DispositionFilename tries to parse the disposition header and the "filename"
664// parameter. If the filename parameter is absent or can't be parsed, the "name"
665// parameter from the Content-Type header is used for the filename. The returned
666// filename is decoded according to RFC 2231 or RFC 2047. This is a best-effort
667// attempt to find a filename for a part. If no Content-Disposition header, or
668// filename was found, empty values without error are returned.
669//
670// If the returned error is an ErrParamEncoding, it can be treated as a diagnostic
671// and a filename may still be returned.
672func (p *Part) DispositionFilename() (disposition string, filename string, err error) {
673 cd := p.ContentDisposition
674 var disp string
675 var params map[string]string
676 if cd != nil && *cd != "" {
677 disp, params, err = mime.ParseMediaType(*cd)
678 }
679 if err != nil {
680 return "", "", fmt.Errorf("%w: parsing disposition header: %v", ErrParamEncoding, err)
681 }
682 filename, err = tryDecodeParam(params["filename"])
683 if filename == "" {
684 s, err2 := tryDecodeParam(p.ContentTypeParams["name"])
685 filename = s
686 if err == nil {
687 err = err2
688 }
689 }
690 return disp, filename, err
691}
692
693// Attempt q/b-word-decode name, coming from Content-Type "name" field or
694// Content-Disposition "filename" field.
695//
696// RFC 2231 specifies an encoding for non-ascii values in mime header parameters. But
697// it appears common practice to instead just q/b-word encode the values.
698// Thunderbird and gmail.com do this for the Content-Type "name" parameter.
699// gmail.com also does that for the Content-Disposition "filename" parameter, where
700// Thunderbird uses the RFC 2231-defined encoding. Go's mime.ParseMediaType parses
701// the mechanism specified in RFC 2231 only. The value for "name" we get here would
702// already be decoded properly for standards-compliant headers, like
703// "filename*0*=UTF-8”%...; filename*1*=%.... We'll look for Q/B-word encoding
704// markers ("=?"-prefix or "?="-suffix) and try to decode if present. This would
705// only cause trouble for filenames having this prefix/suffix.
706func tryDecodeParam(name string) (string, error) {
707 if name == "" || !strings.HasPrefix(name, "=?") && !strings.HasSuffix(name, "?=") {
708 return name, nil
709 }
710 // todo: find where this is allowed. it seems quite common. perhaps we should remove the pedantic check?
711 if Pedantic {
712 return name, fmt.Errorf("%w: attachment contains rfc2047 q/b-word-encoded mime parameter instead of rfc2231-encoded", ErrParamEncoding)
713 }
714 s, err := wordDecoder.DecodeHeader(name)
715 if err != nil {
716 return name, fmt.Errorf("%w: q/b-word decoding mime parameter: %v", ErrParamEncoding, err)
717 }
718 return s, nil
719}
720
721// Reader returns a reader for the decoded body content.
722func (p *Part) Reader() io.Reader {
723 return p.bodyReader(p.RawReader())
724}
725
726// ReaderUTF8OrBinary returns a reader for the decoded body content, transformed to
727// utf-8 for known mime/iana encodings (only if they aren't us-ascii or utf-8
728// already). For unknown or missing character sets/encodings, the original reader
729// is returned.
730func (p *Part) ReaderUTF8OrBinary() io.Reader {
731 return DecodeReader(p.ContentTypeParams["charset"], p.Reader())
732}
733
734func (p *Part) bodyReader(r io.Reader) io.Reader {
735 r = newDecoder(p.ContentTransferEncoding, r)
736 if p.MediaType == "TEXT" {
737 return &textReader{p, bufio.NewReader(r), 0, false}
738 }
739 return &countReader{p, r, 0}
740}
741
742// countReader is an io.Reader that passes Reads to the underlying reader.
743// when eof is read, it sets p.DecodedSize to the number of bytes returned.
744type countReader struct {
745 p *Part
746 r io.Reader
747 count int64
748}
749
750func (cr *countReader) Read(buf []byte) (int, error) {
751 n, err := cr.r.Read(buf)
752 if n >= 0 {
753 cr.count += int64(n)
754 }
755 if err == io.EOF {
756 cr.p.DecodedSize = cr.count
757 }
758 return n, err
759}
760
761// textReader is an io.Reader that ensures all lines return end in CRLF.
762// when eof is read from the underlying reader, it sets p.DecodedSize.
763type textReader struct {
764 p *Part
765 r *bufio.Reader
766 count int64
767 prevcr bool // If previous byte returned was a CR.
768}
769
770func (tr *textReader) Read(buf []byte) (int, error) {
771 o := 0
772 for o < len(buf) {
773 c, err := tr.r.ReadByte()
774 if err != nil {
775 tr.count += int64(o)
776 tr.p.DecodedSize = tr.count
777 return o, err
778 }
779 if c == '\n' && !tr.prevcr {
780 if err := tr.r.UnreadByte(); err != nil {
781 return o, err
782 }
783 buf[o] = '\r'
784 o++
785 tr.prevcr = true
786 continue
787 }
788 buf[o] = c
789 tr.prevcr = c == '\r'
790 o++
791 }
792 tr.count += int64(o)
793 return o, nil
794}
795
// newDecoder returns a reader that decodes r according to the
// content-transfer-encoding cte. Only the exact values "BASE64" and
// "QUOTED-PRINTABLE" are decoded. For a nil cte or any other value, r is
// returned unchanged.
func newDecoder(cte *string, r io.Reader) io.Reader {
	if cte == nil {
		return r
	}
	// ../rfc/2045:775
	if *cte == "BASE64" {
		return base64.NewDecoder(base64.StdEncoding, r)
	}
	if *cte == "QUOTED-PRINTABLE" {
		return quotedprintable.NewReader(r)
	}
	return r
}
810
811// RawReader returns a reader for the raw, undecoded body content. E.g. with
812// quoted-printable or base64 content intact.
813// Fully reading a part helps its parent part find its next part efficiently.
814func (p *Part) RawReader() io.Reader {
815 if p.r == nil {
816 panic("missing reader")
817 }
818 if p.EndOffset >= 0 {
819 return &crlfReader{strict: p.strict, r: io.NewSectionReader(p.r, p.BodyOffset, p.EndOffset-p.BodyOffset)}
820 }
821 p.RawLineCount = 0
822 if p.parent == nil {
823 return &offsetReader{p, p.BodyOffset, p.strict, true, false, 0}
824 }
825 return &boundReader{p: p, b: &bufAt{strict: p.strict, r: p.r, offset: p.BodyOffset}, prevlf: true}
826}
827
828// crlfReader verifies there are no bare newlines and optionally no bare carriage returns.
829type crlfReader struct {
830 r io.Reader
831 strict bool
832 prevcr bool
833}
834
835func (r *crlfReader) Read(buf []byte) (int, error) {
836 n, err := r.r.Read(buf)
837 if err == nil || err == io.EOF {
838 for _, b := range buf[:n] {
839 if b == '\n' && !r.prevcr {
840 err = errBareLF
841 break
842 } else if b != '\n' && r.prevcr && (r.strict || Pedantic) {
843 err = errBareCR
844 break
845 }
846 r.prevcr = b == '\r'
847 }
848 }
849 return n, err
850}
851
// bufAt is a buffered reader on an underlying ReaderAt.
// bufAt verifies that lines end with crlf.
//
// Lines returned by ReadLine and PeekLine are copied into scratch, so they are
// only valid until the next call that returns a line.
type bufAt struct {
	offset int64 // Offset in r currently consumed, i.e. not including any buffered data.

	strict  bool        // If set, bare carriage returns are rejected and the 1000-byte line length limit applies.
	r       io.ReaderAt // Underlying data.
	buf     []byte      // Buffered data.
	nbuf    int         // Valid bytes in buf.
	scratch []byte      // Holds the line most recently returned.
}
863
864// Messages should not have lines longer than 78+2 bytes, and must not have
865// lines longer than 998+2 bytes. But in practice they have longer lines. We
866// have a higher limit, but for when parsing with strict we check for the 1000
867// bytes limit.
868// ../rfc/5321:3512
869const maxLineLength = 8 * 1024
870
871func (b *bufAt) maxLineLength() int {
872 if b.strict || Pedantic {
873 return 1000
874 }
875 return maxLineLength
876}
877
878// ensure makes sure b.nbuf is up to maxLineLength, unless eof is encountered.
879func (b *bufAt) ensure() error {
880 if slices.Contains(b.buf[:b.nbuf], '\n') {
881 return nil
882 }
883 if b.scratch == nil {
884 b.scratch = make([]byte, b.maxLineLength())
885 }
886 if b.buf == nil {
887 b.buf = make([]byte, b.maxLineLength())
888 }
889 for b.nbuf < b.maxLineLength() {
890 n, err := b.r.ReadAt(b.buf[b.nbuf:], b.offset+int64(b.nbuf))
891 if n > 0 {
892 b.nbuf += n
893 }
894 if err != nil && err != io.EOF || err == io.EOF && b.nbuf+n == 0 {
895 return err
896 }
897 if n == 0 || err == io.EOF {
898 break
899 }
900 }
901 return nil
902}
903
904// ReadLine reads a line until \r\n is found, returning the line including \r\n.
905// If not found, or a bare \n is encountered, or a bare \r is enountered in pedantic mode, ReadLine returns an error.
906func (b *bufAt) ReadLine(requirecrlf bool) (buf []byte, crlf bool, err error) {
907 return b.line(true, requirecrlf)
908}
909
910func (b *bufAt) PeekLine(requirecrlf bool) (buf []byte, crlf bool, err error) {
911 return b.line(false, requirecrlf)
912}
913
914func (b *bufAt) line(consume, requirecrlf bool) (buf []byte, crlf bool, err error) {
915 if err := b.ensure(); err != nil {
916 return nil, false, err
917 }
918 for i, c := range b.buf[:b.nbuf] {
919 if c == '\n' {
920 // Should have seen a \r, which should have been handled below.
921 return nil, false, errBareLF
922 }
923 if c != '\r' {
924 continue
925 }
926 i++
927 if i >= b.nbuf || b.buf[i] != '\n' {
928 if b.strict || Pedantic {
929 return nil, false, errBareCR
930 }
931 continue
932 }
933 b.scratch = b.scratch[:i+1]
934 copy(b.scratch, b.buf[:i+1])
935 if consume {
936 copy(b.buf, b.buf[i+1:])
937 b.offset += int64(i + 1)
938 b.nbuf -= i + 1
939 }
940 return b.scratch, true, nil
941 }
942 if b.nbuf >= b.maxLineLength() {
943 return nil, false, errLineTooLong
944 }
945 if requirecrlf {
946 return nil, false, errUnexpectedEOF
947 }
948 b.scratch = b.scratch[:b.nbuf]
949 copy(b.scratch, b.buf[:b.nbuf])
950 if consume {
951 b.offset += int64(b.nbuf)
952 b.nbuf = 0
953 }
954 return b.scratch, false, nil
955}
956
957// PeekByte returns the next unread byte, or an error.
958func (b *bufAt) PeekByte() (byte, error) {
959 if err := b.ensure(); err != nil {
960 return 0, err
961 }
962 if b.nbuf == 0 {
963 return 0, io.EOF
964 }
965 return b.buf[0], nil
966}
967
// offsetReader reads from p.r starting from offset, and updates RawLineCount
// and EndOffset on p while reading.
// offsetReader validates lines end with \r\n.
type offsetReader struct {
	p          *Part // Part being read: p.r is read from, p.RawLineCount is incremented per line, p.EndOffset is set at eof.
	offset     int64 // Offset in p.r of the next read.
	strict     bool  // If set, the 1000 byte line length limit applies and a bare \r is an error, as with Pedantic.
	prevlf     bool  // If the last byte read was \n. For counting lines.
	prevcr     bool  // If the last byte read was \r. For detecting bare \n and bare \r.
	linelength int   // Number of bytes seen on the current line so far, reset after each \n.
}
978
979func (r *offsetReader) Read(buf []byte) (int, error) {
980 n, err := r.p.r.ReadAt(buf, r.offset)
981 if n > 0 {
982 r.offset += int64(n)
983 max := maxLineLength
984 if r.strict || Pedantic {
985 max = 1000
986 }
987
988 for _, c := range buf[:n] {
989 if r.prevlf {
990 r.p.RawLineCount++
991 }
992 if err == nil || err == io.EOF {
993 if c == '\n' && !r.prevcr {
994 err = errBareLF
995 } else if c != '\n' && r.prevcr && (r.strict || Pedantic) {
996 err = errBareCR
997 }
998 }
999 r.prevlf = c == '\n'
1000 r.prevcr = c == '\r'
1001 r.linelength++
1002 if c == '\n' {
1003 r.linelength = 0
1004 } else if r.linelength > max && err == nil {
1005 err = errLineTooLong
1006 }
1007 }
1008 }
1009 if err == io.EOF {
1010 r.p.EndOffset = r.offset
1011 }
1012 return n, err
1013}
1014
// crlf is the line ending that boundReader holds back after each line it reads,
// until it knows the next line is not a boundary. The slice is shared, it is
// only resliced and read, never written to.
var crlf = []byte("\r\n")
1016
// boundReader is a reader that stops at a closing multipart boundary.
// boundReader ensures lines end with crlf through its use of bufAt.
//
// The crlf directly before a boundary line is not returned as data: it is held
// back after each line, and only served once the next line turns out not to be a
// boundary.
type boundReader struct {
	p      *Part  // Part being read. The boundary to stop at is p.parent.bound.
	b      *bufAt // Line reader for the underlying data.
	buf    []byte // Data from previous line, to be served first.
	nbuf   int    // Number of valid bytes in buf.
	crlf   []byte // Possible crlf, to be returned if we do not yet encounter a boundary.
	prevlf bool   // If last char returned was a newline. For counting lines.
}
1027
1028func (b *boundReader) Read(buf []byte) (count int, rerr error) {
1029 origBuf := buf
1030 defer func() {
1031 if count > 0 {
1032 for _, c := range origBuf[:count] {
1033 if b.prevlf {
1034 b.p.RawLineCount++
1035 }
1036 b.prevlf = c == '\n'
1037 }
1038 }
1039 }()
1040
1041 for {
1042 // Read data from earlier line.
1043 if b.nbuf > 0 {
1044 n := min(b.nbuf, len(buf))
1045 copy(buf, b.buf[:n])
1046 copy(b.buf, b.buf[n:])
1047 buf = buf[n:]
1048 b.nbuf -= n
1049 count += n
1050 if b.nbuf > 0 {
1051 break
1052 }
1053 }
1054
1055 // Look at next line. If it is a boundary, we are done and won't serve the crlf from the last line.
1056 line, _, err := b.b.PeekLine(false)
1057 if match, _ := checkBound(line, b.p.parent.bound); match {
1058 b.p.EndOffset = b.b.offset - int64(len(b.crlf))
1059 if b.p.parent.lastBoundOffset == b.p.BoundaryOffset {
1060 b.p.parent.nextBoundOffset = b.b.offset
1061 } else if enforceSequential {
1062 panic("access not sequential")
1063 }
1064 return count, io.EOF
1065 }
1066 if err == io.EOF {
1067 err = errMissingClosingBoundary
1068 }
1069 if err != nil && err != io.EOF {
1070 return count, err
1071 }
1072 if len(b.crlf) > 0 {
1073 n := min(len(b.crlf), len(buf))
1074 copy(buf, b.crlf[:n])
1075 count += n
1076 buf = buf[n:]
1077 b.crlf = b.crlf[n:]
1078 }
1079 if len(buf) == 0 {
1080 break
1081 }
1082 line, _, err = b.b.ReadLine(true)
1083 if err != nil {
1084 // Could be an unexpected end of the part.
1085 return 0, err
1086 }
1087 b.crlf = crlf // crlf will be read next time, but not if a boundary follows.
1088 n := len(line) - 2
1089 line = line[:n]
1090 if n > len(buf) {
1091 n = len(buf)
1092 }
1093 copy(buf, line[:n])
1094 count += n
1095 buf = buf[n:]
1096 line = line[n:]
1097 if len(line) > 0 {
1098 if b.buf == nil {
1099 b.buf = make([]byte, b.b.maxLineLength())
1100 }
1101 copy(b.buf, line)
1102 b.nbuf = len(line)
1103 }
1104 }
1105 return count, nil
1106}
1107
// checkBound returns whether line is a boundary line for bound, and if so,
// whether it is a closing boundary.
//
// The line matches if it starts with bound, followed by either nothing, a
// space, tab, \r or \n, or by "--". Only in the last case the second return
// value is true, whatever follows the "--" is not looked at.
func checkBound(line, bound []byte) (bool, bool) {
	if !bytes.HasPrefix(line, bound) {
		return false, false
	}
	rest := line[len(bound):]

	// Closing boundary has "--" directly after the boundary.
	if len(rest) >= 2 && rest[0] == '-' && rest[1] == '-' {
		return true, true
	}
	if len(rest) == 0 {
		return true, false
	}

	// Anything but whitespace or a line ending means this is a different, longer,
	// boundary or regular data.
	next := rest[0]
	match := next == ' ' || next == '\t' || next == '\r' || next == '\n'
	return match, false
}
1126