1package message
2
// todo: allow more invalid content-type values, we now stop parsing on: empty media type (eg "content-type: ; name=..."), empty value for property (eg "charset="), missing quotes for characters that should be quoted (eg boundary containing "=" but without quotes), duplicate properties (two charsets), empty pairs (eg "text/html;;").
4// todo: should we be forgiving when closing boundary in multipart message is missing? seems like spam messages do this...
5// todo: should we allow base64 messages where a line starts with a space? and possibly more whitespace. is happening in messages. coreutils base64 accepts it, encoding/base64 does not.
6// todo: handle comments in headers?
7// todo: should we just always store messages with \n instead of \r\n? \r\n seems easier for use with imap.
8// todo: can use a cleanup
9
10import (
11 "bufio"
12 "bytes"
13 "encoding/base64"
14 "errors"
15 "fmt"
16 "io"
17 "log/slog"
18 "mime"
19 "mime/quotedprintable"
20 "net/mail"
21 "net/textproto"
22 "strings"
23 "time"
24
25 "golang.org/x/text/encoding/ianaindex"
26
27 "github.com/mjl-/mox/mlog"
28 "github.com/mjl-/mox/smtp"
29)
30
// Pedantic enables stricter parsing. When set, the code in this file behaves as
// if strict parsing was requested: malformed content-types are errors, bare
// carriage returns are rejected and the 1000-byte line length limit applies.
var Pedantic bool

var (
	// ErrBadContentType is wrapped in the error returned for a Content-Type header
	// that cannot be parsed while parsing is strict or pedantic.
	ErrBadContentType = errors.New("bad content-type")
)

// Internal errors returned while parsing and reading parts.
var (
	errNotMultipart           = errors.New("not a multipart message")                       // ParseNextPart called on a part without boundary.
	errFirstBoundCloses       = errors.New("first boundary cannot be finishing boundary")   // First boundary found in a multipart is a closing boundary.
	errLineTooLong            = errors.New("line too long")                                 // Line exceeds the applicable maximum line length.
	errMissingBoundaryParam   = errors.New("missing/empty boundary content-type parameter") // Multipart content-type without usable boundary.
	errMissingClosingBoundary = errors.New("eof without closing boundary")                  // Data ended while reading a sub part.
	errBareLF                 = errors.New("invalid bare line feed")                        // \n not preceded by \r.
	errBareCR                 = errors.New("invalid bare carriage return")                  // \r not followed by \n, only in strict/pedantic mode.
	errUnexpectedEOF          = errors.New("unexpected eof")                                // Data ended where a crlf-terminated line was required.
)

// If set, during tests, attempts to reparse a part will cause an error, because sequentially reading parts should not lead to reparsing.
var enforceSequential bool
51
// Part represents a whole mail message, or a part of a multipart message. It
// is designed to handle IMAP requirements efficiently.
//
// The exported fields describe the structure of the message (offsets, sizes,
// content headers). The unexported fields hold parsing state and the reader the
// offsets refer to.
type Part struct {
	BoundaryOffset int64 // Offset in message where bound starts. -1 for top-level message.
	HeaderOffset   int64 // Offset in message file where header starts.
	BodyOffset     int64 // Offset in message file where body starts.
	EndOffset      int64 // Where body of part ends. Set when part is fully read.
	RawLineCount   int64 // Number of lines in raw, undecoded, body of part. Set when part is fully read.
	DecodedSize    int64 // Number of octets when decoded. If this is a text mediatype, lines ending only in LF are changed to end in CRLF and DecodedSize reflects that.

	MediaType               string            // From Content-Type, upper case. E.g. "TEXT". Can be empty because content-type may be absent. In this case, the part may be treated as TEXT/PLAIN.
	MediaSubType            string            // From Content-Type, upper case. E.g. "PLAIN".
	ContentTypeParams       map[string]string // E.g. holds "boundary" for multipart messages. Has lower-case keys, and original case values.
	ContentID               string
	ContentDescription      string
	ContentTransferEncoding string    // In upper case.
	Envelope                *Envelope // Email message headers. Not for non-message parts.

	Parts []Part // Parts if this is a multipart.

	// Only for message/rfc822 and message/global. This part may have a buffer as
	// backing io.ReaderAt, because a message/global can have a non-identity
	// content-transfer-encoding. This part has a nil parent.
	Message *Part

	r               io.ReaderAt          // Source the offsets above refer to.
	header          textproto.MIMEHeader // Parsed header.
	nextBoundOffset int64                // If >= 0, the offset where the next part header starts. We can set this when a user fully reads each part.
	lastBoundOffset int64                // Start of header of last/previous part. Used to skip a part if ParseNextPart is called and nextBoundOffset is -1.
	parent          *Part                // Parent part, for getting bound from, and setting nextBoundOffset when a part has finished reading. Only for subparts, not top-level parts.
	bound           []byte               // Only set if valid multipart with boundary, includes leading --, excludes \r\n.
	strict          bool                 // If set, valid crlf line endings are verified when reading body.
}
85
86// todo: have all Content* fields in Part?
87// todo: make Address contain a type Localpart and dns.Domain?
88// todo: if we ever make a major change and reparse all parts, switch to lower-case values if not too troublesome.
89
// Envelope holds the basic/common message headers as used in IMAP4.
// It is filled from the message header when a top-level part is parsed.
type Envelope struct {
	Date      time.Time // From the Date header. Zero if absent or not parseable.
	Subject   string    // Q/B-word-decoded.
	From      []Address
	Sender    []Address
	ReplyTo   []Address
	To        []Address
	CC        []Address
	BCC       []Address
	InReplyTo string // From In-Reply-To header, includes <>.
	MessageID string // From Message-Id header, includes <>.
}
103
// Address as used in From and To headers.
// User and Host are empty if the address could not be parsed.
type Address struct {
	Name string // Free-form name for display in mail applications.
	User string // Localpart, encoded as string. Must be parsed before using as Localpart.
	Host string // Domain in ASCII.
}
110
111// Parse reads the headers of the mail message and returns a part.
112// A part provides access to decoded and raw contents of a message and its multiple parts.
113//
114// If strict is set, fewer attempts are made to continue parsing when errors are
115// encountered, such as with invalid content-type headers or bare carriage returns.
116func Parse(elog *slog.Logger, strict bool, r io.ReaderAt) (Part, error) {
117 log := mlog.New("message", elog)
118 return newPart(log, strict, r, 0, nil)
119}
120
121// EnsurePart parses a part as with Parse, but ensures a usable part is always
122// returned, even if error is non-nil. If a parse error occurs, the message is
123// returned as application/octet-stream, and headers can still be read if they
124// were valid.
125//
126// If strict is set, fewer attempts are made to continue parsing when errors are
127// encountered, such as with invalid content-type headers or bare carriage returns.
128func EnsurePart(elog *slog.Logger, strict bool, r io.ReaderAt, size int64) (Part, error) {
129 log := mlog.New("message", elog)
130 p, err := Parse(log.Logger, strict, r)
131 if err == nil {
132 err = p.Walk(log.Logger, nil)
133 }
134 if err != nil {
135 np, err2 := fallbackPart(p, r, size)
136 if err2 != nil {
137 err = err2
138 }
139 p = np
140 }
141 return p, err
142}
143
144func fallbackPart(p Part, r io.ReaderAt, size int64) (Part, error) {
145 np := Part{
146 HeaderOffset: p.HeaderOffset,
147 BodyOffset: p.BodyOffset,
148 EndOffset: size,
149 MediaType: "APPLICATION",
150 MediaSubType: "OCTET-STREAM",
151 ContentTypeParams: p.ContentTypeParams,
152 ContentID: p.ContentID,
153 ContentDescription: p.ContentDescription,
154 ContentTransferEncoding: p.ContentTransferEncoding,
155 Envelope: p.Envelope,
156 // We don't keep:
157 // - BoundaryOffset: irrelevant for top-level message.
158 // - RawLineCount and DecodedSize: set below.
159 // - Parts: we are not treating this as a multipart message.
160 }
161 np.SetReaderAt(r)
162 // By reading body, the number of lines and decoded size will be set.
163 _, err := io.Copy(io.Discard, np.Reader())
164 return np, err
165}
166
167// SetReaderAt sets r as reader for this part and all its sub parts, recursively.
168// No reader is set for any Message subpart, see SetMessageReaderAt.
169func (p *Part) SetReaderAt(r io.ReaderAt) {
170 if r == nil {
171 panic("nil reader")
172 }
173 p.r = r
174 for i := range p.Parts {
175 pp := &p.Parts[i]
176 pp.SetReaderAt(r)
177 }
178}
179
180// SetMessageReaderAt sets a reader on p.Message, which must be non-nil.
181func (p *Part) SetMessageReaderAt() error {
182 // todo: if p.Message does not contain any non-identity content-transfer-encoding, we should set an offsetReader of p.Message, recursively.
183 buf, err := io.ReadAll(p.Reader())
184 if err != nil {
185 return err
186 }
187 p.Message.SetReaderAt(bytes.NewReader(buf))
188 return nil
189}
190
191// Walk through message, decoding along the way, and collecting mime part offsets and sizes, and line counts.
192func (p *Part) Walk(elog *slog.Logger, parent *Part) error {
193 log := mlog.New("message", elog)
194
195 if len(p.bound) == 0 {
196 if p.MediaType == "MESSAGE" && (p.MediaSubType == "RFC822" || p.MediaSubType == "GLOBAL") {
197 // todo: don't read whole submessage in memory...
198 buf, err := io.ReadAll(p.Reader())
199 if err != nil {
200 return err
201 }
202 br := bytes.NewReader(buf)
203 mp, err := Parse(log.Logger, p.strict, br)
204 if err != nil {
205 return fmt.Errorf("parsing embedded message: %w", err)
206 }
207 if err := mp.Walk(log.Logger, nil); err != nil {
208 // If this is a DSN and we are not in pedantic mode, accept unexpected end of
209 // message. This is quite common because MTA's sometimes just truncate the original
210 // message in a place that makes the message invalid.
211 if errors.Is(err, errUnexpectedEOF) && !Pedantic && parent != nil && len(parent.Parts) >= 3 && p == &parent.Parts[2] && parent.MediaType == "MULTIPART" && parent.MediaSubType == "REPORT" {
212 mp, err = fallbackPart(mp, br, int64(len(buf)))
213 if err != nil {
214 return fmt.Errorf("parsing invalid embedded message: %w", err)
215 }
216 } else {
217 return fmt.Errorf("parsing parts of embedded message: %w", err)
218 }
219 }
220 // todo: if mp does not contain any non-identity content-transfer-encoding, we should set an offsetReader of p.r on mp, recursively.
221 p.Message = &mp
222 return nil
223 }
224 _, err := io.Copy(io.Discard, p.Reader())
225 return err
226 }
227
228 for {
229 pp, err := p.ParseNextPart(log.Logger)
230 if err == io.EOF {
231 return nil
232 }
233 if err != nil {
234 return err
235 }
236 if err := pp.Walk(log.Logger, p); err != nil {
237 return err
238 }
239 }
240}
241
242// String returns a debugging representation of the part.
243func (p *Part) String() string {
244 return fmt.Sprintf("&Part{%s/%s offsets %d/%d/%d/%d lines %d decodedsize %d next %d last %d bound %q parts %v}", p.MediaType, p.MediaSubType, p.BoundaryOffset, p.HeaderOffset, p.BodyOffset, p.EndOffset, p.RawLineCount, p.DecodedSize, p.nextBoundOffset, p.lastBoundOffset, p.bound, p.Parts)
245}
246
// newPart parses a new part, which can be the top-level message.
// offset is the bound offset for parts, and the start of message for top-level messages. parent indicates if this is a top-level message or sub-part.
// If an error occurs, p's exported values can still be relevant. EnsurePart uses these values.
//
// newPart reads the boundary line (for sub parts), collects and parses the
// header, fills in the content fields and, for top-level messages, the
// envelope. For a multipart, the preamble is skipped and the offset of the
// first boundary is recorded. The body itself is not read. newPart panics if r
// is nil.
func newPart(log mlog.Log, strict bool, r io.ReaderAt, offset int64, parent *Part) (p Part, rerr error) {
	if r == nil {
		panic("nil reader")
	}
	p = Part{
		BoundaryOffset: -1,
		EndOffset:      -1,
		r:              r,
		parent:         parent,
		strict:         strict,
	}

	b := &bufAt{strict: strict, r: r, offset: offset}

	if parent != nil {
		// A sub part starts with the boundary line of its parent, which must not be the
		// closing boundary.
		p.BoundaryOffset = offset
		if line, _, err := b.ReadLine(true); err != nil {
			return p, err
		} else if match, finish := checkBound(line, parent.bound); !match {
			return p, fmt.Errorf("missing bound")
		} else if finish {
			return p, fmt.Errorf("new part for closing boundary")
		}
	}

	// Collect header.
	p.HeaderOffset = b.offset
	p.BodyOffset = b.offset
	hb := &bytes.Buffer{}
	for {
		line, _, err := b.ReadLine(true)
		if err == io.EOF {
			// No body is valid.
			break
		}
		if err != nil {
			return p, fmt.Errorf("reading header line: %w", err)
		}
		hb.Write(line)
		// A line of only crlf separates header from body, and is part of the header section.
		if len(line) == 2 {
			break // crlf
		}
	}
	p.BodyOffset = b.offset

	// Don't attempt to parse empty header, mail.ReadMessage doesn't like it.
	if p.HeaderOffset == p.BodyOffset {
		p.header = textproto.MIMEHeader{}
	} else {
		h, err := parseHeader(hb)
		if err != nil {
			return p, fmt.Errorf("parsing header: %w", err)
		}
		p.header = h
	}

	ct := p.header.Get("Content-Type")
	mt, params, err := mime.ParseMediaType(ct)
	if err != nil && ct != "" {
		// Invalid content-type. An error when strict/pedantic, otherwise we try to
		// salvage the media type.
		if Pedantic || strict {
			return p, fmt.Errorf("%w: %s: %q", ErrBadContentType, err, ct)
		}

		// Try parsing just a content-type, ignoring parameters.
		// ../rfc/2045:628
		ct = strings.TrimSpace(strings.SplitN(ct, ";", 2)[0])
		t := strings.SplitN(ct, "/", 2)
		// isToken returns whether s is non-empty and has no control, non-ascii or
		// separator characters.
		isToken := func(s string) bool {
			const separators = `()<>@,;:\\"/[]?= ` // ../rfc/2045:663
			for _, c := range s {
				if c < 0x20 || c >= 0x80 || strings.ContainsRune(separators, c) {
					return false
				}
			}
			return len(s) > 0
		}
		// We cannot recover content-type of multipart, we won't have a boundary.
		if len(t) == 2 && isToken(t[0]) && !strings.EqualFold(t[0], "multipart") && isToken(t[1]) {
			p.MediaType = strings.ToUpper(t[0])
			p.MediaSubType = strings.ToUpper(t[1])
		} else {
			p.MediaType = "APPLICATION"
			p.MediaSubType = "OCTET-STREAM"
		}
		log.Debugx("malformed content-type, attempting to recover and continuing", err,
			slog.String("contenttype", p.header.Get("Content-Type")),
			slog.String("mediatype", p.MediaType),
			slog.String("mediasubtype", p.MediaSubType))
	} else if mt != "" {
		t := strings.SplitN(strings.ToUpper(mt), "/", 2)
		if len(t) != 2 {
			// Media type without subtype.
			if Pedantic || strict {
				return p, fmt.Errorf("bad content-type: %q (content-type %q)", mt, ct)
			}
			log.Debug("malformed media-type, ignoring and continuing", slog.String("type", mt))
			p.MediaType = "APPLICATION"
			p.MediaSubType = "OCTET-STREAM"
		} else {
			p.MediaType = t[0]
			p.MediaSubType = t[1]
			p.ContentTypeParams = params
		}
	}

	p.ContentID = p.header.Get("Content-Id")
	p.ContentDescription = p.header.Get("Content-Description")
	p.ContentTransferEncoding = strings.ToUpper(p.header.Get("Content-Transfer-Encoding"))

	// Only top-level messages get an envelope.
	if parent == nil {
		p.Envelope, err = parseEnvelope(log, mail.Header(p.header))
		if err != nil {
			return p, err
		}
	}

	if p.MediaType == "MULTIPART" {
		s := params["boundary"]
		if s == "" {
			return p, errMissingBoundaryParam
		}
		p.bound = append([]byte("--"), s...)

		// Discard preamble, before first boundary.
		for {
			line, _, err := b.PeekLine(true)
			if err != nil {
				return p, fmt.Errorf("parsing line for part preamble: %w", err)
			}
			// Line only needs boundary prefix, not exact match. ../rfc/2046:1103
			// Well, for compatibility, we require whitespace after the boundary. Because some
			// software use the same boundary but with text appended for sub parts.
			if match, finish := checkBound(line, p.bound); match {
				if finish {
					return p, errFirstBoundCloses
				}
				break
			}
			// Not a boundary, consume the line that was just peeked.
			b.ReadLine(true)
		}
		// The boundary line was only peeked, so b.offset is the start of the first
		// boundary.
		p.nextBoundOffset = b.offset
		p.lastBoundOffset = b.offset
	}

	return p, nil
}
395
396// Header returns the parsed header of this part.
397func (p *Part) Header() (textproto.MIMEHeader, error) {
398 if p.header != nil {
399 return p.header, nil
400 }
401 if p.HeaderOffset == p.BodyOffset {
402 p.header = textproto.MIMEHeader{}
403 return p.header, nil
404 }
405 h, err := parseHeader(p.HeaderReader())
406 p.header = h
407 return h, err
408}
409
410// HeaderReader returns a reader for the header section of this part, including ending bare CRLF.
411func (p *Part) HeaderReader() io.Reader {
412 return io.NewSectionReader(p.r, p.HeaderOffset, p.BodyOffset-p.HeaderOffset)
413}
414
// parseHeader parses a message header read from r. Only call this on non-empty
// input (even though empty input is a valid header).
//
// On error, a nil header is returned.
func parseHeader(r io.Reader) (textproto.MIMEHeader, error) {
	// We read using mail.ReadMessage instead of textproto.ReadMIMEHeaders because the
	// first handles email messages properly, while the second only works for HTTP
	// headers.

	// We read the header and add the optional \r\n header/body separator. If the \r\n
	// is missing, parsing with Go <1.21 results in an EOF error.
	// todo: directly parse from reader r when Go 1.20 is no longer supported.
	raw, err := io.ReadAll(r)
	if err != nil {
		return nil, err
	}

	lineEnd := []byte("\r\n")
	headerEnd := []byte("\r\n\r\n")
	endsLine := bytes.HasSuffix(raw, lineEnd)
	endsHeader := bytes.HasSuffix(raw, headerEnd)
	if endsLine && !endsHeader {
		raw = append(raw, lineEnd...)
	}

	msg, err := mail.ReadMessage(bytes.NewReader(raw))
	if err != nil {
		return nil, err
	}
	return textproto.MIMEHeader(msg.Header), nil
}
438
439var wordDecoder = mime.WordDecoder{
440 CharsetReader: func(charset string, r io.Reader) (io.Reader, error) {
441 switch strings.ToLower(charset) {
442 case "", "us-ascii", "utf-8":
443 return r, nil
444 }
445 enc, _ := ianaindex.MIME.Encoding(charset)
446 if enc == nil {
447 enc, _ = ianaindex.IANA.Encoding(charset)
448 }
449 if enc == nil {
450 return r, fmt.Errorf("unknown charset %q", charset)
451 }
452 return enc.NewDecoder().Reader(r), nil
453 },
454}
455
456func parseEnvelope(log mlog.Log, h mail.Header) (*Envelope, error) {
457 date, _ := h.Date()
458
459 // We currently marshal this field to JSON. But JSON cannot represent all
460 // time.Time. Time zone of 24:00 was seen in the wild. We won't try for extreme
461 // years, but we can readjust timezones.
462 // todo: remove this once we no longer store using json.
463 _, offset := date.Zone()
464 if date.Year() > 9999 {
465 date = time.Time{}
466 } else if offset <= -24*3600 || offset >= 24*3600 {
467 date = time.Unix(date.Unix(), 0).UTC()
468 }
469
470 subject := h.Get("Subject")
471 if s, err := wordDecoder.DecodeHeader(subject); err == nil {
472 subject = s
473 }
474
475 env := &Envelope{
476 date,
477 subject,
478 parseAddressList(log, h, "from"),
479 parseAddressList(log, h, "sender"),
480 parseAddressList(log, h, "reply-to"),
481 parseAddressList(log, h, "to"),
482 parseAddressList(log, h, "cc"),
483 parseAddressList(log, h, "bcc"),
484 h.Get("In-Reply-To"),
485 h.Get("Message-Id"),
486 }
487 return env, nil
488}
489
490func parseAddressList(log mlog.Log, h mail.Header, k string) []Address {
491 // todo: possibly work around ios mail generating incorrect q-encoded "phrases" with unencoded double quotes? ../rfc/2047:382
492 v := h.Get(k)
493 if v == "" {
494 return nil
495 }
496 parser := mail.AddressParser{WordDecoder: &wordDecoder}
497 l, err := parser.ParseList(v)
498 if err != nil {
499 return nil
500 }
501 var r []Address
502 for _, a := range l {
503 // todo: parse more fully according to ../rfc/5322:959
504 var user, host string
505 addr, err := smtp.ParseNetMailAddress(a.Address)
506 if err != nil {
507 log.Infox("parsing address (continuing)", err, slog.Any("netmailaddress", a.Address))
508 } else {
509 user = addr.Localpart.String()
510 host = addr.Domain.ASCII
511 }
512 r = append(r, Address{a.Name, user, host})
513 }
514 return r
515}
516
517// ParseNextPart parses the next (sub)part of this multipart message.
518// ParseNextPart returns io.EOF and a nil part when there are no more parts.
519// Only used for initial parsing of message. Once parsed, use p.Parts.
520func (p *Part) ParseNextPart(elog *slog.Logger) (*Part, error) {
521 log := mlog.New("message", elog)
522
523 if len(p.bound) == 0 {
524 return nil, errNotMultipart
525 }
526 if p.nextBoundOffset == -1 {
527 if enforceSequential {
528 panic("access not sequential")
529 }
530 // Set nextBoundOffset by fully reading the last part.
531 last, err := newPart(log, p.strict, p.r, p.lastBoundOffset, p)
532 if err != nil {
533 return nil, err
534 }
535 if _, err := io.Copy(io.Discard, last.RawReader()); err != nil {
536 return nil, err
537 }
538 if p.nextBoundOffset == -1 {
539 return nil, fmt.Errorf("internal error: reading part did not set nextBoundOffset")
540 }
541 }
542 b := &bufAt{strict: p.strict, r: p.r, offset: p.nextBoundOffset}
543 // todo: should we require a crlf on final closing bound? we don't require it because some message/rfc822 don't have a crlf after their closing boundary, so those messages don't end in crlf.
544 line, crlf, err := b.ReadLine(false)
545 if err != nil {
546 return nil, err
547 }
548 if match, finish := checkBound(line, p.bound); !match {
549 return nil, fmt.Errorf("expected bound, got %q", line)
550 } else if finish {
551 // Read any trailing data.
552 if p.parent != nil {
553 for {
554 line, _, err := b.PeekLine(false)
555 if err != nil {
556 break
557 }
558 if match, _ := checkBound(line, p.parent.bound); match {
559 break
560 }
561 b.ReadLine(false)
562 }
563 if p.parent.lastBoundOffset == p.BoundaryOffset {
564 p.parent.nextBoundOffset = b.offset
565 }
566 }
567 p.EndOffset = b.offset
568 return nil, io.EOF
569 } else if !crlf {
570 return nil, fmt.Errorf("non-finishing bound without crlf: %w", errUnexpectedEOF)
571 }
572 boundOffset := p.nextBoundOffset
573 p.lastBoundOffset = boundOffset
574 p.nextBoundOffset = -1
575 np, err := newPart(log, p.strict, p.r, boundOffset, p)
576 if err != nil {
577 return nil, err
578 }
579 p.Parts = append(p.Parts, np)
580 return &p.Parts[len(p.Parts)-1], nil
581}
582
583// IsDSN returns whether the MIME structure of the part is a DSN.
584func (p *Part) IsDSN() bool {
585 return p.MediaType == "MULTIPART" &&
586 p.MediaSubType == "REPORT" &&
587 len(p.Parts) >= 2 &&
588 p.Parts[1].MediaType == "MESSAGE" &&
589 (p.Parts[1].MediaSubType == "DELIVERY-STATUS" || p.Parts[1].MediaSubType == "GLOBAL-DELIVERY-STATUS")
590}
591
592// Reader returns a reader for the decoded body content.
593func (p *Part) Reader() io.Reader {
594 return p.bodyReader(p.RawReader())
595}
596
597// ReaderUTF8OrBinary returns a reader for the decoded body content, transformed to
598// utf-8 for known mime/iana encodings (only if they aren't us-ascii or utf-8
599// already). For unknown or missing character sets/encodings, the original reader
600// is returned.
601func (p *Part) ReaderUTF8OrBinary() io.Reader {
602 return DecodeReader(p.ContentTypeParams["charset"], p.Reader())
603}
604
605func (p *Part) bodyReader(r io.Reader) io.Reader {
606 r = newDecoder(p.ContentTransferEncoding, r)
607 if p.MediaType == "TEXT" {
608 return &textReader{p, bufio.NewReader(r), 0, false}
609 }
610 return &countReader{p, r, 0}
611}
612
613// countReader is an io.Reader that passes Reads to the underlying reader.
614// when eof is read, it sets p.DecodedSize to the number of bytes returned.
615type countReader struct {
616 p *Part
617 r io.Reader
618 count int64
619}
620
621func (cr *countReader) Read(buf []byte) (int, error) {
622 n, err := cr.r.Read(buf)
623 if n >= 0 {
624 cr.count += int64(n)
625 }
626 if err == io.EOF {
627 cr.p.DecodedSize = cr.count
628 }
629 return n, err
630}
631
632// textReader is an io.Reader that ensures all lines return end in CRLF.
633// when eof is read from the underlying reader, it sets p.DecodedSize.
634type textReader struct {
635 p *Part
636 r *bufio.Reader
637 count int64
638 prevcr bool // If previous byte returned was a CR.
639}
640
641func (tr *textReader) Read(buf []byte) (int, error) {
642 o := 0
643 for o < len(buf) {
644 c, err := tr.r.ReadByte()
645 if err != nil {
646 tr.count += int64(o)
647 tr.p.DecodedSize = tr.count
648 return o, err
649 }
650 if c == '\n' && !tr.prevcr {
651 buf[o] = '\r'
652 o++
653 tr.prevcr = true
654 tr.r.UnreadByte()
655 continue
656 }
657 buf[o] = c
658 tr.prevcr = c == '\r'
659 o++
660 }
661 tr.count += int64(o)
662 return o, nil
663}
664
// newDecoder returns a reader that decodes r according to
// content-transfer-encoding cte, which must be in upper case. For any encoding
// other than BASE64 and QUOTED-PRINTABLE, r itself is returned.
func newDecoder(cte string, r io.Reader) io.Reader {
	// ../rfc/2045:775
	if cte == "BASE64" {
		return base64.NewDecoder(base64.StdEncoding, r)
	}
	if cte == "QUOTED-PRINTABLE" {
		return quotedprintable.NewReader(r)
	}
	return r
}
675
676// RawReader returns a reader for the raw, undecoded body content. E.g. with
677// quoted-printable or base64 content intact.
678// Fully reading a part helps its parent part find its next part efficiently.
679func (p *Part) RawReader() io.Reader {
680 if p.r == nil {
681 panic("missing reader")
682 }
683 if p.EndOffset >= 0 {
684 return &crlfReader{strict: p.strict, r: io.NewSectionReader(p.r, p.BodyOffset, p.EndOffset-p.BodyOffset)}
685 }
686 p.RawLineCount = 0
687 if p.parent == nil {
688 return &offsetReader{p, p.BodyOffset, p.strict, true, false, 0}
689 }
690 return &boundReader{p: p, b: &bufAt{strict: p.strict, r: p.r, offset: p.BodyOffset}, prevlf: true}
691}
692
693// crlfReader verifies there are no bare newlines and optionally no bare carriage returns.
694type crlfReader struct {
695 r io.Reader
696 strict bool
697 prevcr bool
698}
699
700func (r *crlfReader) Read(buf []byte) (int, error) {
701 n, err := r.r.Read(buf)
702 if err == nil || err == io.EOF {
703 for _, b := range buf[:n] {
704 if b == '\n' && !r.prevcr {
705 err = errBareLF
706 break
707 } else if b != '\n' && r.prevcr && (r.strict || Pedantic) {
708 err = errBareCR
709 break
710 }
711 r.prevcr = b == '\r'
712 }
713 }
714 return n, err
715}
716
// bufAt is a buffered reader on an underlying ReaderAt.
// bufAt verifies that lines end with crlf.
// Buffers are allocated on first use, with the maximum line length as size.
type bufAt struct {
	offset int64 // Offset in r currently consumed, i.e. not including any buffered data.

	strict  bool
	r       io.ReaderAt
	buf     []byte // Buffered data.
	nbuf    int    // Valid bytes in buf.
	scratch []byte // Holds the line returned by ReadLine/PeekLine, overwritten by the next call.
}
728
729// Messages should not have lines longer than 78+2 bytes, and must not have
730// lines longer than 998+2 bytes. But in practice they have longer lines. We
731// have a higher limit, but for when parsing with strict we check for the 1000
732// bytes limit.
733// ../rfc/5321:3512
734const maxLineLength = 8 * 1024
735
736func (b *bufAt) maxLineLength() int {
737 if b.strict || Pedantic {
738 return 1000
739 }
740 return maxLineLength
741}
742
743// ensure makes sure b.nbuf is up to maxLineLength, unless eof is encountered.
744func (b *bufAt) ensure() error {
745 for _, c := range b.buf[:b.nbuf] {
746 if c == '\n' {
747 return nil
748 }
749 }
750 if b.scratch == nil {
751 b.scratch = make([]byte, b.maxLineLength())
752 }
753 if b.buf == nil {
754 b.buf = make([]byte, b.maxLineLength())
755 }
756 for b.nbuf < b.maxLineLength() {
757 n, err := b.r.ReadAt(b.buf[b.nbuf:], b.offset+int64(b.nbuf))
758 if n > 0 {
759 b.nbuf += n
760 }
761 if err != nil && err != io.EOF || err == io.EOF && b.nbuf+n == 0 {
762 return err
763 }
764 if n == 0 || err == io.EOF {
765 break
766 }
767 }
768 return nil
769}
770
// ReadLine reads a line until \r\n is found, returning the line including \r\n.
// If not found, or a bare \n is encountered, or a bare \r is encountered in
// strict or pedantic mode, ReadLine returns an error. Except that with
// requirecrlf false, remaining data without \r\n is returned as final line, with
// crlf set to false.
// The returned line is consumed. The returned buffer is overwritten by the next
// ReadLine or PeekLine.
func (b *bufAt) ReadLine(requirecrlf bool) (buf []byte, crlf bool, err error) {
	return b.line(true, requirecrlf)
}

// PeekLine is like ReadLine, but does not consume the line: the next ReadLine
// or PeekLine returns the same line.
func (b *bufAt) PeekLine(requirecrlf bool) (buf []byte, crlf bool, err error) {
	return b.line(false, requirecrlf)
}
780
781func (b *bufAt) line(consume, requirecrlf bool) (buf []byte, crlf bool, err error) {
782 if err := b.ensure(); err != nil {
783 return nil, false, err
784 }
785 for i, c := range b.buf[:b.nbuf] {
786 if c == '\n' {
787 // Should have seen a \r, which should have been handled below.
788 return nil, false, errBareLF
789 }
790 if c != '\r' {
791 continue
792 }
793 i++
794 if i >= b.nbuf || b.buf[i] != '\n' {
795 if b.strict || Pedantic {
796 return nil, false, errBareCR
797 }
798 continue
799 }
800 b.scratch = b.scratch[:i+1]
801 copy(b.scratch, b.buf[:i+1])
802 if consume {
803 copy(b.buf, b.buf[i+1:])
804 b.offset += int64(i + 1)
805 b.nbuf -= i + 1
806 }
807 return b.scratch, true, nil
808 }
809 if b.nbuf >= b.maxLineLength() {
810 return nil, false, errLineTooLong
811 }
812 if requirecrlf {
813 return nil, false, errUnexpectedEOF
814 }
815 b.scratch = b.scratch[:b.nbuf]
816 copy(b.scratch, b.buf[:b.nbuf])
817 if consume {
818 b.offset += int64(b.nbuf)
819 b.nbuf = 0
820 }
821 return b.scratch, false, nil
822}
823
824// PeekByte returns the next unread byte, or an error.
825func (b *bufAt) PeekByte() (byte, error) {
826 if err := b.ensure(); err != nil {
827 return 0, err
828 }
829 if b.nbuf == 0 {
830 return 0, io.EOF
831 }
832 return b.buf[0], nil
833}
834
835// offsetReader reads from p.r starting from offset, and RawLineCount on p.
836// offsetReader validates lines end with \r\n.
837type offsetReader struct {
838 p *Part
839 offset int64
840 strict bool
841 prevlf bool
842 prevcr bool
843 linelength int
844}
845
846func (r *offsetReader) Read(buf []byte) (int, error) {
847 n, err := r.p.r.ReadAt(buf, r.offset)
848 if n > 0 {
849 r.offset += int64(n)
850 max := maxLineLength
851 if r.strict || Pedantic {
852 max = 1000
853 }
854
855 for _, c := range buf[:n] {
856 if r.prevlf {
857 r.p.RawLineCount++
858 }
859 if err == nil || err == io.EOF {
860 if c == '\n' && !r.prevcr {
861 err = errBareLF
862 } else if c != '\n' && r.prevcr && (r.strict || Pedantic) {
863 err = errBareCR
864 }
865 }
866 r.prevlf = c == '\n'
867 r.prevcr = c == '\r'
868 r.linelength++
869 if c == '\n' {
870 r.linelength = 0
871 } else if r.linelength > max && err == nil {
872 err = errLineTooLong
873 }
874 }
875 }
876 if err == io.EOF {
877 r.p.EndOffset = r.offset
878 }
879 return n, err
880}
881
// crlf is the line ending that boundReader holds back after each line.
var crlf = []byte("\r\n")

// boundReader is a reader that stops at a closing multipart boundary.
// boundReader ensures lines end with crlf through its use of bufAt.
// The boundary looked for is that of the parent of p.
type boundReader struct {
	p      *Part
	b      *bufAt
	buf    []byte // Data from previous line, to be served first.
	nbuf   int    // Number of valid bytes in buf.
	crlf   []byte // Possible crlf, to be returned if we do not yet encounter a boundary.
	prevlf bool   // If last char returned was a newline. For counting lines.
}
894
895func (b *boundReader) Read(buf []byte) (count int, rerr error) {
896 origBuf := buf
897 defer func() {
898 if count > 0 {
899 for _, c := range origBuf[:count] {
900 if b.prevlf {
901 b.p.RawLineCount++
902 }
903 b.prevlf = c == '\n'
904 }
905 }
906 }()
907
908 for {
909 // Read data from earlier line.
910 if b.nbuf > 0 {
911 n := b.nbuf
912 if n > len(buf) {
913 n = len(buf)
914 }
915 copy(buf, b.buf[:n])
916 copy(b.buf, b.buf[n:])
917 buf = buf[n:]
918 b.nbuf -= n
919 count += n
920 if b.nbuf > 0 {
921 break
922 }
923 }
924
925 // Look at next line. If it is a boundary, we are done and won't serve the crlf from the last line.
926 line, _, err := b.b.PeekLine(false)
927 if match, _ := checkBound(line, b.p.parent.bound); match {
928 b.p.EndOffset = b.b.offset - int64(len(b.crlf))
929 if b.p.parent.lastBoundOffset == b.p.BoundaryOffset {
930 b.p.parent.nextBoundOffset = b.b.offset
931 } else if enforceSequential {
932 panic("access not sequential")
933 }
934 return count, io.EOF
935 }
936 if err == io.EOF {
937 err = errMissingClosingBoundary
938 }
939 if err != nil && err != io.EOF {
940 return count, err
941 }
942 if len(b.crlf) > 0 {
943 n := len(b.crlf)
944 if n > len(buf) {
945 n = len(buf)
946 }
947 copy(buf, b.crlf[:n])
948 count += n
949 buf = buf[n:]
950 b.crlf = b.crlf[n:]
951 }
952 if len(buf) == 0 {
953 break
954 }
955 line, _, err = b.b.ReadLine(true)
956 if err != nil {
957 // Could be an unexpected end of the part.
958 return 0, err
959 }
960 b.crlf = crlf // crlf will be read next time, but not if a boundary follows.
961 n := len(line) - 2
962 line = line[:n]
963 if n > len(buf) {
964 n = len(buf)
965 }
966 copy(buf, line[:n])
967 count += n
968 buf = buf[n:]
969 line = line[n:]
970 if len(line) > 0 {
971 if b.buf == nil {
972 b.buf = make([]byte, b.b.maxLineLength())
973 }
974 copy(b.buf, line)
975 b.nbuf = len(line)
976 }
977 }
978 return count, nil
979}
980
// checkBound checks whether line is a boundary line for bound, which includes
// the leading "--". The first result is whether line starts with bound,
// followed by "--", whitespace, a line ending, or nothing at all. The second
// result is whether it is a closing boundary, i.e. followed by "--".
func checkBound(line, bound []byte) (bool, bool) {
	if !bytes.HasPrefix(line, bound) {
		return false, false
	}
	rest := line[len(bound):]
	switch {
	case len(rest) >= 2 && rest[0] == '-' && rest[1] == '-':
		return true, true
	case len(rest) == 0:
		return true, false
	case rest[0] == ' ' || rest[0] == '\t' || rest[0] == '\r' || rest[0] == '\n':
		return true, false
	}
	// Other data directly after the boundary, e.g. a longer boundary of a sub part.
	return false, false
}
999