1package message
2
// todo: allow more invalid content-type values, we now stop parsing on: empty media type (eg "content-type: ; name=..."), empty value for property (eg "charset="), missing quotes for characters that should be quoted (eg boundary containing "=" but without quotes), duplicate properties (two charsets), empty pairs (eg "text/html;;").
4// todo: should we be forgiving when closing boundary in multipart message is missing? seems like spam messages do this...
5// todo: should we allow base64 messages where a line starts with a space? and possibly more whitespace. is happening in messages. coreutils base64 accepts it, encoding/base64 does not.
6// todo: handle comments in headers?
7// todo: should we just always store messages with \n instead of \r\n? \r\n seems easier for use with imap.
8// todo: can use a cleanup
9
10import (
11 "bufio"
12 "bytes"
13 "encoding/base64"
14 "errors"
15 "fmt"
16 "io"
17 "log/slog"
18 "mime"
19 "mime/quotedprintable"
20 "net/mail"
21 "net/textproto"
22 "strings"
23 "time"
24
25 "golang.org/x/text/encoding/ianaindex"
26
27 "github.com/mjl-/mox/mlog"
28 "github.com/mjl-/mox/smtp"
29)
30
// Pedantic enables stricter parsing. In this file it is consulted together with
// the per-part strict flag: for rejecting malformed content-type headers, for
// rejecting bare carriage returns, and for applying the 1000 byte line length
// limit instead of the more lenient default.
var Pedantic bool

// Exported errors. They are returned wrapped, match them with errors.Is:
// ErrBadContentType for a content-type header that cannot be parsed in
// strict/pedantic mode, ErrHeader for malformed header lines.
var (
	ErrBadContentType = errors.New("bad content-type")
	ErrHeader         = errors.New("bad message header")
)

// Internal errors for problems with the structure of a message, such as
// missing or unexpected multipart boundaries and invalid line endings.
var (
	errNotMultipart           = errors.New("not a multipart message")
	errFirstBoundCloses       = errors.New("first boundary cannot be finishing boundary")
	errLineTooLong            = errors.New("line too long")
	errMissingBoundaryParam   = errors.New("missing/empty boundary content-type parameter")
	errMissingClosingBoundary = errors.New("eof without closing boundary")
	errBareLF                 = errors.New("invalid bare line feed")
	errBareCR                 = errors.New("invalid bare carriage return")
	errUnexpectedEOF          = errors.New("unexpected eof")
)

// If set, during tests, attempts to reparse a part will cause an error (a panic
// in the code below), because sequentially reading parts should not lead to
// reparsing.
var enforceSequential bool
52
// Part represents a whole mail message, or a part of a multipart message. It
// is designed to handle IMAP requirements efficiently.
//
// The offsets are positions in the underlying io.ReaderAt. EndOffset,
// RawLineCount and DecodedSize only become available after the body of the part
// has been read until EOF.
type Part struct {
	BoundaryOffset int64 // Offset in message where bound starts. -1 for top-level message.
	HeaderOffset   int64 // Offset in message file where header starts.
	BodyOffset     int64 // Offset in message file where body starts.
	EndOffset      int64 // Where body of part ends. Set when part is fully read. -1 for a newly parsed part that has not been fully read yet.
	RawLineCount   int64 // Number of lines in raw, undecoded, body of part. Set when part is fully read.
	DecodedSize    int64 // Number of octets when decoded. If this is a text mediatype, lines ending only in LF are changed to end in CRLF and DecodedSize reflects that.

	MediaType               string            // From Content-Type, upper case. E.g. "TEXT". Can be empty because content-type may be absent. In this case, the part may be treated as TEXT/PLAIN.
	MediaSubType            string            // From Content-Type, upper case. E.g. "PLAIN".
	ContentTypeParams       map[string]string // E.g. holds "boundary" for multipart messages. Has lower-case keys, and original case values.
	ContentID               string            // Value of the Content-Id header, not processed further.
	ContentDescription      string            // Value of the Content-Description header, not processed further.
	ContentTransferEncoding string            // In upper case.
	Envelope                *Envelope         // Email message headers. Not for non-message parts.

	Parts []Part // Parts if this is a multipart.

	// Only for message/rfc822 and message/global. This part may have a buffer as
	// backing io.ReaderAt, because a message/global can have a non-identity
	// content-transfer-encoding. This part has a nil parent.
	Message *Part

	r               io.ReaderAt          // Underlying message data. Set when parsing, or with SetReaderAt.
	header          textproto.MIMEHeader // Parsed header.
	nextBoundOffset int64                // If >= 0, the offset where the next part header starts. We can set this when a user fully reads each part.
	lastBoundOffset int64                // Start of header of last/previous part. Used to skip a part if ParseNextPart is called and nextBoundOffset is -1.
	parent          *Part                // Parent part, for getting bound from, and setting nextBoundOffset when a part has finished reading. Only for subparts, not top-level parts.
	bound           []byte               // Only set if valid multipart with boundary, includes leading --, excludes \r\n.
	strict          bool                 // If set, valid crlf line endings are verified when reading body.
}
86
87// todo: have all Content* fields in Part?
88// todo: make Address contain a type Localpart and dns.Domain?
89// todo: if we ever make a major change and reparse all parts, switch to lower-case values if not too troublesome.
90
// Envelope holds the basic/common message headers as used in IMAP4.
//
// The address lists are nil when the corresponding header is absent or could
// not be parsed as an address list.
type Envelope struct {
	Date      time.Time // From Date header. The zero time if the header could not be parsed.
	Subject   string    // Q/B-word-decoded. If decoding fails, the original header value.
	From      []Address // From the From header.
	Sender    []Address // From the Sender header.
	ReplyTo   []Address // From the Reply-To header.
	To        []Address // From the To header.
	CC        []Address // From the Cc header.
	BCC       []Address // From the Bcc header.
	InReplyTo string    // From In-Reply-To header, includes <>.
	MessageID string    // From Message-Id header, includes <>.
}
104
// Address as used in From and To headers.
//
// User and Host are left empty when the address could not be parsed into a
// localpart and domain; Name is still set in that case.
type Address struct {
	Name string // Free-form name for display in mail applications.
	User string // Localpart, encoded as string. Must be parsed before using as Localpart.
	Host string // Domain in ASCII.
}
111
112// Parse reads the headers of the mail message and returns a part.
113// A part provides access to decoded and raw contents of a message and its multiple parts.
114//
115// If strict is set, fewer attempts are made to continue parsing when errors are
116// encountered, such as with invalid content-type headers or bare carriage returns.
117func Parse(elog *slog.Logger, strict bool, r io.ReaderAt) (Part, error) {
118 log := mlog.New("message", elog)
119 return newPart(log, strict, r, 0, nil)
120}
121
122// EnsurePart parses a part as with Parse, but ensures a usable part is always
123// returned, even if error is non-nil. If a parse error occurs, the message is
124// returned as application/octet-stream, and headers can still be read if they
125// were valid.
126//
127// If strict is set, fewer attempts are made to continue parsing when errors are
128// encountered, such as with invalid content-type headers or bare carriage returns.
129func EnsurePart(elog *slog.Logger, strict bool, r io.ReaderAt, size int64) (Part, error) {
130 log := mlog.New("message", elog)
131 p, err := Parse(log.Logger, strict, r)
132 if err == nil {
133 err = p.Walk(log.Logger, nil)
134 }
135 if err != nil {
136 np, err2 := fallbackPart(p, r, size)
137 if err2 != nil {
138 err = err2
139 }
140 p = np
141 }
142 return p, err
143}
144
145func fallbackPart(p Part, r io.ReaderAt, size int64) (Part, error) {
146 np := Part{
147 HeaderOffset: p.HeaderOffset,
148 BodyOffset: p.BodyOffset,
149 EndOffset: size,
150 MediaType: "APPLICATION",
151 MediaSubType: "OCTET-STREAM",
152 ContentTypeParams: p.ContentTypeParams,
153 ContentID: p.ContentID,
154 ContentDescription: p.ContentDescription,
155 ContentTransferEncoding: p.ContentTransferEncoding,
156 Envelope: p.Envelope,
157 // We don't keep:
158 // - BoundaryOffset: irrelevant for top-level message.
159 // - RawLineCount and DecodedSize: set below.
160 // - Parts: we are not treating this as a multipart message.
161 }
162 np.SetReaderAt(r)
163 // By reading body, the number of lines and decoded size will be set.
164 _, err := io.Copy(io.Discard, np.Reader())
165 return np, err
166}
167
168// SetReaderAt sets r as reader for this part and all its sub parts, recursively.
169// No reader is set for any Message subpart, see SetMessageReaderAt.
170func (p *Part) SetReaderAt(r io.ReaderAt) {
171 if r == nil {
172 panic("nil reader")
173 }
174 p.r = r
175 for i := range p.Parts {
176 pp := &p.Parts[i]
177 pp.SetReaderAt(r)
178 }
179}
180
181// SetMessageReaderAt sets a reader on p.Message, which must be non-nil.
182func (p *Part) SetMessageReaderAt() error {
183 // todo: if p.Message does not contain any non-identity content-transfer-encoding, we should set an offsetReader of p.Message, recursively.
184 buf, err := io.ReadAll(p.Reader())
185 if err != nil {
186 return err
187 }
188 p.Message.SetReaderAt(bytes.NewReader(buf))
189 return nil
190}
191
192// Walk through message, decoding along the way, and collecting mime part offsets and sizes, and line counts.
193func (p *Part) Walk(elog *slog.Logger, parent *Part) error {
194 log := mlog.New("message", elog)
195
196 if len(p.bound) == 0 {
197 if p.MediaType == "MESSAGE" && (p.MediaSubType == "RFC822" || p.MediaSubType == "GLOBAL") {
198 // todo: don't read whole submessage in memory...
199 buf, err := io.ReadAll(p.Reader())
200 if err != nil {
201 return err
202 }
203 br := bytes.NewReader(buf)
204 mp, err := Parse(log.Logger, p.strict, br)
205 if err != nil {
206 return fmt.Errorf("parsing embedded message: %w", err)
207 }
208 if err := mp.Walk(log.Logger, nil); err != nil {
209 // If this is a DSN and we are not in pedantic mode, accept unexpected end of
210 // message. This is quite common because MTA's sometimes just truncate the original
211 // message in a place that makes the message invalid.
212 if errors.Is(err, errUnexpectedEOF) && !Pedantic && parent != nil && len(parent.Parts) >= 3 && p == &parent.Parts[2] && parent.MediaType == "MULTIPART" && parent.MediaSubType == "REPORT" {
213 mp, err = fallbackPart(mp, br, int64(len(buf)))
214 if err != nil {
215 return fmt.Errorf("parsing invalid embedded message: %w", err)
216 }
217 } else {
218 return fmt.Errorf("parsing parts of embedded message: %w", err)
219 }
220 }
221 // todo: if mp does not contain any non-identity content-transfer-encoding, we should set an offsetReader of p.r on mp, recursively.
222 p.Message = &mp
223 return nil
224 }
225 _, err := io.Copy(io.Discard, p.Reader())
226 return err
227 }
228
229 for {
230 pp, err := p.ParseNextPart(log.Logger)
231 if err == io.EOF {
232 return nil
233 }
234 if err != nil {
235 return err
236 }
237 if err := pp.Walk(log.Logger, p); err != nil {
238 return err
239 }
240 }
241}
242
243// String returns a debugging representation of the part.
244func (p *Part) String() string {
245 return fmt.Sprintf("&Part{%s/%s offsets %d/%d/%d/%d lines %d decodedsize %d next %d last %d bound %q parts %v}", p.MediaType, p.MediaSubType, p.BoundaryOffset, p.HeaderOffset, p.BodyOffset, p.EndOffset, p.RawLineCount, p.DecodedSize, p.nextBoundOffset, p.lastBoundOffset, p.bound, p.Parts)
246}
247
// newPart parses a new part, which can be the top-level message.
//
// offset is the bound offset for parts, and the start of message for top-level
// messages. parent indicates if this is a top-level message (nil) or sub-part
// (non-nil).
//
// The header is read and parsed, and the content-type, content-id,
// content-description and content-transfer-encoding are set on the part. For a
// top-level message the envelope is parsed as well. For a multipart, the
// preamble is skipped up to the first boundary, so ParseNextPart can be called.
// The body of the part is not read. newPart panics if r is nil.
//
// If an error occurs, p's exported values can still be relevant. EnsurePart uses these values.
func newPart(log mlog.Log, strict bool, r io.ReaderAt, offset int64, parent *Part) (p Part, rerr error) {
	if r == nil {
		panic("nil reader")
	}
	p = Part{
		BoundaryOffset: -1,
		EndOffset:      -1,
		r:              r,
		parent:         parent,
		strict:         strict,
	}

	b := &bufAt{strict: strict, r: r, offset: offset}

	// A sub part starts with the boundary line of its parent, which must not be the
	// closing boundary.
	if parent != nil {
		p.BoundaryOffset = offset
		if line, _, err := b.ReadLine(true); err != nil {
			return p, err
		} else if match, finish := checkBound(line, parent.bound); !match {
			return p, fmt.Errorf("missing bound")
		} else if finish {
			return p, fmt.Errorf("new part for closing boundary")
		}
	}

	// Collect header.
	p.HeaderOffset = b.offset
	p.BodyOffset = b.offset
	hb := &bytes.Buffer{}
	for {
		line, _, err := b.ReadLine(true)
		if err == io.EOF {
			// No body is valid.
			break
		}
		if err != nil {
			return p, fmt.Errorf("reading header line: %w", err)
		}
		hb.Write(line)
		if len(line) == 2 {
			break // crlf
		}
	}
	p.BodyOffset = b.offset

	// Don't attempt to parse empty header, mail.ReadMessage doesn't like it.
	if p.HeaderOffset == p.BodyOffset {
		p.header = textproto.MIMEHeader{}
	} else {
		h, err := parseHeader(hb)
		if err != nil {
			return p, fmt.Errorf("parsing header: %w", err)
		}
		p.header = h
	}

	// An absent content-type leaves the media type empty. A malformed one is an
	// error in strict/pedantic mode, otherwise we try to recover below.
	ct := p.header.Get("Content-Type")
	mt, params, err := mime.ParseMediaType(ct)
	if err != nil && ct != "" {
		if Pedantic || strict {
			return p, fmt.Errorf("%w: %s: %q", ErrBadContentType, err, ct)
		}

		// Try parsing just a content-type, ignoring parameters.
		// ../rfc/2045:628
		ct = strings.TrimSpace(strings.SplitN(ct, ";", 2)[0])
		t := strings.SplitN(ct, "/", 2)
		// isToken returns whether s is non-empty and consists only of printable ASCII
		// characters that are not separators.
		isToken := func(s string) bool {
			const separators = `()<>@,;:\\"/[]?= ` // ../rfc/2045:663
			for _, c := range s {
				if c < 0x20 || c >= 0x80 || strings.ContainsRune(separators, c) {
					return false
				}
			}
			return len(s) > 0
		}
		// We cannot recover content-type of multipart, we won't have a boundary.
		if len(t) == 2 && isToken(t[0]) && !strings.EqualFold(t[0], "multipart") && isToken(t[1]) {
			p.MediaType = strings.ToUpper(t[0])
			p.MediaSubType = strings.ToUpper(t[1])
		} else {
			p.MediaType = "APPLICATION"
			p.MediaSubType = "OCTET-STREAM"
		}
		log.Debugx("malformed content-type, attempting to recover and continuing", err,
			slog.String("contenttype", p.header.Get("Content-Type")),
			slog.String("mediatype", p.MediaType),
			slog.String("mediasubtype", p.MediaSubType))
	} else if mt != "" {
		t := strings.SplitN(strings.ToUpper(mt), "/", 2)
		if len(t) != 2 {
			if Pedantic || strict {
				return p, fmt.Errorf("bad content-type: %q (content-type %q)", mt, ct)
			}
			log.Debug("malformed media-type, ignoring and continuing", slog.String("type", mt))
			p.MediaType = "APPLICATION"
			p.MediaSubType = "OCTET-STREAM"
		} else {
			p.MediaType = t[0]
			p.MediaSubType = t[1]
			p.ContentTypeParams = params
		}
	}

	p.ContentID = p.header.Get("Content-Id")
	p.ContentDescription = p.header.Get("Content-Description")
	p.ContentTransferEncoding = strings.ToUpper(p.header.Get("Content-Transfer-Encoding"))

	// Only top-level messages get an envelope.
	if parent == nil {
		p.Envelope, err = parseEnvelope(log, mail.Header(p.header))
		if err != nil {
			return p, err
		}
	}

	if p.MediaType == "MULTIPART" {
		s := params["boundary"]
		if s == "" {
			return p, errMissingBoundaryParam
		}
		p.bound = append([]byte("--"), s...)

		// Discard preamble, before first boundary.
		for {
			line, _, err := b.PeekLine(true)
			if err != nil {
				return p, fmt.Errorf("parsing line for part preamble: %w", err)
			}
			// Line only needs boundary prefix, not exact match. ../rfc/2046:1103
			// Well, for compatibility, we require whitespace after the boundary. Because some
			// software use the same boundary but with text appended for sub parts.
			if match, finish := checkBound(line, p.bound); match {
				if finish {
					return p, errFirstBoundCloses
				}
				break
			}
			// Consume the line we just peeked at. The same line was already returned
			// without error by PeekLine above.
			b.ReadLine(true)
		}
		// The first boundary line is not consumed: the first sub part starts at it.
		p.nextBoundOffset = b.offset
		p.lastBoundOffset = b.offset
	}

	return p, nil
}
396
397// Header returns the parsed header of this part.
398//
399// Returns a ErrHeader for messages with invalid header syntax.
400func (p *Part) Header() (textproto.MIMEHeader, error) {
401 if p.header != nil {
402 return p.header, nil
403 }
404 if p.HeaderOffset == p.BodyOffset {
405 p.header = textproto.MIMEHeader{}
406 return p.header, nil
407 }
408 h, err := parseHeader(p.HeaderReader())
409 p.header = h
410 return h, err
411}
412
413// HeaderReader returns a reader for the header section of this part, including ending bare CRLF.
414func (p *Part) HeaderReader() io.Reader {
415 return io.NewSectionReader(p.r, p.HeaderOffset, p.BodyOffset-p.HeaderOffset)
416}
417
418// parse a header, only call this on non-empty input (even though that is a valid header).
419func parseHeader(r io.Reader) (textproto.MIMEHeader, error) {
420 // We read using mail.ReadMessage instead of textproto.ReadMIMEHeaders because the
421 // first handles email messages properly, while the second only works for HTTP
422 // headers.
423 var zero textproto.MIMEHeader
424
425 // We read the header and add the optional \r\n header/body separator. If the \r\n
426 // is missing, parsing with Go <1.21 results in an EOF error.
427 // todo: directly parse from reader r when Go 1.20 is no longer supported.
428 buf, err := io.ReadAll(r)
429 if err != nil {
430 return zero, err
431 }
432 if bytes.HasSuffix(buf, []byte("\r\n")) && !bytes.HasSuffix(buf, []byte("\r\n\r\n")) {
433 buf = append(buf, "\r\n"...)
434 }
435 msg, err := mail.ReadMessage(bytes.NewReader(buf))
436 if err != nil {
437 // Recognize parsing errors from net/mail.ReadMessage.
438 // todo: replace with own message parsing code that returns proper error types.
439 errstr := err.Error()
440 if strings.HasPrefix(errstr, "malformed initial line:") || strings.HasPrefix(errstr, "malformed header line:") {
441 err = fmt.Errorf("%w: %v", ErrHeader, err)
442 }
443 return zero, err
444 }
445 return textproto.MIMEHeader(msg.Header), nil
446}
447
448var wordDecoder = mime.WordDecoder{
449 CharsetReader: func(charset string, r io.Reader) (io.Reader, error) {
450 switch strings.ToLower(charset) {
451 case "", "us-ascii", "utf-8":
452 return r, nil
453 }
454 enc, _ := ianaindex.MIME.Encoding(charset)
455 if enc == nil {
456 enc, _ = ianaindex.IANA.Encoding(charset)
457 }
458 if enc == nil {
459 return r, fmt.Errorf("unknown charset %q", charset)
460 }
461 return enc.NewDecoder().Reader(r), nil
462 },
463}
464
465func parseEnvelope(log mlog.Log, h mail.Header) (*Envelope, error) {
466 date, _ := h.Date()
467
468 // We currently marshal this field to JSON. But JSON cannot represent all
469 // time.Time. Time zone of 24:00 was seen in the wild. We won't try for extreme
470 // years, but we can readjust timezones.
471 // todo: remove this once we no longer store using json.
472 _, offset := date.Zone()
473 if date.Year() > 9999 {
474 date = time.Time{}
475 } else if offset <= -24*3600 || offset >= 24*3600 {
476 date = time.Unix(date.Unix(), 0).UTC()
477 }
478
479 subject := h.Get("Subject")
480 if s, err := wordDecoder.DecodeHeader(subject); err == nil {
481 subject = s
482 }
483
484 env := &Envelope{
485 date,
486 subject,
487 parseAddressList(log, h, "from"),
488 parseAddressList(log, h, "sender"),
489 parseAddressList(log, h, "reply-to"),
490 parseAddressList(log, h, "to"),
491 parseAddressList(log, h, "cc"),
492 parseAddressList(log, h, "bcc"),
493 h.Get("In-Reply-To"),
494 h.Get("Message-Id"),
495 }
496 return env, nil
497}
498
499func parseAddressList(log mlog.Log, h mail.Header, k string) []Address {
500 // todo: possibly work around ios mail generating incorrect q-encoded "phrases" with unencoded double quotes? ../rfc/2047:382
501 v := h.Get(k)
502 if v == "" {
503 return nil
504 }
505 parser := mail.AddressParser{WordDecoder: &wordDecoder}
506 l, err := parser.ParseList(v)
507 if err != nil {
508 return nil
509 }
510 var r []Address
511 for _, a := range l {
512 // todo: parse more fully according to ../rfc/5322:959
513 var user, host string
514 addr, err := smtp.ParseNetMailAddress(a.Address)
515 if err != nil {
516 log.Infox("parsing address (continuing)", err, slog.Any("netmailaddress", a.Address))
517 } else {
518 user = addr.Localpart.String()
519 host = addr.Domain.ASCII
520 }
521 r = append(r, Address{a.Name, user, host})
522 }
523 return r
524}
525
// ParseNextPart parses the next (sub)part of this multipart message.
// ParseNextPart returns io.EOF and a nil part when there are no more parts.
// Only used for initial parsing of message. Once parsed, use p.Parts.
//
// If p is not a multipart with a boundary, errNotMultipart is returned. If the
// previous part was not fully read, it is parsed and read again here to find
// where the next boundary starts.
//
// The returned part is a pointer to the new last element of p.Parts.
// NOTE(review): a later call appends to p.Parts, which can reallocate the slice,
// so earlier returned pointers may then no longer point into p.Parts — verify
// callers finish with a part before asking for the next one.
func (p *Part) ParseNextPart(elog *slog.Logger) (*Part, error) {
	log := mlog.New("message", elog)

	if len(p.bound) == 0 {
		return nil, errNotMultipart
	}
	if p.nextBoundOffset == -1 {
		if enforceSequential {
			panic("access not sequential")
		}
		// Set nextBoundOffset by fully reading the last part.
		last, err := newPart(log, p.strict, p.r, p.lastBoundOffset, p)
		if err != nil {
			return nil, err
		}
		if _, err := io.Copy(io.Discard, last.RawReader()); err != nil {
			return nil, err
		}
		if p.nextBoundOffset == -1 {
			return nil, fmt.Errorf("internal error: reading part did not set nextBoundOffset")
		}
	}
	b := &bufAt{strict: p.strict, r: p.r, offset: p.nextBoundOffset}
	// todo: should we require a crlf on final closing bound? we don't require it because some message/rfc822 don't have a crlf after their closing boundary, so those messages don't end in crlf.
	line, crlf, err := b.ReadLine(false)
	if err != nil {
		return nil, err
	}
	if match, finish := checkBound(line, p.bound); !match {
		return nil, fmt.Errorf("expected bound, got %q", line)
	} else if finish {
		// Read any trailing data.
		if p.parent != nil {
			// Skip lines until the next boundary of the parent. We also stop at the first
			// line that cannot be read, e.g. at the end of the data.
			for {
				line, _, err := b.PeekLine(false)
				if err != nil {
					break
				}
				if match, _ := checkBound(line, p.parent.bound); match {
					break
				}
				b.ReadLine(false)
			}
			// Only tell the parent where its next part starts if we are the part the
			// parent parsed last.
			if p.parent.lastBoundOffset == p.BoundaryOffset {
				p.parent.nextBoundOffset = b.offset
			}
		}
		p.EndOffset = b.offset
		return nil, io.EOF
	} else if !crlf {
		return nil, fmt.Errorf("non-finishing bound without crlf: %w", errUnexpectedEOF)
	}
	// Remember where this part starts. The offset of the part after it is only known
	// once this part has been fully read.
	boundOffset := p.nextBoundOffset
	p.lastBoundOffset = boundOffset
	p.nextBoundOffset = -1
	np, err := newPart(log, p.strict, p.r, boundOffset, p)
	if err != nil {
		return nil, err
	}
	p.Parts = append(p.Parts, np)
	return &p.Parts[len(p.Parts)-1], nil
}
591
592// IsDSN returns whether the MIME structure of the part is a DSN.
593func (p *Part) IsDSN() bool {
594 return p.MediaType == "MULTIPART" &&
595 p.MediaSubType == "REPORT" &&
596 len(p.Parts) >= 2 &&
597 p.Parts[1].MediaType == "MESSAGE" &&
598 (p.Parts[1].MediaSubType == "DELIVERY-STATUS" || p.Parts[1].MediaSubType == "GLOBAL-DELIVERY-STATUS")
599}
600
601// Reader returns a reader for the decoded body content.
602func (p *Part) Reader() io.Reader {
603 return p.bodyReader(p.RawReader())
604}
605
606// ReaderUTF8OrBinary returns a reader for the decoded body content, transformed to
607// utf-8 for known mime/iana encodings (only if they aren't us-ascii or utf-8
608// already). For unknown or missing character sets/encodings, the original reader
609// is returned.
610func (p *Part) ReaderUTF8OrBinary() io.Reader {
611 return DecodeReader(p.ContentTypeParams["charset"], p.Reader())
612}
613
614func (p *Part) bodyReader(r io.Reader) io.Reader {
615 r = newDecoder(p.ContentTransferEncoding, r)
616 if p.MediaType == "TEXT" {
617 return &textReader{p, bufio.NewReader(r), 0, false}
618 }
619 return &countReader{p, r, 0}
620}
621
622// countReader is an io.Reader that passes Reads to the underlying reader.
623// when eof is read, it sets p.DecodedSize to the number of bytes returned.
624type countReader struct {
625 p *Part
626 r io.Reader
627 count int64
628}
629
630func (cr *countReader) Read(buf []byte) (int, error) {
631 n, err := cr.r.Read(buf)
632 if n >= 0 {
633 cr.count += int64(n)
634 }
635 if err == io.EOF {
636 cr.p.DecodedSize = cr.count
637 }
638 return n, err
639}
640
641// textReader is an io.Reader that ensures all lines return end in CRLF.
642// when eof is read from the underlying reader, it sets p.DecodedSize.
643type textReader struct {
644 p *Part
645 r *bufio.Reader
646 count int64
647 prevcr bool // If previous byte returned was a CR.
648}
649
650func (tr *textReader) Read(buf []byte) (int, error) {
651 o := 0
652 for o < len(buf) {
653 c, err := tr.r.ReadByte()
654 if err != nil {
655 tr.count += int64(o)
656 tr.p.DecodedSize = tr.count
657 return o, err
658 }
659 if c == '\n' && !tr.prevcr {
660 buf[o] = '\r'
661 o++
662 tr.prevcr = true
663 tr.r.UnreadByte()
664 continue
665 }
666 buf[o] = c
667 tr.prevcr = c == '\r'
668 o++
669 }
670 tr.count += int64(o)
671 return o, nil
672}
673
// newDecoder returns a reader that decodes r according to
// content-transfer-encoding cte, which must be in upper case. For encodings
// other than base64 and quoted-printable, r itself is returned.
func newDecoder(cte string, r io.Reader) io.Reader {
	// ../rfc/2045:775
	if cte == "BASE64" {
		return base64.NewDecoder(base64.StdEncoding, r)
	}
	if cte == "QUOTED-PRINTABLE" {
		return quotedprintable.NewReader(r)
	}
	return r
}
684
685// RawReader returns a reader for the raw, undecoded body content. E.g. with
686// quoted-printable or base64 content intact.
687// Fully reading a part helps its parent part find its next part efficiently.
688func (p *Part) RawReader() io.Reader {
689 if p.r == nil {
690 panic("missing reader")
691 }
692 if p.EndOffset >= 0 {
693 return &crlfReader{strict: p.strict, r: io.NewSectionReader(p.r, p.BodyOffset, p.EndOffset-p.BodyOffset)}
694 }
695 p.RawLineCount = 0
696 if p.parent == nil {
697 return &offsetReader{p, p.BodyOffset, p.strict, true, false, 0}
698 }
699 return &boundReader{p: p, b: &bufAt{strict: p.strict, r: p.r, offset: p.BodyOffset}, prevlf: true}
700}
701
702// crlfReader verifies there are no bare newlines and optionally no bare carriage returns.
703type crlfReader struct {
704 r io.Reader
705 strict bool
706 prevcr bool
707}
708
709func (r *crlfReader) Read(buf []byte) (int, error) {
710 n, err := r.r.Read(buf)
711 if err == nil || err == io.EOF {
712 for _, b := range buf[:n] {
713 if b == '\n' && !r.prevcr {
714 err = errBareLF
715 break
716 } else if b != '\n' && r.prevcr && (r.strict || Pedantic) {
717 err = errBareCR
718 break
719 }
720 r.prevcr = b == '\r'
721 }
722 }
723 return n, err
724}
725
// bufAt is a buffered reader on an underlying ReaderAt.
// bufAt verifies that lines end with crlf.
//
// The buffers are allocated on first use, with room for a single line of the
// maximum length.
type bufAt struct {
	offset int64 // Offset in r currently consumed, i.e. not including any buffered data.

	strict  bool        // If set, bare carriage returns are an error and the 1000 byte line length limit applies. Pedantic has the same effect.
	r       io.ReaderAt // Underlying data.
	buf     []byte      // Buffered data.
	nbuf    int         // Valid bytes in buf.
	scratch []byte      // Holds the copy of the line returned by ReadLine and PeekLine. Overwritten by the next call.
}
737
738// Messages should not have lines longer than 78+2 bytes, and must not have
739// lines longer than 998+2 bytes. But in practice they have longer lines. We
740// have a higher limit, but for when parsing with strict we check for the 1000
741// bytes limit.
742// ../rfc/5321:3512
743const maxLineLength = 8 * 1024
744
745func (b *bufAt) maxLineLength() int {
746 if b.strict || Pedantic {
747 return 1000
748 }
749 return maxLineLength
750}
751
752// ensure makes sure b.nbuf is up to maxLineLength, unless eof is encountered.
753func (b *bufAt) ensure() error {
754 for _, c := range b.buf[:b.nbuf] {
755 if c == '\n' {
756 return nil
757 }
758 }
759 if b.scratch == nil {
760 b.scratch = make([]byte, b.maxLineLength())
761 }
762 if b.buf == nil {
763 b.buf = make([]byte, b.maxLineLength())
764 }
765 for b.nbuf < b.maxLineLength() {
766 n, err := b.r.ReadAt(b.buf[b.nbuf:], b.offset+int64(b.nbuf))
767 if n > 0 {
768 b.nbuf += n
769 }
770 if err != nil && err != io.EOF || err == io.EOF && b.nbuf+n == 0 {
771 return err
772 }
773 if n == 0 || err == io.EOF {
774 break
775 }
776 }
777 return nil
778}
779
780// ReadLine reads a line until \r\n is found, returning the line including \r\n.
781// If not found, or a bare \n is encountered, or a bare \r is enountered in pedantic mode, ReadLine returns an error.
782func (b *bufAt) ReadLine(requirecrlf bool) (buf []byte, crlf bool, err error) {
783 return b.line(true, requirecrlf)
784}
785
786func (b *bufAt) PeekLine(requirecrlf bool) (buf []byte, crlf bool, err error) {
787 return b.line(false, requirecrlf)
788}
789
// line returns the next line from the buffer, including its ending \r\n, and
// whether the line has that ending. If consume is set, the line is removed from
// the buffer and the offset is advanced past it.
//
// The returned buffer is b.scratch, it is overwritten by the next call.
//
// Errors from filling the buffer are returned as is, including io.EOF when
// there is no more data. errBareLF is returned for a \n that is not preceded by
// \r. errBareCR is returned for a \r that is not followed by \n, only in strict
// or pedantic mode. errLineTooLong is returned if the buffer is filled up to the
// maximum line length without holding a full line. If the remaining data does
// not end with \r\n, errUnexpectedEOF is returned if requirecrlf is set, and the
// remaining data with crlf set to false otherwise.
func (b *bufAt) line(consume, requirecrlf bool) (buf []byte, crlf bool, err error) {
	if err := b.ensure(); err != nil {
		return nil, false, err
	}
	for i, c := range b.buf[:b.nbuf] {
		if c == '\n' {
			// Should have seen a \r, which should have been handled below.
			return nil, false, errBareLF
		}
		if c != '\r' {
			continue
		}
		// Look at the byte after the \r. Changing i does not influence the loop, the
		// next iteration continues with the byte after the \r.
		i++
		if i >= b.nbuf || b.buf[i] != '\n' {
			if b.strict || Pedantic {
				return nil, false, errBareCR
			}
			continue
		}
		// Found \r\n, i is the index of the \n. Return a copy, so consuming the line
		// from the buffer does not change the data we return.
		b.scratch = b.scratch[:i+1]
		copy(b.scratch, b.buf[:i+1])
		if consume {
			copy(b.buf, b.buf[i+1:])
			b.offset += int64(i + 1)
			b.nbuf -= i + 1
		}
		return b.scratch, true, nil
	}
	// No line ending in the buffered data.
	if b.nbuf >= b.maxLineLength() {
		return nil, false, errLineTooLong
	}
	if requirecrlf {
		return nil, false, errUnexpectedEOF
	}
	// Return all remaining data as a final line without crlf.
	b.scratch = b.scratch[:b.nbuf]
	copy(b.scratch, b.buf[:b.nbuf])
	if consume {
		b.offset += int64(b.nbuf)
		b.nbuf = 0
	}
	return b.scratch, false, nil
}
832
833// PeekByte returns the next unread byte, or an error.
834func (b *bufAt) PeekByte() (byte, error) {
835 if err := b.ensure(); err != nil {
836 return 0, err
837 }
838 if b.nbuf == 0 {
839 return 0, io.EOF
840 }
841 return b.buf[0], nil
842}
843
844// offsetReader reads from p.r starting from offset, and RawLineCount on p.
845// offsetReader validates lines end with \r\n.
846type offsetReader struct {
847 p *Part
848 offset int64
849 strict bool
850 prevlf bool
851 prevcr bool
852 linelength int
853}
854
855func (r *offsetReader) Read(buf []byte) (int, error) {
856 n, err := r.p.r.ReadAt(buf, r.offset)
857 if n > 0 {
858 r.offset += int64(n)
859 max := maxLineLength
860 if r.strict || Pedantic {
861 max = 1000
862 }
863
864 for _, c := range buf[:n] {
865 if r.prevlf {
866 r.p.RawLineCount++
867 }
868 if err == nil || err == io.EOF {
869 if c == '\n' && !r.prevcr {
870 err = errBareLF
871 } else if c != '\n' && r.prevcr && (r.strict || Pedantic) {
872 err = errBareCR
873 }
874 }
875 r.prevlf = c == '\n'
876 r.prevcr = c == '\r'
877 r.linelength++
878 if c == '\n' {
879 r.linelength = 0
880 } else if r.linelength > max && err == nil {
881 err = errLineTooLong
882 }
883 }
884 }
885 if err == io.EOF {
886 r.p.EndOffset = r.offset
887 }
888 return n, err
889}
890
891var crlf = []byte("\r\n")
892
893// boundReader is a reader that stops at a closing multipart boundary.
894// boundReader ensures lines end with crlf through its use of bufAt.
895type boundReader struct {
896 p *Part
897 b *bufAt
898 buf []byte // Data from previous line, to be served first.
899 nbuf int // Number of valid bytes in buf.
900 crlf []byte // Possible crlf, to be returned if we do not yet encounter a boundary.
901 prevlf bool // If last char returned was a newline. For counting lines.
902}
903
904func (b *boundReader) Read(buf []byte) (count int, rerr error) {
905 origBuf := buf
906 defer func() {
907 if count > 0 {
908 for _, c := range origBuf[:count] {
909 if b.prevlf {
910 b.p.RawLineCount++
911 }
912 b.prevlf = c == '\n'
913 }
914 }
915 }()
916
917 for {
918 // Read data from earlier line.
919 if b.nbuf > 0 {
920 n := b.nbuf
921 if n > len(buf) {
922 n = len(buf)
923 }
924 copy(buf, b.buf[:n])
925 copy(b.buf, b.buf[n:])
926 buf = buf[n:]
927 b.nbuf -= n
928 count += n
929 if b.nbuf > 0 {
930 break
931 }
932 }
933
934 // Look at next line. If it is a boundary, we are done and won't serve the crlf from the last line.
935 line, _, err := b.b.PeekLine(false)
936 if match, _ := checkBound(line, b.p.parent.bound); match {
937 b.p.EndOffset = b.b.offset - int64(len(b.crlf))
938 if b.p.parent.lastBoundOffset == b.p.BoundaryOffset {
939 b.p.parent.nextBoundOffset = b.b.offset
940 } else if enforceSequential {
941 panic("access not sequential")
942 }
943 return count, io.EOF
944 }
945 if err == io.EOF {
946 err = errMissingClosingBoundary
947 }
948 if err != nil && err != io.EOF {
949 return count, err
950 }
951 if len(b.crlf) > 0 {
952 n := len(b.crlf)
953 if n > len(buf) {
954 n = len(buf)
955 }
956 copy(buf, b.crlf[:n])
957 count += n
958 buf = buf[n:]
959 b.crlf = b.crlf[n:]
960 }
961 if len(buf) == 0 {
962 break
963 }
964 line, _, err = b.b.ReadLine(true)
965 if err != nil {
966 // Could be an unexpected end of the part.
967 return 0, err
968 }
969 b.crlf = crlf // crlf will be read next time, but not if a boundary follows.
970 n := len(line) - 2
971 line = line[:n]
972 if n > len(buf) {
973 n = len(buf)
974 }
975 copy(buf, line[:n])
976 count += n
977 buf = buf[n:]
978 line = line[n:]
979 if len(line) > 0 {
980 if b.buf == nil {
981 b.buf = make([]byte, b.b.maxLineLength())
982 }
983 copy(b.buf, line)
984 b.nbuf = len(line)
985 }
986 }
987 return count, nil
988}
989
// checkBound returns whether line starts with boundary bound, and if so whether
// it is a finishing boundary, i.e. has "--" directly after the boundary.
//
// For a non-finishing boundary, the boundary must be followed by whitespace or
// the end of the line. Other lines that merely start with the boundary are not a
// match.
func checkBound(line, bound []byte) (bool, bool) {
	rest, ok := bytes.CutPrefix(line, bound)
	if !ok {
		return false, false
	}
	switch {
	case bytes.HasPrefix(rest, []byte("--")):
		return true, true
	case len(rest) == 0:
		return true, false
	}
	switch rest[0] {
	case ' ', '\t', '\r', '\n':
		return true, false
	}
	return false, false
}
1008