1package message

// todo: allow more invalid content-type values, we now stop parsing on: empty media type (eg "content-type: ; name=..."), empty value for property (eg "charset="), missing quotes for characters that should be quoted (eg boundary containing "=" but without quotes), duplicate properties (two charsets), empty pairs (eg "text/html;;").

4// todo: should we be forgiving when closing boundary in multipart message is missing? seems like spam messages do this...

5// todo: should we allow base64 messages where a line starts with a space? and possibly more whitespace. is happening in messages. coreutils base64 accepts it, encoding/base64 does not.

6// todo: handle comments in headers?

7// todo: should we just always store messages with \n instead of \r\n? \r\n seems easier for use with imap.

8// todo: can use a cleanup

10import (

11 "bufio"

12 "bytes"

13 "encoding/base64"

14 "errors"

15 "fmt"

16 "io"

17 "log/slog"

18 "mime"

19 "mime/quotedprintable"

20 "net/mail"

21 "net/textproto"

22 "strings"

23 "time"

25 "golang.org/x/text/encoding/ianaindex"

27 "github.com/mjl-/mox/mlog"

28 "github.com/mjl-/mox/smtp"

29)

// Pedantic enables stricter parsing.
//
// When set, a malformed Content-Type header is reported as an error by newPart
// instead of being recovered from, and Walk no longer accepts an unexpectedly
// truncated embedded message in the third part of a multipart/report.
var Pedantic bool

var (
	// ErrBadContentType is wrapped in the error returned by newPart when the
	// Content-Type header cannot be parsed and parsing is strict or pedantic.
	ErrBadContentType = errors.New("bad content-type")
)

// Internal sentinel errors used while parsing. Errors without a comment are
// not produced by the code visible in this part of the file.
var (
	errNotMultipart           = errors.New("not a multipart message")                       // ParseNextPart called on a part without boundary.
	errFirstBoundCloses       = errors.New("first boundary cannot be finishing boundary")   // First boundary found after the preamble is a closing boundary.
	errLineTooLong            = errors.New("line too long")
	errMissingBoundaryParam   = errors.New("missing/empty boundary content-type parameter") // Multipart content-type without usable boundary parameter.
	errMissingClosingBoundary = errors.New("eof without closing boundary")
	errBareLF                 = errors.New("invalid bare line feed")
	errBareCR                 = errors.New("invalid bare carriage return")
	errUnexpectedEOF          = errors.New("unexpected eof") // Walk checks for this error to tolerate truncated messages in DSNs.
)

// If set, during tests, attempts to reparse a part will cause an error, because
// sequentially reading parts should not lead to reparsing. ParseNextPart panics
// when this is set and it would have to reparse the previous part to find the
// next boundary.
var enforceSequential bool

// Part represents a whole mail message, or a part of a multipart message. It
// is designed to handle IMAP requirements efficiently.
//
// The exported fields describe offsets, sizes and MIME metadata of the part.
// The unexported fields hold parsing state and the reader the part is backed by.
type Part struct {
	BoundaryOffset int64 // Offset in message where bound starts. -1 for top-level message.
	HeaderOffset   int64 // Offset in message file where header starts.
	BodyOffset     int64 // Offset in message file where body starts.
	EndOffset      int64 // Where body of part ends. Set when part is fully read.
	RawLineCount   int64 // Number of lines in raw, undecoded, body of part. Set when part is fully read.
	DecodedSize    int64 // Number of octets when decoded. If this is a text mediatype, lines ending only in LF are changed to end in CRLF and DecodedSize reflects that.

	MediaType               string            // From Content-Type, upper case. E.g. "TEXT". Can be empty because content-type may be absent. In this case, the part may be treated as TEXT/PLAIN.
	MediaSubType            string            // From Content-Type, upper case. E.g. "PLAIN".
	ContentTypeParams       map[string]string // E.g. holds "boundary" for multipart messages. Has lower-case keys, and original case values.
	ContentID               string            // Value of the Content-Id header, as found in the header.
	ContentDescription      string            // Value of the Content-Description header, as found in the header.
	ContentTransferEncoding string            // In upper case.
	Envelope                *Envelope         // Email message headers. Not for non-message parts.

	Parts []Part // Parts if this is a multipart.

	// Only for message/rfc822 and message/global. This part may have a buffer as
	// backing io.ReaderAt, because a message/global can have a non-identity
	// content-transfer-encoding. This part has a nil parent.
	Message *Part

	r               io.ReaderAt
	header          textproto.MIMEHeader // Parsed header.
	nextBoundOffset int64                // If >= 0, the offset where the next part header starts. We can set this when a user fully reads each part.
	lastBoundOffset int64                // Start of header of last/previous part. Used to skip a part if ParseNextPart is called and nextBoundOffset is -1.
	parent          *Part                // Parent part, for getting bound from, and setting nextBoundOffset when a part has finished reading. Only for subparts, not top-level parts.
	bound           []byte               // Only set if valid multipart with boundary, includes leading --, excludes \r\n.
	strict          bool                 // If set, valid crlf line endings are verified when reading body.
}

86// todo: have all Content* fields in Part?

87// todo: make Address contain a type Localpart and dns.Domain?

88// todo: if we ever make a major change and reparse all parts, switch to lower-case values if not too troublesome.

// Envelope holds the basic/common message headers as used in IMAP4.
//
// It is filled from the header of a top-level message by parseEnvelope. Address
// lists that are absent or cannot be parsed are left nil.
type Envelope struct {
	Date      time.Time // From Date header. Zero time if the header is missing or invalid.
	Subject   string    // Q/B-word-decoded.
	From      []Address
	Sender    []Address
	ReplyTo   []Address
	To        []Address
	CC        []Address
	BCC       []Address
	InReplyTo string // From In-Reply-To header, includes <>.
	MessageID string // From Message-Id header, includes <>.
}

103

// Address as used in From and To headers.
//
// User and Host are left empty when the address could not be parsed as an SMTP
// address; Name is still set in that case.
type Address struct {
	Name string // Free-form name for display in mail applications.
	User string // Localpart, encoded as string. Must be parsed before using as Localpart.
	Host string // Domain in ASCII.
}

110

111// Parse reads the headers of the mail message and returns a part.

112// A part provides access to decoded and raw contents of a message and its multiple parts.

113//

114// If strict is set, fewer attempts are made to continue parsing when errors are

115// encountered, such as with invalid content-type headers or bare carriage returns.

116func Parse(elog *slog.Logger, strict bool, r io.ReaderAt) (Part, error) {

117 log := mlog.New("message", elog)

118 return newPart(log, strict, r, 0, nil)

119}

120

121// EnsurePart parses a part as with Parse, but ensures a usable part is always

122// returned, even if error is non-nil. If a parse error occurs, the message is

123// returned as application/octet-stream, and headers can still be read if they

124// were valid.

125//

126// If strict is set, fewer attempts are made to continue parsing when errors are

127// encountered, such as with invalid content-type headers or bare carriage returns.

128func EnsurePart(elog *slog.Logger, strict bool, r io.ReaderAt, size int64) (Part, error) {

129 log := mlog.New("message", elog)

130 p, err := Parse(log.Logger, strict, r)

131 if err == nil {

132 err = p.Walk(log.Logger, nil)

133 }

134 if err != nil {

135 np, err2 := fallbackPart(p, r, size)

136 if err2 != nil {

137 err = err2

138 }

139 p = np

140 }

141 return p, err

142}

143

144func fallbackPart(p Part, r io.ReaderAt, size int64) (Part, error) {

145 np := Part{

146 HeaderOffset: p.HeaderOffset,

147 BodyOffset: p.BodyOffset,

148 EndOffset: size,

149 MediaType: "APPLICATION",

150 MediaSubType: "OCTET-STREAM",

151 ContentTypeParams: p.ContentTypeParams,

152 ContentID: p.ContentID,

153 ContentDescription: p.ContentDescription,

154 ContentTransferEncoding: p.ContentTransferEncoding,

155 Envelope: p.Envelope,

156 // We don't keep:

157 // - BoundaryOffset: irrelevant for top-level message.

158 // - RawLineCount and DecodedSize: set below.

159 // - Parts: we are not treating this as a multipart message.

160 }

161 np.SetReaderAt(r)

162 // By reading body, the number of lines and decoded size will be set.

163 _, err := io.Copy(io.Discard, np.Reader())

164 return np, err

165}

166

167// SetReaderAt sets r as reader for this part and all its sub parts, recursively.

168// No reader is set for any Message subpart, see SetMessageReaderAt.

169func (p *Part) SetReaderAt(r io.ReaderAt) {

170 if r == nil {

171 panic("nil reader")

172 }

173 p.r = r

174 for i := range p.Parts {

175 pp := &p.Parts[i]

176 pp.SetReaderAt(r)

177 }

178}

179

180// SetMessageReaderAt sets a reader on p.Message, which must be non-nil.

181func (p *Part) SetMessageReaderAt() error {

182 // todo: if p.Message does not contain any non-identity content-transfer-encoding, we should set an offsetReader of p.Message, recursively.

183 buf, err := io.ReadAll(p.Reader())

184 if err != nil {

185 return err

186 }

187 p.Message.SetReaderAt(bytes.NewReader(buf))

188 return nil

189}

190

191// Walk through message, decoding along the way, and collecting mime part offsets and sizes, and line counts.

192func (p *Part) Walk(elog *slog.Logger, parent *Part) error {

193 log := mlog.New("message", elog)

194

195 if len(p.bound) == 0 {

196 if p.MediaType == "MESSAGE" && (p.MediaSubType == "RFC822" || p.MediaSubType == "GLOBAL") {

197 // todo: don't read whole submessage in memory...

198 buf, err := io.ReadAll(p.Reader())

199 if err != nil {

200 return err

201 }

202 br := bytes.NewReader(buf)

203 mp, err := Parse(log.Logger, p.strict, br)

204 if err != nil {

205 return fmt.Errorf("parsing embedded message: %w", err)

206 }

207 if err := mp.Walk(log.Logger, nil); err != nil {

208 // If this is a DSN and we are not in pedantic mode, accept unexpected end of

209 // message. This is quite common because MTA's sometimes just truncate the original

210 // message in a place that makes the message invalid.

211 if errors.Is(err, errUnexpectedEOF) && !Pedantic && parent != nil && len(parent.Parts) >= 3 && p == &parent.Parts[2] && parent.MediaType == "MULTIPART" && parent.MediaSubType == "REPORT" {

212 mp, err = fallbackPart(mp, br, int64(len(buf)))

213 if err != nil {

214 return fmt.Errorf("parsing invalid embedded message: %w", err)

215 }

216 } else {

217 return fmt.Errorf("parsing parts of embedded message: %w", err)

218 }

219 }

220 // todo: if mp does not contain any non-identity content-transfer-encoding, we should set an offsetReader of p.r on mp, recursively.

221 p.Message = &mp

222 return nil

223 }

224 _, err := io.Copy(io.Discard, p.Reader())

225 return err

226 }

227

228 for {

229 pp, err := p.ParseNextPart(log.Logger)

230 if err == io.EOF {

231 return nil

232 }

233 if err != nil {

234 return err

235 }

236 if err := pp.Walk(log.Logger, p); err != nil {

237 return err

238 }

239 }

240}

241

242// String returns a debugging representation of the part.

243func (p *Part) String() string {

244 return fmt.Sprintf("&Part{%s/%s offsets %d/%d/%d/%d lines %d decodedsize %d next %d last %d bound %q parts %v}", p.MediaType, p.MediaSubType, p.BoundaryOffset, p.HeaderOffset, p.BodyOffset, p.EndOffset, p.RawLineCount, p.DecodedSize, p.nextBoundOffset, p.lastBoundOffset, p.bound, p.Parts)

245}

246

247// newPart parses a new part, which can be the top-level message.

248// offset is the bound offset for parts, and the start of message for top-level messages. parent indicates if this is a top-level message or sub-part.

249// If an error occurs, p's exported values can still be relevant. EnsurePart uses these values.

250func newPart(log mlog.Log, strict bool, r io.ReaderAt, offset int64, parent *Part) (p Part, rerr error) {

251 if r == nil {

252 panic("nil reader")

253 }

254 p = Part{

255 BoundaryOffset: -1,

256 EndOffset: -1,

257 r: r,

258 parent: parent,

259 strict: strict,

260 }

261

262 b := &bufAt{strict: strict, r: r, offset: offset}

263

264 if parent != nil {

265 p.BoundaryOffset = offset

266 if line, _, err := b.ReadLine(true); err != nil {

267 return p, err

268 } else if match, finish := checkBound(line, parent.bound); !match {

269 return p, fmt.Errorf("missing bound")

270 } else if finish {

271 return p, fmt.Errorf("new part for closing boundary")

272 }

273 }

274

275 // Collect header.

276 p.HeaderOffset = b.offset

277 p.BodyOffset = b.offset

278 hb := &bytes.Buffer{}

279 for {

280 line, _, err := b.ReadLine(true)

281 if err == io.EOF {

282 // No body is valid.

283 break

284 }

285 if err != nil {

286 return p, fmt.Errorf("reading header line: %w", err)

287 }

288 hb.Write(line)

289 if len(line) == 2 {

290 break // crlf

291 }

292 }

293 p.BodyOffset = b.offset

294

295 // Don't attempt to parse empty header, mail.ReadMessage doesn't like it.

296 if p.HeaderOffset == p.BodyOffset {

297 p.header = textproto.MIMEHeader{}

298 } else {

299 h, err := parseHeader(hb)

300 if err != nil {

301 return p, fmt.Errorf("parsing header: %w", err)

302 }

303 p.header = h

304 }

305

306 ct := p.header.Get("Content-Type")

307 mt, params, err := mime.ParseMediaType(ct)

308 if err != nil && ct != "" {

309 if Pedantic || strict {

310 return p, fmt.Errorf("%w: %s: %q", ErrBadContentType, err, ct)

311 }

312

313 // Try parsing just a content-type, ignoring parameters.

314 // ../rfc/2045:628

315 ct = strings.TrimSpace(strings.SplitN(ct, ";", 2)[0])

316 t := strings.SplitN(ct, "/", 2)

317 isToken := func(s string) bool {

318 const separators = `()<>@,;:\\"/[]?= ` // ../rfc/2045:663

319 for _, c := range s {

320 if c < 0x20 || c >= 0x80 || strings.ContainsRune(separators, c) {

321 return false

322 }

323 }

324 return len(s) > 0

325 }

326 // We cannot recover content-type of multipart, we won't have a boundary.

327 if len(t) == 2 && isToken(t[0]) && !strings.EqualFold(t[0], "multipart") && isToken(t[1]) {

328 p.MediaType = strings.ToUpper(t[0])

329 p.MediaSubType = strings.ToUpper(t[1])

330 } else {

331 p.MediaType = "APPLICATION"

332 p.MediaSubType = "OCTET-STREAM"

333 }

334 log.Debugx("malformed content-type, attempting to recover and continuing", err,

335 slog.String("contenttype", p.header.Get("Content-Type")),

336 slog.String("mediatype", p.MediaType),

337 slog.String("mediasubtype", p.MediaSubType))

338 } else if mt != "" {

339 t := strings.SplitN(strings.ToUpper(mt), "/", 2)

340 if len(t) != 2 {

341 if Pedantic || strict {

342 return p, fmt.Errorf("bad content-type: %q (content-type %q)", mt, ct)

343 }

344 log.Debug("malformed media-type, ignoring and continuing", slog.String("type", mt))

345 p.MediaType = "APPLICATION"

346 p.MediaSubType = "OCTET-STREAM"

347 } else {

348 p.MediaType = t[0]

349 p.MediaSubType = t[1]

350 p.ContentTypeParams = params

351 }

352 }

353

354 p.ContentID = p.header.Get("Content-Id")

355 p.ContentDescription = p.header.Get("Content-Description")

356 p.ContentTransferEncoding = strings.ToUpper(p.header.Get("Content-Transfer-Encoding"))

357

358 if parent == nil {

359 p.Envelope, err = parseEnvelope(log, mail.Header(p.header))

360 if err != nil {

361 return p, err

362 }

363 }

364

365 if p.MediaType == "MULTIPART" {

366 s := params["boundary"]

367 if s == "" {

368 return p, errMissingBoundaryParam

369 }

370 p.bound = append([]byte("--"), s...)

371

372 // Discard preamble, before first boundary.

373 for {

374 line, _, err := b.PeekLine(true)

375 if err != nil {

376 return p, fmt.Errorf("parsing line for part preamble: %w", err)

377 }

378 // Line only needs boundary prefix, not exact match. ../rfc/2046:1103

379 // Well, for compatibility, we require whitespace after the boundary. Because some

380 // software use the same boundary but with text appended for sub parts.

381 if match, finish := checkBound(line, p.bound); match {

382 if finish {

383 return p, errFirstBoundCloses

384 }

385 break

386 }

387 b.ReadLine(true)

388 }

389 p.nextBoundOffset = b.offset

390 p.lastBoundOffset = b.offset

391 }

392

393 return p, nil

394}

395

396// Header returns the parsed header of this part.

397func (p *Part) Header() (textproto.MIMEHeader, error) {

398 if p.header != nil {

399 return p.header, nil

400 }

401 if p.HeaderOffset == p.BodyOffset {

402 p.header = textproto.MIMEHeader{}

403 return p.header, nil

404 }

405 h, err := parseHeader(p.HeaderReader())

406 p.header = h

407 return h, err

408}

409

410// HeaderReader returns a reader for the header section of this part, including ending bare CRLF.

411func (p *Part) HeaderReader() io.Reader {

412 return io.NewSectionReader(p.r, p.HeaderOffset, p.BodyOffset-p.HeaderOffset)

413}

414

// parseHeader reads all of r and parses it as a message header. Only call this
// on non-empty input (even though empty input is a valid header).
//
// On failure to read or parse, a nil header and the error are returned.
func parseHeader(r io.Reader) (textproto.MIMEHeader, error) {
	// Parsing is done with mail.ReadMessage instead of textproto.ReadMIMEHeaders
	// because the first handles email messages properly, while the second only
	// works for HTTP headers.

	// todo: directly parse from reader r when Go 1.20 is no longer supported.
	raw, err := io.ReadAll(r)
	if err != nil {
		return nil, err
	}

	// Add the optional \r\n header/body separator when the header ends with a
	// complete line but the separator is absent. Without it, parsing with Go
	// <1.21 results in an EOF error.
	crlf := []byte("\r\n")
	endsWithLine := bytes.HasSuffix(raw, crlf)
	endsWithSeparator := bytes.HasSuffix(raw, []byte("\r\n\r\n"))
	if endsWithLine && !endsWithSeparator {
		raw = append(raw, crlf...)
	}

	msg, err := mail.ReadMessage(bytes.NewReader(raw))
	if err != nil {
		return nil, err
	}
	return textproto.MIMEHeader(msg.Header), nil
}

438

439var wordDecoder = mime.WordDecoder{

440 CharsetReader: func(charset string, r io.Reader) (io.Reader, error) {

441 switch strings.ToLower(charset) {

442 case "", "us-ascii", "utf-8":

443 return r, nil

444 }

445 enc, _ := ianaindex.MIME.Encoding(charset)

446 if enc == nil {

447 enc, _ = ianaindex.IANA.Encoding(charset)

448 }

449 if enc == nil {

450 return r, fmt.Errorf("unknown charset %q", charset)

451 }

452 return enc.NewDecoder().Reader(r), nil

453 },

454}

455

456func parseEnvelope(log mlog.Log, h mail.Header) (*Envelope, error) {

457 date, _ := h.Date()

458

459 // We currently marshal this field to JSON. But JSON cannot represent all

460 // time.Time. Time zone of 24:00 was seen in the wild. We won't try for extreme

461 // years, but we can readjust timezones.

462 // todo: remove this once we no longer store using json.

463 _, offset := date.Zone()

464 if date.Year() > 9999 {

465 date = time.Time{}

466 } else if offset <= -24*3600 || offset >= 24*3600 {

467 date = time.Unix(date.Unix(), 0).UTC()

468 }

469

470 subject := h.Get("Subject")

471 if s, err := wordDecoder.DecodeHeader(subject); err == nil {

472 subject = s

473 }

474

475 env := &Envelope{

476 date,

477 subject,

478 parseAddressList(log, h, "from"),

479 parseAddressList(log, h, "sender"),

480 parseAddressList(log, h, "reply-to"),

481 parseAddressList(log, h, "to"),

482 parseAddressList(log, h, "cc"),

483 parseAddressList(log, h, "bcc"),

484 h.Get("In-Reply-To"),

485 h.Get("Message-Id"),

486 }

487 return env, nil

488}

489

490func parseAddressList(log mlog.Log, h mail.Header, k string) []Address {

491 // todo: possibly work around ios mail generating incorrect q-encoded "phrases" with unencoded double quotes? ../rfc/2047:382

492 v := h.Get(k)

493 if v == "" {

494 return nil

495 }

496 parser := mail.AddressParser{WordDecoder: &wordDecoder}

497 l, err := parser.ParseList(v)

498 if err != nil {

499 return nil

500 }

501 var r []Address

502 for _, a := range l {

503 // todo: parse more fully according to ../rfc/5322:959

504 var user, host string

505 addr, err := smtp.ParseNetMailAddress(a.Address)

506 if err != nil {

507 log.Infox("parsing address (continuing)", err, slog.Any("netmailaddress", a.Address))

508 } else {

509 user = addr.Localpart.String()

510 host = addr.Domain.ASCII

511 }

512 r = append(r, Address{a.Name, user, host})

513 }

514 return r

515}

516

517// ParseNextPart parses the next (sub)part of this multipart message.

518// ParseNextPart returns io.EOF and a nil part when there are no more parts.

519// Only used for initial parsing of message. Once parsed, use p.Parts.

520func (p *Part) ParseNextPart(elog *slog.Logger) (*Part, error) {

521 log := mlog.New("message", elog)

522

523 if len(p.bound) == 0 {

524 return nil, errNotMultipart

525 }

526 if p.nextBoundOffset == -1 {

527 if enforceSequential {

528 panic("access not sequential")

529 }

530 // Set nextBoundOffset by fully reading the last part.

531 last, err := newPart(log, p.strict, p.r, p.lastBoundOffset, p)

532 if err != nil {

533 return nil, err

534 }

535 if _, err := io.Copy(io.Discard, last.RawReader()); err != nil {

536 return nil, err

537 }

538 if p.nextBoundOffset == -1 {

539 return nil, fmt.Errorf("internal error: reading part did not set nextBoundOffset")

540 }

541 }

542 b := &bufAt{strict: p.strict, r: p.r, offset: p.nextBoundOffset}

543 // todo: should we require a crlf on final closing bound? we don't require it because some message/rfc822 don't have a crlf after their closing boundary, so those messages don't end in crlf.

544 line, crlf, err := b.ReadLine(false)

545 if err != nil {

546 return nil, err

547 }

548 if match, finish := checkBound(line, p.bound); !match {

549 return nil, fmt.Errorf("expected bound, got %q", line)

550 } else if finish {

551 // Read any trailing data.

552 if p.parent != nil {

553 for {

554 line, _, err := b.PeekLine(false)

555 if err != nil {