1package message

// todo: allow more invalid content-type values, we now stop parsing on: empty media type (eg "content-type: ; name=..."), empty value for property (eg "charset="), missing quotes for characters that should be quoted (eg boundary containing "=" but without quotes), duplicate properties (two charsets), empty pairs (eg "text/html;;").

4// todo: should we be forgiving when closing boundary in multipart message is missing? seems like spam messages do this...

5// todo: should we allow base64 messages where a line starts with a space? and possibly more whitespace. is happening in messages. coreutils base64 accepts it, encoding/base64 does not.

6// todo: handle comments in headers?

7// todo: should we just always store messages with \n instead of \r\n? \r\n seems easier for use with imap.

8// todo: can use a cleanup

10import (

11 "bufio"

12 "bytes"

13 "encoding/base64"

14 "errors"

15 "fmt"

16 "io"

17 "log/slog"

18 "mime"

19 "mime/quotedprintable"

20 "net/mail"

21 "net/textproto"

22 "strings"

23 "time"

25 "golang.org/x/text/encoding/ianaindex"

27 "github.com/mjl-/mox/mlog"

28 "github.com/mjl-/mox/smtp"

29)

// Switches that influence how messages are parsed.
var (
	// Pedantic enables stricter parsing.
	Pedantic = false

	// enforceSequential can be set during tests. If set, attempts to reparse a part
	// will cause an error, because sequentially reading parts should not lead to
	// reparsing.
	enforceSequential = false
)

// Errors that callers can recognize with errors.Is.
var (
	// ErrBadContentType is wrapped in errors about an unusable Content-Type header.
	ErrBadContentType = errors.New("bad content-type")

	// ErrHeader is wrapped in errors about invalid message header syntax.
	ErrHeader = errors.New("bad message header")
)

// Errors used only inside this package.
var (
	errNotMultipart = errors.New("not a multipart message")

	errFirstBoundCloses = errors.New("first boundary cannot be finishing boundary")

	errLineTooLong = errors.New("line too long")

	errMissingBoundaryParam = errors.New("missing/empty boundary content-type parameter")

	errMissingClosingBoundary = errors.New("eof without closing boundary")

	errBareLF = errors.New("invalid bare line feed")

	errBareCR = errors.New("invalid bare carriage return")

	errUnexpectedEOF = errors.New("unexpected eof")
)

53// Part represents a whole mail message, or a part of a multipart message. It

54// is designed to handle IMAP requirements efficiently.

55type Part struct {

56 BoundaryOffset int64 // Offset in message where bound starts. -1 for top-level message.

57 HeaderOffset int64 // Offset in message file where header starts.

58 BodyOffset int64 // Offset in message file where body starts.

59 EndOffset int64 // Where body of part ends. Set when part is fully read.

60 RawLineCount int64 // Number of lines in raw, undecoded, body of part. Set when part is fully read.

61 DecodedSize int64 // Number of octets when decoded. If this is a text mediatype, lines ending only in LF are changed end in CRLF and DecodedSize reflects that.

63 MediaType string // From Content-Type, upper case. E.g. "TEXT". Can be empty because content-type may be absent. In this case, the part may be treated as TEXT/PLAIN.

64 MediaSubType string // From Content-Type, upper case. E.g. "PLAIN".

65 ContentTypeParams map[string]string // E.g. holds "boundary" for multipart messages. Has lower-case keys, and original case values.

66 ContentID string

67 ContentDescription string

68 ContentTransferEncoding string // In upper case.

69 Envelope *Envelope // Email message headers. Not for non-message parts.

71 Parts []Part // Parts if this is a multipart.

73 // Only for message/rfc822 and message/global. This part may have a buffer as

74 // backing io.ReaderAt, because a message/global can have a non-identity

75 // content-transfer-encoding. This part has a nil parent.

76 Message *Part

78 r io.ReaderAt

79 header textproto.MIMEHeader // Parsed header.

80 nextBoundOffset int64 // If >= 0, the offset where the next part header starts. We can set this when a user fully reads each part.

81 lastBoundOffset int64 // Start of header of last/previous part. Used to skip a part if ParseNextPart is called and nextBoundOffset is -1.

82 parent *Part // Parent part, for getting bound from, and setting nextBoundOffset when a part has finished reading. Only for subparts, not top-level parts.

83 bound []byte // Only set if valid multipart with boundary, includes leading --, excludes \r\n.

84 strict bool // If set, valid crlf line endings are verified when reading body.

85}

87// todo: have all Content* fields in Part?

88// todo: make Address contain a type Localpart and dns.Domain?

89// todo: if we ever make a major change and reparse all parts, switch to lower-case values if not too troublesome.

91// Envelope holds the basic/common message headers as used in IMAP4.

92type Envelope struct {

93 Date time.Time

94 Subject string // Q/B-word-decoded.

95 From []Address

96 Sender []Address

97 ReplyTo []Address

98 To []Address

99 CC []Address

100 BCC []Address

101 InReplyTo string // From In-Reply-To header, includes <>.

102 MessageID string // From Message-Id header, includes <>.

103}

104

// Address as used in From and To headers.
type Address struct {
	// Free-form name for display in mail applications.
	Name string
	// Localpart, encoded as string. Must be parsed before using as Localpart.
	User string
	// Domain in ASCII.
	Host string
}

111

112// Parse reads the headers of the mail message and returns a part.

113// A part provides access to decoded and raw contents of a message and its multiple parts.

114//

115// If strict is set, fewer attempts are made to continue parsing when errors are

116// encountered, such as with invalid content-type headers or bare carriage returns.

117func Parse(elog *slog.Logger, strict bool, r io.ReaderAt) (Part, error) {

118 log := mlog.New("message", elog)

119 return newPart(log, strict, r, 0, nil)

120}

121

122// EnsurePart parses a part as with Parse, but ensures a usable part is always

123// returned, even if error is non-nil. If a parse error occurs, the message is

124// returned as application/octet-stream, and headers can still be read if they

125// were valid.

126//

127// If strict is set, fewer attempts are made to continue parsing when errors are

128// encountered, such as with invalid content-type headers or bare carriage returns.

129func EnsurePart(elog *slog.Logger, strict bool, r io.ReaderAt, size int64) (Part, error) {

130 log := mlog.New("message", elog)

131 p, err := Parse(log.Logger, strict, r)

132 if err == nil {

133 err = p.Walk(log.Logger, nil)

134 }

135 if err != nil {

136 np, err2 := fallbackPart(p, r, size)

137 if err2 != nil {

138 err = err2

139 }

140 p = np

141 }

142 return p, err

143}

144

145func fallbackPart(p Part, r io.ReaderAt, size int64) (Part, error) {

146 np := Part{

147 HeaderOffset: p.HeaderOffset,

148 BodyOffset: p.BodyOffset,

149 EndOffset: size,

150 MediaType: "APPLICATION",

151 MediaSubType: "OCTET-STREAM",

152 ContentTypeParams: p.ContentTypeParams,

153 ContentID: p.ContentID,

154 ContentDescription: p.ContentDescription,

155 ContentTransferEncoding: p.ContentTransferEncoding,

156 Envelope: p.Envelope,

157 // We don't keep:

158 // - BoundaryOffset: irrelevant for top-level message.

159 // - RawLineCount and DecodedSize: set below.

160 // - Parts: we are not treating this as a multipart message.

161 }

162 np.SetReaderAt(r)

163 // By reading body, the number of lines and decoded size will be set.

164 _, err := io.Copy(io.Discard, np.Reader())

165 return np, err

166}

167

168// SetReaderAt sets r as reader for this part and all its sub parts, recursively.

169// No reader is set for any Message subpart, see SetMessageReaderAt.

170func (p *Part) SetReaderAt(r io.ReaderAt) {

171 if r == nil {

172 panic("nil reader")

173 }

174 p.r = r

175 for i := range p.Parts {

176 pp := &p.Parts[i]

177 pp.SetReaderAt(r)

178 }

179}

180

181// SetMessageReaderAt sets a reader on p.Message, which must be non-nil.

182func (p *Part) SetMessageReaderAt() error {

183 // todo: if p.Message does not contain any non-identity content-transfer-encoding, we should set an offsetReader of p.Message, recursively.

184 buf, err := io.ReadAll(p.Reader())

185 if err != nil {

186 return err

187 }

188 p.Message.SetReaderAt(bytes.NewReader(buf))

189 return nil

190}

191

192// Walk through message, decoding along the way, and collecting mime part offsets and sizes, and line counts.

193func (p *Part) Walk(elog *slog.Logger, parent *Part) error {

194 log := mlog.New("message", elog)

195

196 if len(p.bound) == 0 {

197 if p.MediaType == "MESSAGE" && (p.MediaSubType == "RFC822" || p.MediaSubType == "GLOBAL") {

198 // todo: don't read whole submessage in memory...

199 buf, err := io.ReadAll(p.Reader())

200 if err != nil {

201 return err

202 }

203 br := bytes.NewReader(buf)

204 mp, err := Parse(log.Logger, p.strict, br)

205 if err != nil {

206 return fmt.Errorf("parsing embedded message: %w", err)

207 }

208 if err := mp.Walk(log.Logger, nil); err != nil {

209 // If this is a DSN and we are not in pedantic mode, accept unexpected end of

210 // message. This is quite common because MTA's sometimes just truncate the original

211 // message in a place that makes the message invalid.

212 if errors.Is(err, errUnexpectedEOF) && !Pedantic && parent != nil && len(parent.Parts) >= 3 && p == &parent.Parts[2] && parent.MediaType == "MULTIPART" && parent.MediaSubType == "REPORT" {

213 mp, err = fallbackPart(mp, br, int64(len(buf)))

214 if err != nil {

215 return fmt.Errorf("parsing invalid embedded message: %w", err)

216 }

217 } else {

218 return fmt.Errorf("parsing parts of embedded message: %w", err)

219 }

220 }

221 // todo: if mp does not contain any non-identity content-transfer-encoding, we should set an offsetReader of p.r on mp, recursively.

222 p.Message = &mp

223 return nil

224 }

225 _, err := io.Copy(io.Discard, p.Reader())

226 return err

227 }

228

229 for {

230 pp, err := p.ParseNextPart(log.Logger)

231 if err == io.EOF {

232 return nil

233 }

234 if err != nil {

235 return err

236 }

237 if err := pp.Walk(log.Logger, p); err != nil {

238 return err

239 }

240 }

241}

242

243// String returns a debugging representation of the part.

244func (p *Part) String() string {

245 return fmt.Sprintf("&Part{%s/%s offsets %d/%d/%d/%d lines %d decodedsize %d next %d last %d bound %q parts %v}", p.MediaType, p.MediaSubType, p.BoundaryOffset, p.HeaderOffset, p.BodyOffset, p.EndOffset, p.RawLineCount, p.DecodedSize, p.nextBoundOffset, p.lastBoundOffset, p.bound, p.Parts)

246}

247

248// newPart parses a new part, which can be the top-level message.

249// offset is the bound offset for parts, and the start of message for top-level messages. parent indicates if this is a top-level message or sub-part.

250// If an error occurs, p's exported values can still be relevant. EnsurePart uses these values.

251func newPart(log mlog.Log, strict bool, r io.ReaderAt, offset int64, parent *Part) (p Part, rerr error) {

252 if r == nil {

253 panic("nil reader")

254 }

255 p = Part{

256 BoundaryOffset: -1,

257 EndOffset: -1,

258 r: r,

259 parent: parent,

260 strict: strict,

261 }

262

263 b := &bufAt{strict: strict, r: r, offset: offset}

264

265 if parent != nil {

266 p.BoundaryOffset = offset

267 if line, _, err := b.ReadLine(true); err != nil {

268 return p, err

269 } else if match, finish := checkBound(line, parent.bound); !match {

270 return p, fmt.Errorf("missing bound")

271 } else if finish {

272 return p, fmt.Errorf("new part for closing boundary")

273 }

274 }

275

276 // Collect header.

277 p.HeaderOffset = b.offset

278 p.BodyOffset = b.offset

279 hb := &bytes.Buffer{}

280 for {

281 line, _, err := b.ReadLine(true)

282 if err == io.EOF {

283 // No body is valid.

284 break

285 }

286 if err != nil {

287 return p, fmt.Errorf("reading header line: %w", err)

288 }

289 hb.Write(line)

290 if len(line) == 2 {

291 break // crlf

292 }

293 }

294 p.BodyOffset = b.offset

295

296 // Don't attempt to parse empty header, mail.ReadMessage doesn't like it.

297 if p.HeaderOffset == p.BodyOffset {

298 p.header = textproto.MIMEHeader{}

299 } else {

300 h, err := parseHeader(hb)

301 if err != nil {

302 return p, fmt.Errorf("parsing header: %w", err)

303 }

304 p.header = h

305 }

306

307 ct := p.header.Get("Content-Type")

308 mt, params, err := mime.ParseMediaType(ct)

309 if err != nil && ct != "" {

310 if Pedantic || strict {

311 return p, fmt.Errorf("%w: %s: %q", ErrBadContentType, err, ct)

312 }

313

314 // Try parsing just a content-type, ignoring parameters.

315 // ../rfc/2045:628

316 ct = strings.TrimSpace(strings.SplitN(ct, ";", 2)[0])

317 t := strings.SplitN(ct, "/", 2)

318 isToken := func(s string) bool {

319 const separators = `()<>@,;:\\"/[]?= ` // ../rfc/2045:663

320 for _, c := range s {

321 if c < 0x20 || c >= 0x80 || strings.ContainsRune(separators, c) {

322 return false

323 }

324 }

325 return len(s) > 0

326 }

327 // We cannot recover content-type of multipart, we won't have a boundary.

328 if len(t) == 2 && isToken(t[0]) && !strings.EqualFold(t[0], "multipart") && isToken(t[1]) {

329 p.MediaType = strings.ToUpper(t[0])

330 p.MediaSubType = strings.ToUpper(t[1])

331 } else {

332 p.MediaType = "APPLICATION"

333 p.MediaSubType = "OCTET-STREAM"

334 }

335 log.Debugx("malformed content-type, attempting to recover and continuing", err,

336 slog.String("contenttype", p.header.Get("Content-Type")),

337 slog.String("mediatype", p.MediaType),

338 slog.String("mediasubtype", p.MediaSubType))

339 } else if mt != "" {

340 t := strings.SplitN(strings.ToUpper(mt), "/", 2)

341 if len(t) != 2 {

342 if Pedantic || strict {

343 return p, fmt.Errorf("bad content-type: %q (content-type %q)", mt, ct)

344 }

345 log.Debug("malformed media-type, ignoring and continuing", slog.String("type", mt))

346 p.MediaType = "APPLICATION"

347 p.MediaSubType = "OCTET-STREAM"

348 } else {

349 p.MediaType = t[0]

350 p.MediaSubType = t[1]

351 p.ContentTypeParams = params

352 }

353 }

354

355 p.ContentID = p.header.Get("Content-Id")

356 p.ContentDescription = p.header.Get("Content-Description")

357 p.ContentTransferEncoding = strings.ToUpper(p.header.Get("Content-Transfer-Encoding"))

358

359 if parent == nil {

360 p.Envelope, err = parseEnvelope(log, mail.Header(p.header))

361 if err != nil {

362 return p, err

363 }

364 }

365

366 if p.MediaType == "MULTIPART" {

367 s := params["boundary"]

368 if s == "" {

369 return p, errMissingBoundaryParam

370 }

371 p.bound = append([]byte("--"), s...)

372

373 // Discard preamble, before first boundary.

374 for {

375 line, _, err := b.PeekLine(true)

376 if err != nil {

377 return p, fmt.Errorf("parsing line for part preamble: %w", err)

378 }

379 // Line only needs boundary prefix, not exact match. ../rfc/2046:1103

380 // Well, for compatibility, we require whitespace after the boundary. Because some

381 // software use the same boundary but with text appended for sub parts.

382 if match, finish := checkBound(line, p.bound); match {

383 if finish {

384 return p, errFirstBoundCloses

385 }

386 break

387 }

388 b.ReadLine(true)

389 }

390 p.nextBoundOffset = b.offset

391 p.lastBoundOffset = b.offset

392 }

393

394 return p, nil

395}

396

397// Header returns the parsed header of this part.

398//

399// Returns a ErrHeader for messages with invalid header syntax.

400func (p *Part) Header() (textproto.MIMEHeader, error) {

401 if p.header != nil {

402 return p.header, nil

403 }

404 if p.HeaderOffset == p.BodyOffset {

405 p.header = textproto.MIMEHeader{}

406 return p.header, nil

407 }

408 h, err := parseHeader(p.HeaderReader())

409 p.header = h

410 return h, err

411}

412

413// HeaderReader returns a reader for the header section of this part, including ending bare CRLF.

414func (p *Part) HeaderReader() io.Reader {

415 return io.NewSectionReader(p.r, p.HeaderOffset, p.BodyOffset-p.HeaderOffset)

416}

417

418// parse a header, only call this on non-empty input (even though that is a valid header).

419func parseHeader(r io.Reader) (textproto.MIMEHeader, error) {

420 // We read using mail.ReadMessage instead of textproto.ReadMIMEHeaders because the

421 // first handles email messages properly, while the second only works for HTTP

422 // headers.

423 var zero textproto.MIMEHeader

424

425 // We read the header and add the optional \r\n header/body separator. If the \r\n

426 // is missing, parsing with Go <1.21 results in an EOF error.

427 // todo: directly parse from reader r when Go 1.20 is no longer supported.

428 buf, err := io.ReadAll(r)

429 if err != nil {

430 return zero, err

431 }

432 if bytes.HasSuffix(buf, []byte("\r\n")) && !bytes.HasSuffix(buf, []byte("\r\n\r\n")) {

433 buf = append(buf, "\r\n"...)

434 }

435 msg, err := mail.ReadMessage(bytes.NewReader(buf))

436 if err != nil {

437 // Recognize parsing errors from net/mail.ReadMessage.

438 // todo: replace with own message parsing code that returns proper error types.

439 errstr := err.Error()

440 if strings.HasPrefix(errstr, "malformed initial line:") || strings.HasPrefix(errstr, "malformed header line:") {

441 err = fmt.Errorf("%w: %v", ErrHeader, err)

442 }

443 return zero, err

444 }

445 return textproto.MIMEHeader(msg.Header), nil

446}

447

448var wordDecoder = mime.WordDecoder{

449 CharsetReader: func(charset string, r io.Reader) (io.Reader, error) {

450 switch strings.ToLower(charset) {

451 case "", "us-ascii", "utf-8":

452 return r, nil

453 }

454 enc, _ := ianaindex.MIME.Encoding(charset)

455 if enc == nil {

456 enc, _ = ianaindex.IANA.Encoding(charset)

457 }

458 if enc == nil {

459 return r, fmt.Errorf("unknown charset %q", charset)

460 }

461 return enc.NewDecoder().Reader(r), nil

462 },

463}

464

465func parseEnvelope(log mlog.Log, h mail.Header) (*Envelope, error) {

466 date, _ := h.Date()

467

468 // We currently marshal this field to JSON. But JSON cannot represent all

469 // time.Time. Time zone of 24:00 was seen in the wild. We won't try for extreme

470 // years, but we can readjust timezones.

471 // todo: remove this once we no longer store using json.

472 _, offset := date.Zone()

473 if date.Year() > 9999 {

474 date = time.Time{}

475 } else if offset <= -24*3600 || offset >= 24*3600 {

476 date = time.Unix(date.Unix(), 0).UTC()

477 }

478

479 subject := h.Get("Subject")

480 if s, err := wordDecoder.DecodeHeader(subject); err == nil {

481 subject = s

482 }

483

484 env := &Envelope{

485 date,

486 subject,

487 parseAddressList(log, h, "from"),

488 parseAddressList(log, h, "sender"),

489 parseAddressList(log, h, "reply-to"),

490 parseAddressList(log, h, "to"),

491 parseAddressList(log, h, "cc"),

492 parseAddressList(log, h, "bcc"),

493 h.Get("In-Reply-To"),

494 h.Get("Message-Id"),

495 }

496 return env, nil

497}

498

499func parseAddressList(log mlog.Log, h mail.Header, k string) []Address {

500 // todo: possibly work around ios mail generating incorrect q-encoded "phrases" with unencoded double quotes? ../rfc/2047:382

501 v := h.Get(k)

502 if v == "" {

503 return nil

504 }

505 parser := mail.AddressParser{WordDecoder: &wordDecoder}

506 l, err := parser.ParseList(v)

507 if err != nil {

508 return nil

509 }

510 var r []Address

511 for _, a := range l {

512 // todo: parse more fully according to ../rfc/5322:959

513 var user, host string

514 addr, err := smtp.ParseNetMailAddress(a.Address)

515 if err != nil {

516 log.Infox("parsing address (continuing)", err, slog.Any("netmailaddress", a.Address))

517 } else {

518 user = addr.Localpart.String()

519 host = addr.Domain.ASCII

520 }

521 r = append(r, Address{a.Name, user, host})

522 }

523 return r

524}

525

526// ParseNextPart parses the next (sub)part of this multipart message.

527// ParseNextPart returns io.EOF and a nil part when there are no more parts.

528// Only used for initial parsing of message. Once parsed, use p.Parts.

529func (p *Part) ParseNextPart(elog *slog.Logger) (*Part, error) {

530 log := mlog.New("message", elog)

531

532 if len(p.bound) == 0 {

533 return nil, errNotMultipart

534 }

535 if p.nextBoundOffset == -1 {

536 if enforceSequential {

537 panic("access not sequential")

538 }

539 // Set nextBoundOffset by fully reading the last part.

540 last, err := newPart(log, p.strict, p.r, p.lastBoundOffset, p)

541 if err != nil {

542 return nil, err

543 }

544 if _, err := io.Copy(io.Discard, last.RawReader()); err != nil {

545 return nil, err

546 }

547 if p.nextBoundOffset == -1 {

548 return nil, fmt.Errorf("internal error: reading part did not set nextBoundOffset")

549 }

550 }

551 b := &bufAt{strict: p.strict, r: p.r, offset: p.nextBoundOffset}

552 // todo: should we require a crlf on final closing bound? we don't require it because some message/rfc822 don't have a crlf after their closing boundary, so those messages don't end in crlf.

553 line, crlf, err := b.ReadLine(false)

554 if err != nil {

555 return nil, err

556 }

557 if match, finish := checkBound(line, p.bound); !match {

558 return nil, fmt.Errorf("expected bound, got %q", line)

559 } else if finish {

560 // Read any trailing data.

561 if p.parent != nil {

562 for {

563 line, _, err := b.PeekLine(false)

564 if err != nil {