Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 13 additions & 5 deletions src/mailparser/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@

import base64
import email
import email.utils
import ipaddress
import json
import logging
Expand All @@ -29,6 +28,7 @@
convert_mail_date,
decode_header_part,
find_between,
get_addresses,
get_header,
get_mail_keys,
get_to_domains,
Expand Down Expand Up @@ -569,10 +569,17 @@ def __getattr__(self, name):
# object headers
elif name_header in ADDRESSES_HEADERS:
raw_header = self.message.get(name_header, "") if self.message else ""
# parse before decoding
parsed_addresses = email.utils.getaddresses([raw_header], strict=True)

# decoded addresses
# Parse addresses. RFC 5322 §3.4 does not allow unquoted "@" in
# display names, so a strict parser correctly rejects headers like
# From: alice@example.com <bob@example.com>
# and returns ('', ''). mail-parser is a security/forensics tool,
# not an MTA: hiding addresses from analysts is worse than accepting
# non-conforming input. get_addresses() applies a regex fallback
# when strict parsing yields only empty results — see its docstring
# in utils.py for the full rationale.
parsed_addresses = get_addresses(raw_header)

# decoded addresses — skip entries with no address (absent header)
return [
(
(
Expand All @@ -583,6 +590,7 @@ def __getattr__(self, name):
email_addr,
)
for name, email_addr in parsed_addresses
if email_addr
]

# other headers
Expand Down
82 changes: 82 additions & 0 deletions src/mailparser/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,88 @@

log = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# RFC 5322 address parsing — fallback for non-compliant display names
# ---------------------------------------------------------------------------
# RFC 5322 §3.4 defines the display-name as a "phrase", which must not contain
# unquoted special characters such as "@". A header like
#
# From: alice@example.com <bob@example.com>
#
# is therefore *technically non-conforming*: the display name contains an
# unquoted "@". Python's ``email.utils.getaddresses`` with ``strict=True``
# (hardened against CVE-2023-27043) correctly rejects this and returns
# ``[('', '')]``, leaving the real address invisible.
#
# mail-parser is a security / forensics tool, not an MTA. Silently hiding an
# address because its display-name looks like an e-mail address defeats the
# purpose of the tool — analysts *need* to see those values. We therefore
# bypass strict compliance with a regex fallback whenever strict parsing yields
# an empty address, always surfacing the value that is actually in the header.
_ADDR_FALLBACK_RE = re.compile(
r'"([^"]*?)"\s*<([^>]+)>' # "Quoted Name" <email@addr>
r"|([^<,]*?)\s*<([^>]+)>" # Any Name <email@addr> (incl. email-as-name)
r"|([^\s,<>]+@[^\s,<>]+)" # bare email@addr
)


def get_addresses(raw_header):
"""
Parse email addresses from a raw address header with a fallback for
RFC-non-compliant but real-world-common formats.

RFC 5322 §3.4 requires the display name (phrase) before an angle-bracket
address to consist only of printable ASCII characters that are *not*
special. The ``@`` character is special, so a header such as::

From: alice@example.com <bob@example.com>

is technically non-conforming because the display name contains an
unquoted ``@``. Python's ``email.utils.getaddresses`` with
``strict=True`` (hardened against CVE-2023-27043) correctly returns
``[('', '')]`` for this input, making the real sender invisible.

mail-parser is a *security / forensics* tool, not an MTA. Silently
discarding an address because its display name happens to look like an
e-mail address would hide relevant forensic information from analysts —
the very opposite of what the tool is for. We therefore bypass strict
RFC compliance by applying a regex-based fallback whenever the strict
parser yields only empty addresses, so that analysts always see the value
that was actually present in the header.

Args:
raw_header (str): raw value of an address header
(e.g. ``From``, ``To``, ``CC`` …)

Returns:
list[tuple[str, str]]: list of ``(display_name, email_addr)`` tuples.
``display_name`` is an empty string when absent.
"""
parsed = email.utils.getaddresses([raw_header], strict=True)

# If every result from the strict parser has an empty address — while the
# raw header is non-empty — fall back to regex extraction so that the
# actual address values are not silently lost.
if raw_header.strip() and all(not addr for _, addr in parsed):
results = []
for m in _ADDR_FALLBACK_RE.finditer(raw_header):
if m.group(2): # "Quoted Name" <email>
results.append((m.group(1).strip(), m.group(2).strip()))
elif m.group(4): # Any Name <email> (incl. email-as-display-name)
results.append((m.group(3).strip(), m.group(4).strip()))
elif m.group(5): # bare email
results.append(("", m.group(5).strip()))
if results:
log.debug(
"Strict address parsing yielded empty results for %r; "
"regex fallback recovered %d address(es)",
raw_header,
len(results),
)
return results

return parsed


def custom_log(level="WARNING", name=None): # pragma: no cover
"""
Expand Down
12 changes: 12 additions & 0 deletions tests/mails/mail_test_19
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
From: alice@example.com <bob@example.com>
To: "Charlie Brown" <charlie@example.com>, dave@example.com
CC: eve@example.com <frank@example.com>
Reply-To: henry@example.com <ivan@example.com>
Subject: Test email with email address as display name
Message-ID: <test-email-as-name@example.com>
Date: Mon, 01 Jan 2024 12:00:00 +0000
MIME-Version: 1.0
Content-Type: text/plain; charset=utf-8

This email tests parsing of address headers where the display name is itself
an email address (RFC non-compliant but common in real-world mail).
161 changes: 161 additions & 0 deletions tests/test_mail_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
from mailparser.utils import (
convert_mail_date,
fingerprints,
get_addresses,
get_header,
get_mail_keys,
get_to_domains,
Expand Down Expand Up @@ -62,6 +63,7 @@
mail_test_16 = os.path.join(base_path, "mails", "mail_test_16")
mail_test_17 = os.path.join(base_path, "mails", "mail_test_17")
mail_test_18 = os.path.join(base_path, "mails", "mail_test_18")
mail_test_19 = os.path.join(base_path, "mails", "mail_test_19")
mail_malformed_1 = os.path.join(base_path, "mails", "mail_malformed_1")
mail_malformed_2 = os.path.join(base_path, "mails", "mail_malformed_2")
mail_malformed_3 = os.path.join(base_path, "mails", "mail_malformed_3")
Expand Down Expand Up @@ -1084,3 +1086,162 @@ def test_unicode_decode_error_in_payload(self):
mail = mailparser.parse_from_string(raw_mail)
# Should have parsed successfully and body contains the text
self.assertIn("hello", mail.body)


class TestEmailAsDisplayName(unittest.TestCase):
    """
    Address-header parsing when the display name looks like an e-mail address.

    A header such as ``From: alice@example.com <bob@example.com>`` carries an
    unquoted "@" in its display-name phrase, which RFC 5322 §3.4 does not
    permit.  Python's strict parser (CVE-2023-27043 hardening) answers such
    input with ``[('', '')]``, which would hide the real sender.

    mail-parser is a security/forensics tool, so it deliberately steps around
    that strictness with a regex fallback; these tests pin that analysts still
    get the address values that are actually present in the header.
    """

    # ------------------------------------------------------------------
    # Shared assertion helpers
    # ------------------------------------------------------------------

    def _assert_pair(self, pair, expected_name, expected_addr):
        """Unpack one ``(name, addr)`` tuple and compare address, then name."""
        display_name, address = pair
        self.assertEqual(address, expected_addr)
        self.assertEqual(display_name, expected_name)

    def _assert_only_entry(self, entries, expected_name, expected_addr):
        """Require exactly one entry and compare it with the expected pair."""
        self.assertEqual(len(entries), 1)
        self._assert_pair(entries[0], expected_name, expected_addr)

    # ------------------------------------------------------------------
    # Fixture-based tests (tests/mails/mail_test_19)
    # ------------------------------------------------------------------

    def test_from_email_as_display_name(self):
        """From: an e-mail address used as display name is still parsed."""
        senders = mailparser.parse_from_file(mail_test_19).from_
        self.assertIsInstance(senders, list)
        self._assert_only_entry(senders, "alice@example.com", "bob@example.com")

    def test_cc_email_as_display_name(self):
        """CC: an e-mail address used as display name is still parsed."""
        copies = mailparser.parse_from_file(mail_test_19).cc
        self.assertIsInstance(copies, list)
        self._assert_only_entry(copies, "eve@example.com", "frank@example.com")

    def test_reply_to_email_as_display_name(self):
        """Reply-To: an e-mail address used as display name is still parsed."""
        replies = mailparser.parse_from_file(mail_test_19).reply_to
        self.assertIsInstance(replies, list)
        self._assert_only_entry(replies, "henry@example.com", "ivan@example.com")

    def test_to_mixed_addresses(self):
        """To: a quoted-name entry followed by a bare address are both parsed."""
        recipients = mailparser.parse_from_file(mail_test_19).to
        self.assertIsInstance(recipients, list)
        self.assertEqual(len(recipients), 2)
        # "Charlie Brown" <charlie@example.com>
        self._assert_pair(recipients[0], "Charlie Brown", "charlie@example.com")
        # dave@example.com (bare address, no display name)
        self._assert_pair(recipients[1], "", "dave@example.com")

    # ------------------------------------------------------------------
    # Edge cases driven through parse_from_string (no extra mail files)
    # ------------------------------------------------------------------

    def test_same_email_as_name_and_address_suppresses_name(self):
        """A display name identical to the address collapses to "".

        ``From: bob@example.com <bob@example.com>`` is both non-compliant
        (unquoted @) and redundant.  Once the regex fallback has recovered
        the address, the pre-existing suppression rule
        (decoded_name == email_addr → "") must still apply.
        """
        message = mailparser.parse_from_string(
            "From: bob@example.com <bob@example.com>\nSubject: x\n\nBody"
        )
        self._assert_only_entry(message.from_, "", "bob@example.com")

    def test_quoted_email_as_display_name(self):
        """A properly quoted e-mail-as-name is RFC-compliant and parses strictly."""
        message = mailparser.parse_from_string(
            'From: "alice@example.com" <bob@example.com>\nSubject: x\n\nBody'
        )
        self._assert_only_entry(
            message.from_, "alice@example.com", "bob@example.com"
        )

    def test_standard_display_name_unchanged(self):
        """The ordinary ``Name <email>`` form keeps working (regression guard)."""
        message = mailparser.parse_from_string(
            "From: Alice Smith <alice@example.com>\nSubject: x\n\nBody"
        )
        self._assert_only_entry(message.from_, "Alice Smith", "alice@example.com")

    def test_bare_address_no_display_name(self):
        """A bare address yields an empty display name (regression guard)."""
        message = mailparser.parse_from_string(
            "From: alice@example.com\nSubject: x\n\nBody"
        )
        self._assert_only_entry(message.from_, "", "alice@example.com")

    def test_empty_header_returns_empty_list(self):
        """An absent address header is reported as [] rather than a blank entry."""
        message = mailparser.parse_from_string("Subject: x\n\nBody")
        # getaddresses("") gives [('', '')]; entries without an address are
        # filtered out, so a missing header leaves nothing behind in the
        # parsed mail object.
        self.assertEqual(message.from_, [])

    # ------------------------------------------------------------------
    # Direct unit tests of the get_addresses() helper
    # ------------------------------------------------------------------

    def test_get_addresses_email_as_name(self):
        """The fallback recovers the address when the display name is an e-mail."""
        self.assertEqual(
            get_addresses("alice@example.com <bob@example.com>"),
            [("alice@example.com", "bob@example.com")],
        )

    def test_get_addresses_standard_format(self):
        """The strict path handles a normal ``Name <email>`` value."""
        self.assertEqual(
            get_addresses("Alice Smith <alice@example.com>"),
            [("Alice Smith", "alice@example.com")],
        )

    def test_get_addresses_bare_email(self):
        """A bare address comes back with an empty display name."""
        self.assertEqual(
            get_addresses("alice@example.com"),
            [("", "alice@example.com")],
        )

    def test_get_addresses_empty_header(self):
        """An empty string yields [('', '')], the raw standard-library result.

        That ('', '') entry is dropped later in __getattr__ (core.py), which
        keeps absent headers out of the parsed mail output.
        """
        self.assertEqual(get_addresses(""), [("", "")])

    def test_get_addresses_multiple_with_email_as_name(self):
        """The fallback copes with several addresses that all fail strict parsing."""
        recovered = get_addresses(
            "alice@example.com <bob@example.com>, eve@example.com <frank@example.com>"
        )
        expected = [
            ("alice@example.com", "bob@example.com"),
            ("eve@example.com", "frank@example.com"),
        ]
        self.assertEqual(len(recovered), 2)
        for position, wanted in enumerate(expected):
            self.assertEqual(recovered[position], wanted)
Loading