diff --git a/src/mailparser/core.py b/src/mailparser/core.py index 10901af..aa61e9b 100644 --- a/src/mailparser/core.py +++ b/src/mailparser/core.py @@ -18,7 +18,6 @@ import base64 import email -import email.utils import ipaddress import json import logging @@ -29,6 +28,7 @@ convert_mail_date, decode_header_part, find_between, + get_addresses, get_header, get_mail_keys, get_to_domains, @@ -569,10 +569,17 @@ def __getattr__(self, name): # object headers elif name_header in ADDRESSES_HEADERS: raw_header = self.message.get(name_header, "") if self.message else "" - # parse before decoding - parsed_addresses = email.utils.getaddresses([raw_header], strict=True) - - # decoded addresses + # Parse addresses. RFC 5322 §3.4 does not allow unquoted "@" in + # display names, so a strict parser correctly rejects headers like + # From: alice@example.com + # and returns ('', ''). mail-parser is a security/forensics tool, + # not an MTA: hiding addresses from analysts is worse than accepting + # non-conforming input. get_addresses() applies a regex fallback + # when strict parsing yields only empty results — see its docstring + # in utils.py for the full rationale. + parsed_addresses = get_addresses(raw_header) + + # decoded addresses — skip entries with no address (absent header) return [ ( ( @@ -583,6 +590,7 @@ def __getattr__(self, name): email_addr, ) for name, email_addr in parsed_addresses + if email_addr ] # others headers diff --git a/src/mailparser/utils.py b/src/mailparser/utils.py index dbc6f96..ff4e725 100644 --- a/src/mailparser/utils.py +++ b/src/mailparser/utils.py @@ -50,6 +50,88 @@ log = logging.getLogger(__name__) +# --------------------------------------------------------------------------- +# RFC 5322 address parsing — fallback for non-compliant display names +# --------------------------------------------------------------------------- +# RFC 5322 §3.4 defines the display-name as a "phrase", which must not contain +# unquoted special characters such as "@". 
A header like +# +# From: alice@example.com +# +# is therefore *technically non-conforming*: the display name contains an +# unquoted "@". Python's ``email.utils.getaddresses`` with ``strict=True`` +# (hardened against CVE-2023-27043) correctly rejects this and returns +# ``[('', '')]``, leaving the real address invisible. +# +# mail-parser is a security / forensics tool, not an MTA. Silently hiding an +# address because its display-name looks like an e-mail address defeats the +# purpose of the tool — analysts *need* to see those values. We therefore +# bypass strict compliance with a regex fallback whenever strict parsing yields +# an empty address, always surfacing the value that is actually in the header. +_ADDR_FALLBACK_RE = re.compile( + r'"([^"]*?)"\s*<([^>]+)>' # "Quoted Name" + r"|([^<,]*?)\s*<([^>]+)>" # Any Name (incl. email-as-name) + r"|([^\s,<>]+@[^\s,<>]+)" # bare email@addr +) + + +def get_addresses(raw_header): + """ + Parse email addresses from a raw address header with a fallback for + RFC-non-compliant but real-world-common formats. + + RFC 5322 §3.4 requires the display name (phrase) before an angle-bracket + address to consist only of printable ASCII characters that are *not* + special. The ``@`` character is special, so a header such as:: + + From: alice@example.com + + is technically non-conforming because the display name contains an + unquoted ``@``. Python's ``email.utils.getaddresses`` with + ``strict=True`` (hardened against CVE-2023-27043) correctly returns + ``[('', '')]`` for this input, making the real sender invisible. + + mail-parser is a *security / forensics* tool, not an MTA. Silently + discarding an address because its display name happens to look like an + e-mail address would hide relevant forensic information from analysts — + the very opposite of what the tool is for. 
We therefore bypass strict + RFC compliance by applying a regex-based fallback whenever the strict + parser yields only empty addresses, so that analysts always see the value + that was actually present in the header. + + Args: + raw_header (str): raw value of an address header + (e.g. ``From``, ``To``, ``CC`` …) + + Returns: + list[tuple[str, str]]: list of ``(display_name, email_addr)`` tuples. + ``display_name`` is an empty string when absent. + """ + parsed = email.utils.getaddresses([raw_header], strict=True) + + # If every result from the strict parser has an empty address — while the + # raw header is non-empty — fall back to regex extraction so that the + # actual address values are not silently lost. + if raw_header.strip() and all(not addr for _, addr in parsed): + results = [] + for m in _ADDR_FALLBACK_RE.finditer(raw_header): + if m.group(2): # "Quoted Name" + results.append((m.group(1).strip(), m.group(2).strip())) + elif m.group(4): # Any Name (incl. email-as-display-name) + results.append((m.group(3).strip(), m.group(4).strip())) + elif m.group(5): # bare email + results.append(("", m.group(5).strip())) + if results: + log.debug( + "Strict address parsing yielded empty results for %r; " + "regex fallback recovered %d address(es)", + raw_header, + len(results), + ) + return results + + return parsed + def custom_log(level="WARNING", name=None): # pragma: no cover """ diff --git a/tests/mails/mail_test_19 b/tests/mails/mail_test_19 new file mode 100644 index 0000000..9011236 --- /dev/null +++ b/tests/mails/mail_test_19 @@ -0,0 +1,12 @@ +From: alice@example.com +To: "Charlie Brown" , dave@example.com +CC: eve@example.com +Reply-To: henry@example.com +Subject: Test email with email address as display name +Message-ID: +Date: Mon, 01 Jan 2024 12:00:00 +0000 +MIME-Version: 1.0 +Content-Type: text/plain; charset=utf-8 + +This email tests parsing of address headers where the display name is itself +an email address (RFC non-compliant but common in real-world 
class TestEmailAsDisplayName(unittest.TestCase):
    """
    Tests for address parsing when the display name is itself an email address.

    RFC 5322 §3.4 forbids unquoted "@" in the display-name phrase, so a header
    like ``From: alice@example.com <bob@example.com>`` is technically
    non-conforming.  Python's strict parser (CVE-2023-27043 hardening) returns
    ``[('', '')]`` for such input, which would silently hide the real sender.

    mail-parser is a security/forensics tool: it intentionally bypasses this
    strict compliance and applies a regex fallback so that analysts always see
    the address values that are actually present in the header.
    """

    # Message used by the first four tests.  It is kept inline so the class
    # does not depend on an external fixture file.  The Message-ID value is a
    # placeholder; no test reads it.
    RAW_MAIL = (
        "From: alice@example.com <bob@example.com>\n"
        'To: "Charlie Brown" <charlie@example.com>, dave@example.com\n'
        "CC: eve@example.com <frank@example.com>\n"
        "Reply-To: henry@example.com <ivan@example.com>\n"
        "Subject: Test email with email address as display name\n"
        "Message-ID: <mail-test-19@example.com>\n"
        "Date: Mon, 01 Jan 2024 12:00:00 +0000\n"
        "MIME-Version: 1.0\n"
        "Content-Type: text/plain; charset=utf-8\n"
        "\n"
        "This email tests parsing of address headers where the display name"
        " is itself\n"
        "an email address (RFC non-compliant but common in real-world mail).\n"
    )

    def _parse_fixture(self):
        """Return a freshly parsed copy of ``RAW_MAIL``."""
        return mailparser.parse_from_string(self.RAW_MAIL)

    def _assert_single(self, result, expected_name, expected_addr):
        """Assert *result* is a one-element list holding the expected pair."""
        self.assertIsInstance(result, list)
        self.assertEqual(len(result), 1)
        name, addr = result[0]
        self.assertEqual(addr, expected_addr)
        self.assertEqual(name, expected_name)

    def test_from_email_as_display_name(self):
        """From header with an email address as display name is parsed correctly."""
        mail = self._parse_fixture()
        self._assert_single(mail.from_, "alice@example.com", "bob@example.com")

    def test_cc_email_as_display_name(self):
        """CC header with an email address as display name is parsed correctly."""
        mail = self._parse_fixture()
        self._assert_single(mail.cc, "eve@example.com", "frank@example.com")

    def test_reply_to_email_as_display_name(self):
        """Reply-To header with an email address as display name is parsed correctly."""
        mail = self._parse_fixture()
        self._assert_single(mail.reply_to, "henry@example.com", "ivan@example.com")

    def test_to_mixed_addresses(self):
        """To header with a mix of quoted name and bare address is parsed correctly."""
        mail = self._parse_fixture()
        result = mail.to
        self.assertIsInstance(result, list)
        self.assertEqual(len(result), 2)
        # "Charlie Brown" <charlie@example.com>
        name0, addr0 = result[0]
        self.assertEqual(addr0, "charlie@example.com")
        self.assertEqual(name0, "Charlie Brown")
        # dave@example.com (bare address, no display name)
        name1, addr1 = result[1]
        self.assertEqual(addr1, "dave@example.com")
        self.assertEqual(name1, "")

    # ------------------------------------------------------------------
    # Edge-case tests via parse_from_string
    # ------------------------------------------------------------------

    def test_same_email_as_name_and_address_suppresses_name(self):
        """When display name == address, name is suppressed to empty string.

        This covers the case ``From: bob@example.com <bob@example.com>`` which
        is both RFC non-compliant (unquoted @) AND redundant.  After the regex
        fallback recovers the address, the existing name-suppression logic
        (decoded_name == email_addr → "") must still fire correctly.
        """
        mail = mailparser.parse_from_string(
            "From: bob@example.com <bob@example.com>\nSubject: x\n\nBody"
        )
        result = mail.from_
        self.assertEqual(len(result), 1)
        name, addr = result[0]
        self.assertEqual(addr, "bob@example.com")
        self.assertEqual(name, "")

    def test_quoted_email_as_display_name(self):
        """Properly quoted email-as-name (RFC-compliant) is parsed by strict parser."""
        mail = mailparser.parse_from_string(
            'From: "alice@example.com" <bob@example.com>\nSubject: x\n\nBody'
        )
        result = mail.from_
        self.assertEqual(len(result), 1)
        name, addr = result[0]
        self.assertEqual(addr, "bob@example.com")
        self.assertEqual(name, "alice@example.com")

    def test_standard_display_name_unchanged(self):
        """Standard ``Name <addr>`` format still works correctly (no regression)."""
        mail = mailparser.parse_from_string(
            "From: Alice Smith <alice@example.com>\nSubject: x\n\nBody"
        )
        result = mail.from_
        self.assertEqual(len(result), 1)
        name, addr = result[0]
        self.assertEqual(addr, "alice@example.com")
        self.assertEqual(name, "Alice Smith")

    def test_bare_address_no_display_name(self):
        """Bare address with no display name returns empty name (no regression)."""
        mail = mailparser.parse_from_string(
            "From: alice@example.com\nSubject: x\n\nBody"
        )
        result = mail.from_
        self.assertEqual(len(result), 1)
        name, addr = result[0]
        self.assertEqual(addr, "alice@example.com")
        self.assertEqual(name, "")

    def test_empty_header_returns_empty_list(self):
        """A missing address header returns [] — absent headers must not appear."""
        mail = mailparser.parse_from_string("Subject: x\n\nBody")
        # get_addresses("") yields [('', '')] (see
        # test_get_addresses_empty_header); entries with an empty address are
        # expected to be filtered out of the parsed mail object.
        self.assertEqual(mail.from_, [])

    # ------------------------------------------------------------------
    # Unit tests for get_addresses() helper directly
    # ------------------------------------------------------------------

    def test_get_addresses_email_as_name(self):
        """get_addresses() fallback recovers address when display name is an email."""
        result = get_addresses("alice@example.com <bob@example.com>")
        self.assertEqual(result, [("alice@example.com", "bob@example.com")])

    def test_get_addresses_standard_format(self):
        """get_addresses() strict path handles normal ``Name <addr>`` correctly."""
        result = get_addresses("Alice Smith <alice@example.com>")
        self.assertEqual(result, [("Alice Smith", "alice@example.com")])

    def test_get_addresses_bare_email(self):
        """get_addresses() handles bare email address with no display name."""
        result = get_addresses("alice@example.com")
        self.assertEqual(result, [("", "alice@example.com")])

    def test_get_addresses_empty_header(self):
        """get_addresses() on empty string returns [('', '')] — raw Python lib result.

        The ('', '') entry is expected to be filtered out by the mail object's
        address attributes (see test_empty_header_returns_empty_list) so that
        absent headers do not appear in the parsed mail output.
        """
        result = get_addresses("")
        self.assertEqual(result, [("", "")])

    def test_get_addresses_multiple_with_email_as_name(self):
        """get_addresses() fallback handles multiple addresses when all fail strict."""
        result = get_addresses(
            "alice@example.com <bob@example.com>, eve@example.com <frank@example.com>"
        )
        self.assertEqual(len(result), 2)
        self.assertEqual(result[0], ("alice@example.com", "bob@example.com"))
        self.assertEqual(result[1], ("eve@example.com", "frank@example.com"))