Skip to content

Commit e18e7c9

Browse files
Fixed issue #132 and #133 (#151)
* Issue #132 * fix: Update address parsing to skip empty entries from absent headers
1 parent f6df398 commit e18e7c9

4 files changed

Lines changed: 268 additions & 5 deletions

File tree

src/mailparser/core.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818

1919
import base64
2020
import email
21-
import email.utils
2221
import ipaddress
2322
import json
2423
import logging
@@ -29,6 +28,7 @@
2928
convert_mail_date,
3029
decode_header_part,
3130
find_between,
31+
get_addresses,
3232
get_header,
3333
get_mail_keys,
3434
get_to_domains,
@@ -569,10 +569,17 @@ def __getattr__(self, name):
569569
# object headers
570570
elif name_header in ADDRESSES_HEADERS:
571571
raw_header = self.message.get(name_header, "") if self.message else ""
572-
# parse before decoding
573-
parsed_addresses = email.utils.getaddresses([raw_header], strict=True)
574-
575-
# decoded addresses
572+
# Parse addresses. RFC 5322 §3.4 does not allow unquoted "@" in
573+
# display names, so a strict parser correctly rejects headers like
574+
# From: alice@example.com <bob@example.com>
575+
# and returns ('', ''). mail-parser is a security/forensics tool,
576+
# not an MTA: hiding addresses from analysts is worse than accepting
577+
# non-conforming input. get_addresses() applies a regex fallback
578+
# when strict parsing yields only empty results — see its docstring
579+
# in utils.py for the full rationale.
580+
parsed_addresses = get_addresses(raw_header)
581+
582+
# decoded addresses — skip entries with no address (absent header)
576583
return [
577584
(
578585
(
@@ -583,6 +590,7 @@ def __getattr__(self, name):
583590
email_addr,
584591
)
585592
for name, email_addr in parsed_addresses
593+
if email_addr
586594
]
587595

588596
# others headers

src/mailparser/utils.py

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,88 @@
5050

5151
log = logging.getLogger(__name__)
5252

53+
# ---------------------------------------------------------------------------
54+
# RFC 5322 address parsing — fallback for non-compliant display names
55+
# ---------------------------------------------------------------------------
56+
# RFC 5322 §3.4 defines the display-name as a "phrase", which must not contain
57+
# unquoted special characters such as "@". A header like
58+
#
59+
# From: alice@example.com <bob@example.com>
60+
#
61+
# is therefore *technically non-conforming*: the display name contains an
62+
# unquoted "@". Python's ``email.utils.getaddresses`` with ``strict=True``
63+
# (hardened against CVE-2023-27043) correctly rejects this and returns
64+
# ``[('', '')]``, leaving the real address invisible.
65+
#
66+
# mail-parser is a security / forensics tool, not an MTA. Silently hiding an
67+
# address because its display-name looks like an e-mail address defeats the
68+
# purpose of the tool — analysts *need* to see those values. We therefore
69+
# bypass strict compliance with a regex fallback whenever strict parsing yields
70+
# an empty address, always surfacing the value that is actually in the header.
71+
_ADDR_FALLBACK_RE = re.compile(
72+
r'"([^"]*?)"\s*<([^>]+)>' # "Quoted Name" <email@addr>
73+
r"|([^<,]*?)\s*<([^>]+)>" # Any Name <email@addr> (incl. email-as-name)
74+
r"|([^\s,<>]+@[^\s,<>]+)" # bare email@addr
75+
)
76+
77+
78+
def get_addresses(raw_header):
79+
"""
80+
Parse email addresses from a raw address header with a fallback for
81+
RFC-non-compliant but real-world-common formats.
82+
83+
RFC 5322 §3.4 requires the display name (phrase) before an angle-bracket
84+
address to consist only of printable ASCII characters that are *not*
85+
special. The ``@`` character is special, so a header such as::
86+
87+
From: alice@example.com <bob@example.com>
88+
89+
is technically non-conforming because the display name contains an
90+
unquoted ``@``. Python's ``email.utils.getaddresses`` with
91+
``strict=True`` (hardened against CVE-2023-27043) correctly returns
92+
``[('', '')]`` for this input, making the real sender invisible.
93+
94+
mail-parser is a *security / forensics* tool, not an MTA. Silently
95+
discarding an address because its display name happens to look like an
96+
e-mail address would hide relevant forensic information from analysts —
97+
the very opposite of what the tool is for. We therefore bypass strict
98+
RFC compliance by applying a regex-based fallback whenever the strict
99+
parser yields only empty addresses, so that analysts always see the value
100+
that was actually present in the header.
101+
102+
Args:
103+
raw_header (str): raw value of an address header
104+
(e.g. ``From``, ``To``, ``CC`` …)
105+
106+
Returns:
107+
list[tuple[str, str]]: list of ``(display_name, email_addr)`` tuples.
108+
``display_name`` is an empty string when absent.
109+
"""
110+
parsed = email.utils.getaddresses([raw_header], strict=True)
111+
112+
# If every result from the strict parser has an empty address — while the
113+
# raw header is non-empty — fall back to regex extraction so that the
114+
# actual address values are not silently lost.
115+
if raw_header.strip() and all(not addr for _, addr in parsed):
116+
results = []
117+
for m in _ADDR_FALLBACK_RE.finditer(raw_header):
118+
if m.group(2): # "Quoted Name" <email>
119+
results.append((m.group(1).strip(), m.group(2).strip()))
120+
elif m.group(4): # Any Name <email> (incl. email-as-display-name)
121+
results.append((m.group(3).strip(), m.group(4).strip()))
122+
elif m.group(5): # bare email
123+
results.append(("", m.group(5).strip()))
124+
if results:
125+
log.debug(
126+
"Strict address parsing yielded empty results for %r; "
127+
"regex fallback recovered %d address(es)",
128+
raw_header,
129+
len(results),
130+
)
131+
return results
132+
133+
return parsed
134+
53135

54136
def custom_log(level="WARNING", name=None): # pragma: no cover
55137
"""

tests/mails/mail_test_19

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
From: alice@example.com <bob@example.com>
2+
To: "Charlie Brown" <charlie@example.com>, dave@example.com
3+
CC: eve@example.com <frank@example.com>
4+
Reply-To: henry@example.com <ivan@example.com>
5+
Subject: Test email with email address as display name
6+
Message-ID: <test-email-as-name@example.com>
7+
Date: Mon, 01 Jan 2024 12:00:00 +0000
8+
MIME-Version: 1.0
9+
Content-Type: text/plain; charset=utf-8
10+
11+
This email tests parsing of address headers where the display name is itself
12+
an email address (RFC non-compliant but common in real-world mail).

tests/test_mail_parser.py

Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
from mailparser.utils import (
3030
convert_mail_date,
3131
fingerprints,
32+
get_addresses,
3233
get_header,
3334
get_mail_keys,
3435
get_to_domains,
@@ -62,6 +63,7 @@
6263
mail_test_16 = os.path.join(base_path, "mails", "mail_test_16")
6364
mail_test_17 = os.path.join(base_path, "mails", "mail_test_17")
6465
mail_test_18 = os.path.join(base_path, "mails", "mail_test_18")
66+
mail_test_19 = os.path.join(base_path, "mails", "mail_test_19")
6567
mail_malformed_1 = os.path.join(base_path, "mails", "mail_malformed_1")
6668
mail_malformed_2 = os.path.join(base_path, "mails", "mail_malformed_2")
6769
mail_malformed_3 = os.path.join(base_path, "mails", "mail_malformed_3")
@@ -1084,3 +1086,162 @@ def test_unicode_decode_error_in_payload(self):
10841086
mail = mailparser.parse_from_string(raw_mail)
10851087
# Should have parsed successfully and body contains the text
10861088
self.assertIn("hello", mail.body)
1089+
1090+
1091+
class TestEmailAsDisplayName(unittest.TestCase):
    """
    Address parsing when the display name is itself an e-mail address.

    A header like ``From: alice@example.com <bob@example.com>`` carries an
    unquoted "@" in its display-name phrase, which RFC 5322 §3.4 does not
    allow. Python's strict parser (CVE-2023-27043 hardening) answers with
    ``[('', '')]`` for it, hiding the real sender.

    mail-parser is a security/forensics tool, so it deliberately steps around
    that strictness with a regex fallback; these tests pin that analysts still
    get the values that are actually in the header.
    """

    def _assert_addresses(self, actual, expected, require_list=False):
        """Compare ``actual`` with the ``expected`` (name, address) pairs.

        The length is checked first, then each entry's address followed by
        its display name. ``require_list`` additionally asserts that
        ``actual`` is a ``list``.
        """
        if require_list:
            self.assertIsInstance(actual, list)
        self.assertEqual(len(actual), len(expected))
        for (got_name, got_addr), (want_name, want_addr) in zip(actual, expected):
            self.assertEqual(got_addr, want_addr)
            self.assertEqual(got_name, want_name)

    def test_from_email_as_display_name(self):
        """From header whose display name is an e-mail address."""
        parsed_mail = mailparser.parse_from_file(mail_test_19)
        self._assert_addresses(
            parsed_mail.from_,
            [("alice@example.com", "bob@example.com")],
            require_list=True,
        )

    def test_cc_email_as_display_name(self):
        """CC header whose display name is an e-mail address."""
        parsed_mail = mailparser.parse_from_file(mail_test_19)
        self._assert_addresses(
            parsed_mail.cc,
            [("eve@example.com", "frank@example.com")],
            require_list=True,
        )

    def test_reply_to_email_as_display_name(self):
        """Reply-To header whose display name is an e-mail address."""
        parsed_mail = mailparser.parse_from_file(mail_test_19)
        self._assert_addresses(
            parsed_mail.reply_to,
            [("henry@example.com", "ivan@example.com")],
            require_list=True,
        )

    def test_to_mixed_addresses(self):
        """To header mixing a quoted display name and a bare address."""
        parsed_mail = mailparser.parse_from_file(mail_test_19)
        self._assert_addresses(
            parsed_mail.to,
            [
                # "Charlie Brown" <charlie@example.com>
                ("Charlie Brown", "charlie@example.com"),
                # dave@example.com (bare address, no display name)
                ("", "dave@example.com"),
            ],
            require_list=True,
        )

    # ------------------------------------------------------------------
    # Edge cases fed through parse_from_string (no extra mail files needed)
    # ------------------------------------------------------------------

    def test_same_email_as_name_and_address_suppresses_name(self):
        """A display name equal to the address is reported as an empty name.

        ``From: bob@example.com <bob@example.com>`` is RFC non-compliant
        (unquoted @) and redundant at the same time. Once the regex fallback
        has recovered the address, the existing name-suppression logic
        (decoded_name == email_addr → "") still has to apply.
        """
        parsed_mail = mailparser.parse_from_string(
            "From: bob@example.com <bob@example.com>\nSubject: x\n\nBody"
        )
        self._assert_addresses(parsed_mail.from_, [("", "bob@example.com")])

    def test_quoted_email_as_display_name(self):
        """A properly quoted e-mail-as-name (RFC-compliant) goes through the strict parser."""
        parsed_mail = mailparser.parse_from_string(
            'From: "alice@example.com" <bob@example.com>\nSubject: x\n\nBody'
        )
        self._assert_addresses(
            parsed_mail.from_, [("alice@example.com", "bob@example.com")]
        )

    def test_standard_display_name_unchanged(self):
        """The ordinary ``Name <email>`` form keeps working (no regression)."""
        parsed_mail = mailparser.parse_from_string(
            "From: Alice Smith <alice@example.com>\nSubject: x\n\nBody"
        )
        self._assert_addresses(
            parsed_mail.from_, [("Alice Smith", "alice@example.com")]
        )

    def test_bare_address_no_display_name(self):
        """A bare address yields an empty display name (no regression)."""
        parsed_mail = mailparser.parse_from_string(
            "From: alice@example.com\nSubject: x\n\nBody"
        )
        self._assert_addresses(parsed_mail.from_, [("", "alice@example.com")])

    def test_empty_header_returns_empty_list(self):
        """An absent address header is reported as []."""
        parsed_mail = mailparser.parse_from_string("Subject: x\n\nBody")
        # getaddresses("") produces [('', '')]; entries without an address
        # are filtered out, so an absent header contributes nothing to the
        # parsed mail object.
        self.assertEqual(parsed_mail.from_, [])

    # ------------------------------------------------------------------
    # Direct unit tests of the get_addresses() helper
    # ------------------------------------------------------------------

    def test_get_addresses_email_as_name(self):
        """The fallback recovers the address when the display name is an e-mail."""
        self.assertEqual(
            get_addresses("alice@example.com <bob@example.com>"),
            [("alice@example.com", "bob@example.com")],
        )

    def test_get_addresses_standard_format(self):
        """The strict path handles a normal ``Name <email>`` value."""
        self.assertEqual(
            get_addresses("Alice Smith <alice@example.com>"),
            [("Alice Smith", "alice@example.com")],
        )

    def test_get_addresses_bare_email(self):
        """A bare e-mail address comes back with an empty display name."""
        self.assertEqual(
            get_addresses("alice@example.com"),
            [("", "alice@example.com")],
        )

    def test_get_addresses_empty_header(self):
        """An empty string returns [('', '')], the raw standard-library result.

        The ('', '') entry is dropped later in __getattr__ (core.py), so
        absent headers do not show up in the parsed mail output.
        """
        self.assertEqual(get_addresses(""), [("", "")])

    def test_get_addresses_multiple_with_email_as_name(self):
        """The fallback copes with several addresses that all fail strict parsing."""
        recovered = get_addresses(
            "alice@example.com <bob@example.com>, eve@example.com <frank@example.com>"
        )
        wanted = [
            ("alice@example.com", "bob@example.com"),
            ("eve@example.com", "frank@example.com"),
        ]
        self.assertEqual(len(recovered), 2)
        for position, pair in enumerate(wanted):
            self.assertEqual(recovered[position], pair)

0 commit comments

Comments
 (0)