Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions .markdownlint.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"default": true,
"MD013": {
"line_length": 120,
"code_blocks": false,
"tables": false
},
"MD024": {
"siblings_only": true
},
"MD033": {
"allowed_elements": ["br", "img", "a", "details", "summary"]
},
"MD041": false
}
20 changes: 0 additions & 20 deletions .markdownlint.yaml

This file was deleted.

30 changes: 25 additions & 5 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,29 @@ repos:
# Run the formatter.
- id: ruff-format

- repo: https://github.com/igorshubovych/markdownlint-cli
rev: v0.45.0
- repo: https://github.com/executablebooks/mdformat
rev: 0.7.17
hooks:
- id: markdownlint
args: ['--fix']
exclude: '^\.github/instructions/'
- id: mdformat
exclude: '^\.github/'
additional_dependencies:
- mdformat-frontmatter
- mdformat-gfm
- mdformat-tables

- repo: https://github.com/DavidAnson/markdownlint-cli2
rev: v0.18.1
hooks:
- id: markdownlint-cli2
args: ['--config', '.markdownlint.json']
exclude: '^\.github/'

- repo: local
hooks:
- id: pyright
name: pyright
entry: uv run npx pyright
args: [src/, tests/]
language: system
pass_filenames: false
types: [python]
120 changes: 60 additions & 60 deletions src/mailparser/const.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,66 +18,66 @@

import re

REGXIP = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")

JUNK_PATTERN = r"[ \(\)\[\]\t\n]+"

# Patterns for receiveds
RECEIVED_PATTERNS = [
# FIXED: More restrictive 'from' clause
# Only matches 'from' at the beginning of the header (^) or after
# newline/whitespace to avoid matching within "for <email> from <email>"
# constructs which caused duplicate matches in IBM gateway headers
(
r"(?:(?:^|\n\s*)from\s+(?P<from>.+?)(?:\s*[(]?"
r"envelope-from|\s*[(]?envelope-sender|\s+"
r"by|\s+with(?! cipher)|\s+id|\s+via|;))"
),
# IMPROVED: More precise 'by' clause
# Modified to not consume 'with' clause, allowing proper separation
# of 'by' (server name) and 'with' (protocol) fields
(
r"(?:(?:^|\s)by\s+(?P<by>[^\s]+(?:\s+[^\s]+)*?)"
r"(?:\s+with(?! cipher)|\s*[(]?envelope-from|\s*"
r"[(]?envelope-sender|\s+id|\s+for|\s+via|;))"
),
# IMPROVED: 'with' clause with better boundary detection
(
r"(?:(?:^|\s)with(?! cipher)\s+(?P<with>.+?)"
r"(?:\s*[(]?envelope-from|\s*[(]?"
r"envelope-sender|\s+id|\s+for|\s+via|;))"
),
# IMPROVED: 'id' clause with cleaner boundaries
(
r"(?:(?:^|\s)id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*"
r"[(]?envelope-sender|\s+for|\s+via|;))"
),
# IMPROVED: 'for' clause - handles "for <email> from <email>" pattern
# Stops before 'from' keyword to prevent the 'from' pattern from
# matching the sender email in this construct
(
r"(?:(?:^|\s)for\s+(?P<for><[^>]+>|[^\s]+)"
r"(?:\s+from|\s*[(]?envelope-from|\s*[(]?"
r"envelope-sender|\s+via|;))"
),
# IMPROVED: 'via' clause with better termination
(
r"(?:(?:^|\s)via\s+(?P<via>.+?)(?:\s*[(]?"
r"envelope-from|\s*[(]?envelope-sender|;))"
),
# assumes emails are always inside <>
r"(?:envelope-from\s+<(?P<envelope_from>.+?)>)",
r"(?:envelope-sender\s+<(?P<envelope_sender>.+?)>)",
# datetime comes after ; at the end
r";\s*(?P<date>.*)",
# sendgrid datetime
(
r"(?P<date>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:"
r"\d{2}\.\d{9} \+0000 UTC) m=\+\d+\.\d+"
),
]

RECEIVED_COMPILED_LIST = [re.compile(i, re.I | re.DOTALL) for i in RECEIVED_PATTERNS]
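# IPv4 (RFC 791) pattern - limits each octet to the range 0-255.
# The pattern is unanchored, so it can still match inside a longer digit run
# (e.g. it finds "56.1.1.1" within "256.1.1.1").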
REGXIP = re.compile(
r"(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}"
r"(?:25[0-5]|2[0-4]\d|[01]?\d\d?)"
)

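# IPv6 pattern - matches the full and "::"-compressed text forms defined in
# RFC 4291 §2.2 (RFC 5952 only recommends a canonical spelling of those forms).
# Forms with an embedded dotted-quad IPv4 tail are not matched in full.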
REGXIP6 = re.compile(
r"(?:(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}" # full form
r"|(?:[0-9a-fA-F]{1,4}:){1,7}:" # trailing ::
r"|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}" # :: with 1 group after
r"|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}"
r"|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}"
r"|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}"
r"|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}"
r"|[0-9a-fA-F]{1,4}:(?::[0-9a-fA-F]{1,4}){1,6}"
r"|:(?::[0-9a-fA-F]{1,4}){1,7}" # ::x:x...
r"|::)" # just ::
)

# Normalize whitespace: collapse tabs and newlines to single space.
# Parenthesized comments and bracketed IPs are preserved.
JUNK_PATTERN = r"[\t\n]+"

# ------------------------------------------------------------------ #
# Received header parsing — simplified form of the RFC 5321 §4.4 grammar:
#
# Received = "Received:" *( received-token / comment ) ";" date-time
# received-token = "from" domain / "by" domain / "via" atom
# / "with" atom / "id" atom / "for" addr-spec
#
# Strategy: tokenize on clause keywords, then extract values per clause.
# This eliminates the duplicated boundary lookaheads of the old
# per-clause pattern list and matches the RFC grammar directly.
# ------------------------------------------------------------------ #

# Pattern that locates the clause keywords of a received header.
# It matches a keyword that is preceded by the start of the string or by
# whitespace and is followed by whitespace. The clause value is not part of
# the match: it is expected to be the text between one keyword match and the
# next keyword or the semicolon.
# The keywords are: from, by, via, with (not "with cipher"), id, for,
# plus the non-standard envelope-from and envelope-sender.
_CLAUSE_SPLITTER = re.compile(
r"(?:^|\s+)"
r"(from|by|via|with(?!\s+cipher)|id|for|envelope-from|envelope-sender)"
r"\s+",
re.I,
)

# Captures the text between angle brackets, e.g. the <addr> of an
# "envelope-from <addr>" clause. The pattern does not match the keyword itself.
_ENVELOPE_FROM_RE = re.compile(r"<([^>]+)>")

# Date after semicolon (standard RFC 5321)
_DATE_RE = re.compile(r";\s*(.*)", re.DOTALL)

# SendGrid non-standard date format (no semicolon)
_SENDGRID_DATE_RE = re.compile(
r"(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}\.\d{9}\s+\+0000\s+UTC)"
r"\s+m=\+\d+\.\d+",
re.I,
)

EPILOGUE_DEFECTS = {"StartBoundaryNotFoundDefect"}

Expand Down
54 changes: 33 additions & 21 deletions src/mailparser/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,13 @@

import base64
import email
import email.utils
import ipaddress
import json
import logging
import os

from mailparser.const import ADDRESSES_HEADERS, EPILOGUE_DEFECTS, REGXIP
from mailparser.const import ADDRESSES_HEADERS, EPILOGUE_DEFECTS, REGXIP, REGXIP6
from mailparser.utils import (
convert_mail_date,
decode_header_part,
Expand Down Expand Up @@ -122,12 +123,13 @@ def __init__(self, message=None):
Init a new object from a message object structure.
"""
self._message = message
log.debug("All headers of emails: {}".format(", ".join(message.keys())))
if message is not None:
log.debug("All headers of emails: {}".format(", ".join(message.keys())))
self.parse()

def __str__(self):
def __str__(self) -> str:
if self.message:
return self.subject
return str(self.subject)
else:
return str()

Expand Down Expand Up @@ -326,13 +328,12 @@ def parse(self):
"{}".format("--" + self.message.get_boundary() + "--"),
)

try:
p = email.message_from_string(epilogue)
parts.append(p)
except TypeError:
log.debug("Failed to get epilogue part for TypeError")
except Exception:
log.error("Failed to get epilogue part. Check raw mail.")
if epilogue is not None:
try:
p = email.message_from_string(epilogue)
parts.append(p)
except Exception:
log.error("Failed to get epilogue part. Check raw mail.")

# walk all mail parts
for i, p in enumerate(parts):
Expand Down Expand Up @@ -497,6 +498,9 @@ def get_server_ipaddress(self, trust):
if not trust.strip():
return

if not self.message:
return

received = self.message.get_all("received", [])

for i in received:
Expand All @@ -510,14 +514,22 @@ def get_server_ipaddress(self, trust):
def _extract_ip(self, received_header):
"""
Extract the IP address from the received header if it is not private.
Supports both IPv4 (RFC 791) and IPv6 (RFC 5952) addresses.

Args:
received_header (string): The received header string

Returns:
string with the ip address or None
"""
check = REGXIP.findall(received_header[0 : received_header.find("by")])
by_idx = received_header.find("by")
from_part = received_header[:by_idx] if by_idx != -1 else received_header

# Try IPv4 first, then IPv6
check = REGXIP.findall(from_part)
if not check:
check = REGXIP6.findall(from_part)

if check:
try:
ip_str = str(check[-1])
Expand Down Expand Up @@ -551,12 +563,12 @@ def __getattr__(self, name):
# raw headers
elif name.endswith("_raw"):
name = name[:-4]
raw = self.message.get_all(name)
raw = self.message.get_all(name) if self.message else None
return json.dumps(raw, ensure_ascii=False)

# object headers
elif name_header in ADDRESSES_HEADERS:
raw_header = self.message.get(name_header, "")
raw_header = self.message.get(name_header, "") if self.message else ""
# parse before decoding
parsed_addresses = email.utils.getaddresses([raw_header], strict=True)

Expand Down Expand Up @@ -605,7 +617,7 @@ def received_raw(self):
Return a list of all received headers in raw format
"""
output = []
for i in self.message.get_all("received", []):
for i in self.message.get_all("received", []) if self.message else []:
output.append(decode_header_part(i))
return output

Expand All @@ -624,7 +636,7 @@ def headers(self) -> dict:
"""
Return only the headers as Python object
"""
all_headers = set(self.message.keys()) - set(["headers"])
all_headers = set(self.message.keys() if self.message else []) - {"headers"}
return {i: getattr(self, i) for i in all_headers}

@property
Expand Down Expand Up @@ -660,7 +672,7 @@ def date(self):
"""
Return the mail date in datetime.datetime format and UTC.
"""
date = self.message.get("date")
date = self.message.get("date") if self.message else None
conv = None

try:
Expand All @@ -674,7 +686,7 @@ def timezone(self):
"""
Return timezone. Offset from UTC.
"""
date = self.message.get("date")
date = self.message.get("date") if self.message else None
timezone = 0

try:
Expand Down Expand Up @@ -703,7 +715,7 @@ def mail_json(self):
"""
Return the JSON of mail parsed
"""
if self.mail.get("date"):
if self.mail.get("date") and self.date:
self._mail["date"] = self.date.isoformat()
return json.dumps(self.mail, ensure_ascii=False, indent=2)

Expand All @@ -720,7 +732,7 @@ def mail_partial_json(self):
"""
Return the JSON of mail parsed partial
"""
if self.mail_partial.get("date"):
if self.mail_partial.get("date") and self.date:
self._mail_partial["date"] = self.date.isoformat()
return json.dumps(self.mail_partial, ensure_ascii=False, indent=2)

Expand Down Expand Up @@ -758,7 +770,7 @@ def message_as_string(self):
"""
Return the entire message flattened as a string.
"""
return self.message.as_string()
return self.message.as_string() if self.message else ""

@property
def to_domains(self):
Expand Down
Loading
Loading