SpamScope · fedelemantuano · Mar 29, 2026 · Mar 16, 2026 · Mar 29, 2026 · Mar 29, 2026
diff --git a/.markdownlint.json b/.markdownlint.json
@@ -0,0 +1,15 @@
+{
+  "default": true,
+  "MD013": {
+    "line_length": 120,
+    "code_blocks": false,
+    "tables": false
+  },
+  "MD024": {
+    "siblings_only": true
+  },
+  "MD033": {
+    "allowed_elements": ["br", "img", "a", "details", "summary"]
+  },
+  "MD041": false
+}
diff --git a/.markdownlint.yaml b/.markdownlint.yaml
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -28,9 +28,29 @@ repos:
     # Run the formatter.
     - id: ruff-format
 
-- repo: https://github.com/igorshubovych/markdownlint-cli
-  rev: v0.45.0
+- repo: https://github.com/executablebooks/mdformat
+  rev: 0.7.17
   hooks:
-    - id: markdownlint
-      args: ['--fix']
-      exclude: '^\.github/instructions/'
+    - id: mdformat
+      exclude: '^\.github/'
+      additional_dependencies:
+        - mdformat-frontmatter
+        - mdformat-gfm
+        - mdformat-tables
+
+- repo: https://github.com/DavidAnson/markdownlint-cli2
+  rev: v0.18.1
+  hooks:
+    - id: markdownlint-cli2
+      args: ['--config', '.markdownlint.json']
+      exclude: '^\.github/'
+
+- repo: local
+  hooks:
+    - id: pyright
+      name: pyright
+      entry: uv run npx pyright
+      args: [src/, tests/]
+      language: system
+      pass_filenames: false
+      types: [python]
diff --git a/src/mailparser/const.py b/src/mailparser/const.py
@@ -18,66 +18,66 @@
 
 import re
 
-REGXIP = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")
-
-JUNK_PATTERN = r"[ \(\)\[\]\t\n]+"
-
-# Patterns for receiveds
-RECEIVED_PATTERNS = [
-    # FIXED: More restrictive 'from' clause
-    # Only matches 'from' at the beginning of the header (^) or after
-    # newline/whitespace to avoid matching within "for <email> from <email>"
-    # constructs which caused duplicate matches in IBM gateway headers
-    (
-        r"(?:(?:^|\n\s*)from\s+(?P<from>.+?)(?:\s*[(]?"
-        r"envelope-from|\s*[(]?envelope-sender|\s+"
-        r"by|\s+with(?! cipher)|\s+id|\s+via|;))"
-    ),
-    # IMPROVED: More precise 'by' clause
-    # Modified to not consume 'with' clause, allowing proper separation
-    # of 'by' (server name) and 'with' (protocol) fields
-    (
-        r"(?:(?:^|\s)by\s+(?P<by>[^\s]+(?:\s+[^\s]+)*?)"
-        r"(?:\s+with(?! cipher)|\s*[(]?envelope-from|\s*"
-        r"[(]?envelope-sender|\s+id|\s+for|\s+via|;))"
-    ),
-    # IMPROVED: 'with' clause with better boundary detection
-    (
-        r"(?:(?:^|\s)with(?! cipher)\s+(?P<with>.+?)"
-        r"(?:\s*[(]?envelope-from|\s*[(]?"
-        r"envelope-sender|\s+id|\s+for|\s+via|;))"
-    ),
-    # IMPROVED: 'id' clause with cleaner boundaries
-    (
-        r"(?:(?:^|\s)id\s+(?P<id>.+?)(?:\s*[(]?envelope-from|\s*"
-        r"[(]?envelope-sender|\s+for|\s+via|;))"
-    ),
-    # IMPROVED: 'for' clause - handles "for <email> from <email>" pattern
-    # Stops before 'from' keyword to prevent the 'from' pattern from
-    # matching the sender email in this construct
-    (
-        r"(?:(?:^|\s)for\s+(?P<for><[^>]+>|[^\s]+)"
-        r"(?:\s+from|\s*[(]?envelope-from|\s*[(]?"
-        r"envelope-sender|\s+via|;))"
-    ),
-    # IMPROVED: 'via' clause with better termination
-    (
-        r"(?:(?:^|\s)via\s+(?P<via>.+?)(?:\s*[(]?"
-        r"envelope-from|\s*[(]?envelope-sender|;))"
-    ),
-    # assumes emails are always inside <>
-    r"(?:envelope-from\s+<(?P<envelope_from>.+?)>)",
-    r"(?:envelope-sender\s+<(?P<envelope_sender>.+?)>)",
-    # datetime comes after ; at the end
-    r";\s*(?P<date>.*)",
-    # sendgrid datetime
-    (
-        r"(?P<date>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:"
-        r"\d{2}\.\d{9} \+0000 UTC) m=\+\d+\.\d+"
-    ),
-]
-
-RECEIVED_COMPILED_LIST = [re.compile(i, re.I | re.DOTALL) for i in RECEIVED_PATTERNS]
+# IPv4 pattern - octets limited to 0-255 (RFC 791); digit boundaries stop partial hits
+REGXIP = re.compile(
+    r"(?<!\d)(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}"
+    r"(?:25[0-5]|2[0-4]\d|[01]?\d\d?)(?!\d)"
+)
+
+# IPv6 pattern (RFC 5952 forms); alternatives with the longest tail are tried first
+REGXIP6 = re.compile(
+    r"(?:(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}"  # full form
+    r"|[0-9a-fA-F]{1,4}:(?::[0-9a-fA-F]{1,4}){1,6}"  # x:: with up to 6 after
+    r"|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}"
+    r"|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}"
+    r"|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}"
+    r"|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}"
+    r"|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}"  # :: with 1 group after
+    r"|:(?::[0-9a-fA-F]{1,4}){1,7}"  # ::x:x...
+    r"|(?:[0-9a-fA-F]{1,4}:){1,7}:"  # trailing :: (kept after the tailed forms)
+    r"|::)"  # just ::
+)
+
+# Normalize whitespace: collapse tabs and newlines to single space.
+# Parenthesized comments and bracketed IPs are preserved.
+JUNK_PATTERN = r"[\t\n]+"
+
+# ------------------------------------------------------------------ #
+# Received header parsing — RFC 5321 §4.4 grammar:
+#
+#   Received     = "Received:" *( received-token / comment ) ";" date-time
+#   received-token = "from" domain / "by" domain / "via" atom
+#                  / "with" atom  / "id"  atom   / "for" addr-spec
+#
+# Strategy: tokenize on clause keywords, then extract values per clause.
+# This eliminates the duplicated boundary lookaheads of the old
+# per-clause pattern list and matches the RFC grammar directly.
+# ------------------------------------------------------------------ #
+
+# Pattern that locates the clause keywords of a received header.
+# It matches only a keyword (at string start or after whitespace) and the
+# whitespace after it; the value up to the next keyword or semicolon is not matched.
+# The keywords are: from, by, via, with (not "with cipher"), id, for,
+# plus the non-standard envelope-from and envelope-sender.
+_CLAUSE_SPLITTER = re.compile(
+    r"(?:^|\s+)"
+    r"(from|by|via|with(?!\s+cipher)|id|for|envelope-from|envelope-sender)"
+    r"\s+",
+    re.I,
+)
+
+# Captures the text inside angle brackets, e.g. addr in: envelope-from <addr>
+_ENVELOPE_FROM_RE = re.compile(r"<([^>]+)>")
+
+# Date after semicolon (standard RFC 5321); DOTALL lets the capture span newlines
+_DATE_RE = re.compile(r";\s*(.*)", re.DOTALL)
+
+# SendGrid non-standard date format (no semicolon); group 1 excludes the m=+N.N tail
+_SENDGRID_DATE_RE = re.compile(
+    r"(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}\.\d{9}\s+\+0000\s+UTC)"
+    r"\s+m=\+\d+\.\d+",
+    re.I,
+)
 
 EPILOGUE_DEFECTS = {"StartBoundaryNotFoundDefect"}
 

diff --git a/src/mailparser/core.py b/src/mailparser/core.py
@@ -18,12 +18,13 @@
 
 import base64
 import email
+import email.utils
 import ipaddress
 import json
 import logging
 import os
 
-from mailparser.const import ADDRESSES_HEADERS, EPILOGUE_DEFECTS, REGXIP
+from mailparser.const import ADDRESSES_HEADERS, EPILOGUE_DEFECTS, REGXIP, REGXIP6
 from mailparser.utils import (
     convert_mail_date,
     decode_header_part,
@@ -122,12 +123,13 @@ def __init__(self, message=None):
         Init a new object from a message object structure.
         """
         self._message = message
-        log.debug("All headers of emails: {}".format(", ".join(message.keys())))
+        if message is not None:
+            log.debug("All headers of emails: {}".format(", ".join(message.keys())))
         self.parse()
 
-    def __str__(self):
+    def __str__(self) -> str:
         if self.message:
-            return self.subject
+            return str(self.subject)
         else:
             return str()
 
@@ -326,13 +328,12 @@ def parse(self):
                 "{}".format("--" + self.message.get_boundary() + "--"),
             )
 
-            try:
-                p = email.message_from_string(epilogue)
-                parts.append(p)
-            except TypeError:
-                log.debug("Failed to get epilogue part for TypeError")
-            except Exception:
-                log.error("Failed to get epilogue part. Check raw mail.")
+            if epilogue is not None:
+                try:
+                    p = email.message_from_string(epilogue)
+                    parts.append(p)
+                except Exception:
+                    log.error("Failed to get epilogue part. Check raw mail.")
 
         # walk all mail parts
         for i, p in enumerate(parts):
@@ -497,6 +498,9 @@ def get_server_ipaddress(self, trust):
         if not trust.strip():
             return
 
+        if not self.message:
+            return
+
         received = self.message.get_all("received", [])
 
         for i in received:
@@ -510,14 +514,22 @@ def get_server_ipaddress(self, trust):
     def _extract_ip(self, received_header):
         """
         Extract the IP address from the received header if it is not private.
+        Supports both IPv4 (RFC 791) and IPv6 (RFC 5952) addresses.
 
         Args:
             received_header (string): The received header string
 
         Returns:
             string with the ip address or None
         """
-        check = REGXIP.findall(received_header[0 : received_header.find("by")])
+        by_idx = received_header.find("by")
+        from_part = received_header[:by_idx] if by_idx != -1 else received_header
+
+        # Try IPv4 first, then IPv6
+        check = REGXIP.findall(from_part)
+        if not check:
+            check = REGXIP6.findall(from_part)
+
         if check:
             try:
                 ip_str = str(check[-1])
@@ -551,12 +563,12 @@ def __getattr__(self, name):
         # raw headers
         elif name.endswith("_raw"):
             name = name[:-4]
-            raw = self.message.get_all(name)
+            raw = self.message.get_all(name) if self.message else None
             return json.dumps(raw, ensure_ascii=False)
 
         # object headers
         elif name_header in ADDRESSES_HEADERS:
-            raw_header = self.message.get(name_header, "")
+            raw_header = self.message.get(name_header, "") if self.message else ""
             # parse before decoding
             parsed_addresses = email.utils.getaddresses([raw_header], strict=True)
 
@@ -605,7 +617,7 @@ def received_raw(self):
         Return a list of all received headers in raw format
         """
         output = []
-        for i in self.message.get_all("received", []):
+        for i in self.message.get_all("received", []) if self.message else []:
             output.append(decode_header_part(i))
         return output
 
@@ -624,7 +636,7 @@ def headers(self) -> dict:
         """
         Return only the headers as Python object
         """
-        all_headers = set(self.message.keys()) - set(["headers"])
+        all_headers = set(self.message.keys() if self.message else []) - {"headers"}
         return {i: getattr(self, i) for i in all_headers}
 
     @property
@@ -660,7 +672,7 @@ def date(self):
         """
         Return the mail date in datetime.datetime format and UTC.
         """
-        date = self.message.get("date")
+        date = self.message.get("date") if self.message else None
         conv = None
 
         try:
@@ -674,7 +686,7 @@ def timezone(self):
         """
         Return timezone. Offset from UTC.
         """
-        date = self.message.get("date")
+        date = self.message.get("date") if self.message else None
         timezone = 0
 
         try:
@@ -703,7 +715,7 @@ def mail_json(self):
         """
         Return the JSON of mail parsed
         """
-        if self.mail.get("date"):
+        if self.mail.get("date") and self.date:
             self._mail["date"] = self.date.isoformat()
         return json.dumps(self.mail, ensure_ascii=False, indent=2)
 
@@ -720,7 +732,7 @@ def mail_partial_json(self):
         """
         Return the JSON of mail parsed partial
         """
-        if self.mail_partial.get("date"):
+        if self.mail_partial.get("date") and self.date:
             self._mail_partial["date"] = self.date.isoformat()
         return json.dumps(self.mail_partial, ensure_ascii=False, indent=2)
 
@@ -758,7 +770,7 @@ def message_as_string(self):
         """
         Return the entire message flattened as a string.
         """
-        return self.message.as_string()
+        return self.message.as_string() if self.message else ""
 
     @property
     def to_domains(self):