Skip to content
55 changes: 55 additions & 0 deletions Lib/test/test_tomllib/test_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,11 @@
from pathlib import Path
import sys
import tempfile
import textwrap
import unittest
from test import support
from test.support import os_helper
from test.support.script_helper import assert_python_ok

from . import tomllib

Expand Down Expand Up @@ -124,3 +127,55 @@ def test_types_import(self):
never imported by tests.
"""
importlib.import_module(f"{tomllib.__name__}._types")

def test_try_simple_decimal(self):
    try_simple_decimal = tomllib._parser.try_simple_decimal

    # Inputs the fast path must accept. Each entry is
    # (source text, start position, (end position, parsed value)).
    accepted = [
        ("123", 0, (3, 123)),
        ("123\n", 0, (3, 123)),
        ("123 456", 0, (3, 123)),
        ("+123\n", 0, (4, 123)),
        ("-123\n", 0, (4, -123)),
        ("0\n", 0, (1, 0)),
        ("+0\n", 0, (2, 0)),
        ("-0\n", 0, (2, 0)),
        ("[23]\n", 1, (3, 23)),
        ("[23, 24]\n", 1, (3, 23)),
        ("{x = 42}\n", 5, (7, 42)),
    ]
    for src, pos, expected in accepted:
        with self.subTest(src=src, pos=pos):
            self.assertEqual(try_simple_decimal(src, pos), expected)

    # Inputs the fast path must decline by returning None: a sign
    # without digits, inf/nan, leading zeros, dates and times, floats,
    # underscore separators and prefixed (hex/octal/binary) integers.
    rejected = [
        "+",
        "-",
        "+\n",
        "-\n",
        "+inf\n",
        "-nan\n",
        "0123\n",
        "1979-05-27\n",
        "12:32:00\n",
        "1.0\n",
        "1_000\n",
        "0x123\n",
        "0o123\n",
        "0b100\n",
    ]
    for src in rejected:
        with self.subTest(src=src):
            self.assertIsNone(try_simple_decimal(src, 0))

def test_lazy_import(self):
    # Test that try_simple_decimal() can parse the TOML file without
    # importing regular expressions (tomllib._re)
    filename = os_helper.TESTFN
    self.addCleanup(os_helper.unlink, filename)
    toml = textwrap.dedent("""
        [metadata]
        int = 123
        list = [+1, -2, 3]
        table = {x=1, y=2}
    """)
    # TOML documents are UTF-8 encoded: write the fixture with an
    # explicit encoding rather than relying on the locale default.
    with open(filename, "w", encoding="utf-8") as fp:
        fp.write(toml)

    # The check runs through assert_python_ok() so that sys.modules is
    # inspected in the interpreter that executed this script, not in the
    # test process itself.
    code = textwrap.dedent(f"""
        import sys, tomllib
        with open({filename!a}, "rb") as fp:
            tomllib.load(fp)
        print("lazy import?", 'tomllib._re' not in sys.modules)
    """)
    proc = assert_python_ok('-c', code)
    self.assertIn(b'lazy import? True', proc.out)
56 changes: 54 additions & 2 deletions Lib/tomllib/_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@

from __future__ import annotations

from types import MappingProxyType
# Defer loading regular expressions until we actually need them in
# parse_value(). Before that, use try_simple_decimal() to parse simple
# decimal numbers.
__lazy_modules__ = ["tomllib._re"]

from ._re import (
Comment thread
vstinner marked this conversation as resolved.
RE_DATETIME,
Expand Down Expand Up @@ -42,7 +45,18 @@
KEY_INITIAL_CHARS: Final = BARE_KEY_CHARS | frozenset("\"'")
HEXDIGIT_CHARS: Final = frozenset("abcdef" "ABCDEF" "0123456789")

BASIC_STR_ESCAPE_REPLACEMENTS: Final = MappingProxyType(
# If one of these follows a "simple decimal" it could mean that
# the value is actually something else (float, datetime...), so
# optimized parsing should be abandoned.
ILLEGAL_AFTER_SIMPLE_DECIMAL: Final = frozenset(
"eE." # decimal
"xbo" # hex, bin, oct
"-" # datetime
":" # localtime
"_0123456789" # complex decimal
)

BASIC_STR_ESCAPE_REPLACEMENTS: Final = frozendict( # type: ignore[name-defined]
{
"\\b": "\u0008", # backspace
"\\t": "\u0009", # tab
Expand Down Expand Up @@ -665,6 +679,37 @@ def parse_basic_str(src: str, pos: Pos, *, multiline: bool) -> tuple[Pos, str]:
pos += 1


def try_simple_decimal(
    src: str, pos: Pos
) -> None | tuple[Pos, int]:
    """Fast path for plain decimal integers.

    Attempts to read an optionally signed decimal integer that contains
    no underscores, starting at `pos` in `src`. On success the result is
    `(end_pos, value)`, where `end_pos` is the index just past the last
    digit. `None` is returned whenever it is not certain that the text
    is such an integer.
    """
    begin = pos

    # Step over an optional sign.
    if src.startswith(("+", "-"), pos):
        pos += 1

    if src.startswith("0", pos):
        # A zero is taken on its own; any digit after it is rejected by
        # the look-ahead check below.
        end = pos + 1
    elif src.startswith(tuple("123456789"), pos):
        end = skip_chars(src, pos, "0123456789")
    else:
        # No digit at all (for example a lone sign, "inf" or "nan").
        return None

    # Peek at the character right after the digits; None stands for
    # "end of input".
    following = src[end] if end < len(src) else None
    if following in ILLEGAL_AFTER_SIMPLE_DECIMAL:
        return None

    return end, int(src[begin:end])


def parse_value(
src: str, pos: Pos, parse_float: ParseFloat
) -> tuple[Pos, Any]:
Expand Down Expand Up @@ -703,6 +748,13 @@ def parse_value(
if char == "{":
return parse_inline_table(src, pos, parse_float)

# Try a simple parser for decimal numbers. If it's able to parse all
# numbers, it avoids importing tomllib._re which has an impact on
# the tomllib startup time.
number = try_simple_decimal(src, pos)
if number is not None:
return number

# Dates and times
datetime_match = RE_DATETIME.match(src, pos)
if datetime_match:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Improve :mod:`tomllib` import time (up to 10x faster). Patch by Victor
Stinner.
Loading