Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions mypy/typeshed/stubs/librt/librt/strings.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@ def read_f64_le(b: bytes, index: i64, /) -> float: ...
def read_f64_be(b: bytes, index: i64, /) -> float: ...

# Codepoint classification helpers operating on i32 codepoints (typically
# obtained via ord(s[i])). Out-of-range inputs (negative, or past the maximum
# Unicode code point 0x10FFFF) return False.
def isspace(c: i32, /) -> bool: ...
def isdigit(c: i32, /) -> bool: ...
def isalnum(c: i32, /) -> bool: ...
Expand All @@ -53,7 +54,8 @@ def isidentifier(c: i32, /) -> bool: ...
# uppercase / lowercase expands to multiple codepoints (e.g. U+00DF
# uppercases to "SS", U+FB01 to "FI"), returns the input unchanged so
# the signature stays i32 -> i32. Use str.upper() / str.lower() for full
# Unicode case conversion when those cases matter. Out-of-range inputs
# (negative, or past the maximum Unicode code point 0x10FFFF) are returned
# unchanged.
def toupper(c: i32, /) -> i32: ...
def tolower(c: i32, /) -> i32: ...
7 changes: 6 additions & 1 deletion mypyc/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,12 @@ class ModDesc(NamedTuple):

LIBRT_MODULES = [
ModDesc("librt.internal", ["internal/librt_internal.c"], [], ["internal"]),
ModDesc("librt.strings", ["strings/librt_strings.c"], [], ["strings"]),
ModDesc(
"librt.strings",
["strings/librt_strings.c"],
["strings/librt_strings.h", "strings/librt_strings_common.h"],
["strings"],
),
ModDesc(
"librt.base64",
[
Expand Down
31 changes: 21 additions & 10 deletions mypyc/lib-rt/strings/librt_strings.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,9 @@ typedef struct {
} StringWriterObject;

// Codepoint classification helpers. Inputs are signed i32 for compatibility
// with mypyc's int32_rprimitive; out-of-range values (negative, or past the
// maximum Unicode code point 0x10FFFF) are non-codepoints and return false.
// Defined `static inline` so they compile statically into
// both the librt.strings module and any mypyc-compiled extension that
// includes this header, avoiding the capsule indirection that would dwarf
// the work of a single Py_UNICODE_IS* macro call.
Expand All @@ -58,12 +59,14 @@ static inline bool LibRTStrings_IsAlpha(int32_t c) {
// PyUnicode_IsIdentifier on a 1-character string. Aborts via
// CPyError_OutOfMemory on allocation failure to keep this ERR_NEVER.
static inline bool LibRTStrings_IsIdentifier(int32_t c) {
if (c < 0) return false;
if (c < 128) {
// Unsigned compare: negatives wrap to large values and skip the fast path.
if ((uint32_t)c < 128) {
return (c >= 'a' && c <= 'z')
|| (c >= 'A' && c <= 'Z')
|| c == '_';
}
// Reject negatives and code points past the Unicode maximum.
if ((uint32_t)c > 0x10FFFF) return false;
PyObject *s = PyUnicode_FromOrdinal((int)c);
if (s == NULL) {
CPyError_OutOfMemory();
Expand Down Expand Up @@ -101,19 +104,27 @@ static inline int32_t LibRTStrings_ChangeCase_slow(int32_t c, const char *method
// non-ASCII delegates to str.upper on a 1-character string. Returns the
// input unchanged when uppercasing expands to multiple codepoints.
static inline int32_t LibRTStrings_ToUpper(int32_t c) {
    // Reinterpret the input as unsigned once: negative values wrap to
    // values above INT32_MAX, so a single compare per range check also
    // rejects them and no separate `c < 0` guard is needed.
    const uint32_t cp = (uint32_t)c;
    if (cp < 128) {
        // ASCII fast path: only 'a'..'z' change (subtract 32); every other
        // ASCII value is returned as-is.
        return (c >= 'a' && c <= 'z') ? c - 32 : c;
    }
    // Negatives and values past the Unicode maximum (0x10FFFF) are not
    // code points; return them unchanged without entering the slow path.
    if (cp > 0x10FFFF) {
        return c;
    }
    // Valid non-ASCII code point: defer to the shared slow-path helper.
    return LibRTStrings_ChangeCase_slow(c, "upper");
}

// Lowercase a codepoint. ASCII fast path is `A..Z -> a..z` (add 32);
// non-ASCII delegates to str.lower on a 1-character string. Returns the
// input unchanged when lowercasing expands to multiple codepoints.
static inline int32_t LibRTStrings_ToLower(int32_t c) {
    // Reinterpret the input as unsigned once: negative values wrap to
    // values above INT32_MAX, so a single compare per range check also
    // rejects them and no separate `c < 0` guard is needed.
    const uint32_t cp = (uint32_t)c;
    if (cp < 128) {
        // ASCII fast path: only 'A'..'Z' change (add 32); every other
        // ASCII value is returned as-is.
        return (c >= 'A' && c <= 'Z') ? c + 32 : c;
    }
    // Negatives and values past the Unicode maximum (0x10FFFF) are not
    // code points; return them unchanged without entering the slow path.
    if (cp > 0x10FFFF) {
        return c;
    }
    // Valid non-ASCII code point: defer to the shared slow-path helper.
    return LibRTStrings_ChangeCase_slow(c, "lower");
}

Expand Down
18 changes: 14 additions & 4 deletions mypyc/test-data/run-librt-strings.test
Original file line number Diff line number Diff line change
Expand Up @@ -1449,8 +1449,9 @@ from testutil import assertRaises


def test_codepoint_classifiers() -> None:
# Negative values are not codepoints.
for bad in (i32(-1), i32(-113)):
# Out-of-range values are not codepoints: negative, just past the maximum
# valid code point (0x10FFFF), and the largest i32.
for bad in (i32(-1), i32(-113), i32(0x110000), i32(0x7FFFFFFF)):
assert not isspace(bad)
assert not isdigit(bad)
assert not isalnum(bad)
Expand Down Expand Up @@ -1485,6 +1486,10 @@ def test_codepoint_classifiers_via_any() -> None:
assert f(ord(false_input)) is False
# Negative values are valid i32, just not codepoints.
assert f(-1) is False
# Values within i32 range but past the maximum code point (0x10FFFF)
# are not codepoints either.
assert f(0x110000) is False
assert f(0x7FFFFFFF) is False
# Inputs outside i32 range raise OverflowError through the wrapper.
with assertRaises(OverflowError, "codepoint out of i32 range"):
f(1 << 40)
Expand All @@ -1509,8 +1514,9 @@ def _expect(c: str, method: str) -> int:


def test_codepoint_case_conversion() -> None:
# Negative inputs return unchanged.
for bad in (i32(-1), i32(-113)):
# Out-of-range inputs return unchanged: negative, just past the maximum
# valid code point (0x10FFFF), and the largest i32.
for bad in (i32(-1), i32(-113), i32(0x110000), i32(0x7FFFFFFF)):
assert toupper(bad) == bad
assert tolower(bad) == bad
# Agree with str.upper / str.lower across the full Unicode range
Expand All @@ -1534,6 +1540,10 @@ def test_codepoint_case_conversion_via_any() -> None:
assert f(in_cp) == out_cp
# Negative values are valid i32, returned unchanged.
assert f(-1) == -1
# Values within i32 range but past the maximum code point (0x10FFFF)
# are returned unchanged.
assert f(0x110000) == 0x110000
assert f(0x7FFFFFFF) == 0x7FFFFFFF
# Inputs outside i32 range raise OverflowError through the wrapper.
with assertRaises(OverflowError, "codepoint out of i32 range"):
f(1 << 40)
Expand Down
Loading