diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/PythonFileDetector.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/PythonFileDetector.java
index cfff664162..c9755764a6 100644
--- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/PythonFileDetector.java
+++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/PythonFileDetector.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017, 2022, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2017, 2026, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* The Universal Permissive License (UPL), Version 1.0
@@ -103,11 +103,11 @@ private static Charset tryGetCharsetFromLine(String line, boolean hasBOM) {
if (hasBOM && !normalizedEncoding.equalsUncached(T_UTF_UNDERSCORE_8, TS_ENCODING)) {
throw new InvalidEncodingException(encoding + " with BOM");
}
- Charset charset = CharsetMapping.getCharsetNormalized(normalizedEncoding);
+ CharsetMapping.CharsetWrapper charset = CharsetMapping.getCharsetNormalized(normalizedEncoding);
if (charset == null) {
throw new InvalidEncodingException(encoding);
}
- return charset;
+ return charset.charset();
}
return null;
}
diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/BuiltinFunctions.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/BuiltinFunctions.java
index b6810489fd..edc79b1f4f 100644
--- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/BuiltinFunctions.java
+++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/BuiltinFunctions.java
@@ -1198,7 +1198,7 @@ TruffleString sourceAsString(VirtualFrame frame, Node inliningTarget, Object sou
private TruffleString doDecodeSource(Object source, TruffleString filename, byte[] bytes, int bytesLen) {
Charset charset = PythonFileDetector.findEncodingStrict(bytes, bytesLen);
TruffleString pythonEncoding = CharsetMapping.getPythonEncodingNameFromJavaName(charset.name());
- CodecsModuleBuiltins.TruffleDecoder decoder = new CodecsModuleBuiltins.TruffleDecoder(pythonEncoding, charset, bytes, bytesLen, CodingErrorAction.REPORT);
+ CodecsModuleBuiltins.TruffleDecoder decoder = new CodecsModuleBuiltins.TruffleDecoder(charset, bytes, bytesLen, CodingErrorAction.REPORT);
if (!decoder.decodingStep(true)) {
int pos = decoder.getInputPosition();
Object exception = CallNode.executeUncached(PythonBuiltinClassType.UnicodeDecodeError, pythonEncoding, source, pos, pos + decoder.getErrorLength(), decoder.getErrorReason());
diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/CodecsModuleBuiltins.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/CodecsModuleBuiltins.java
index f8e1888cf1..68fde3ee71 100644
--- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/CodecsModuleBuiltins.java
+++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/CodecsModuleBuiltins.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018, 2025, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2018, 2026, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* The Universal Permissive License (UPL), Version 1.0
@@ -72,6 +72,7 @@
import static com.oracle.graal.python.util.PythonUtils.tsLiteral;
import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
@@ -133,7 +134,9 @@
import com.oracle.graal.python.runtime.sequence.storage.ObjectSequenceStorage;
import com.oracle.graal.python.util.CharsetMapping;
import com.oracle.graal.python.util.CharsetMapping.NormalizeEncodingNameNode;
+import com.oracle.graal.python.util.PythonUtils;
import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary;
+import com.oracle.truffle.api.HostCompilerDirectives;
import com.oracle.truffle.api.dsl.Bind;
import com.oracle.truffle.api.dsl.Cached;
import com.oracle.truffle.api.dsl.Fallback;
@@ -182,6 +185,44 @@ protected List extends NodeFactory extends PythonBuiltinBaseNode>> getNodeFa
return CodecsModuleBuiltinsFactory.getFactories();
}
+ @GenerateUncached
+ @GenerateInline
+ public abstract static class CharsetLookupNode extends Node {
+ public abstract CharsetMapping.CharsetWrapper execute(Node inliningTarget, TruffleString name);
+
+ @SuppressWarnings("unused")
+ @Specialization(guards = "name == cachedName", limit = "1")
+ static CharsetMapping.CharsetWrapper doCachedIdentity(TruffleString name,
+ @Cached("name") TruffleString cachedName,
+ @Cached("lookup(name)") CharsetMapping.CharsetWrapper cachedResult) {
+ return cachedResult;
+ }
+
+ @SuppressWarnings("unused")
+ @Specialization(guards = "equals(name, cachedName, equalNode)", limit = "1", replaces = "doCachedIdentity")
+ static CharsetMapping.CharsetWrapper doCachedEqual(TruffleString name,
+ @Cached("name") TruffleString cachedName,
+ @Cached("lookup(name)") CharsetMapping.CharsetWrapper cachedResult,
+ @Cached TruffleString.EqualNode equalNode) {
+ return cachedResult;
+ }
+
+ @Specialization(replaces = "doCachedEqual")
+ static CharsetMapping.CharsetWrapper doDynamic(Node inliningTarget, TruffleString name,
+ @Cached NormalizeEncodingNameNode normalizeEncodingNameNode) {
+ return CharsetMapping.getCharsetNormalized(normalizeEncodingNameNode.execute(inliningTarget, name));
+ }
+
+ @SuppressWarnings("unused")
+ static CharsetMapping.CharsetWrapper lookup(TruffleString name) {
+ return CharsetMapping.getCharsetNormalized(CharsetMapping.normalizeUncached(name));
+ }
+
+ static boolean equals(TruffleString a, TruffleString b, TruffleString.EqualNode equalNode) {
+ return equalNode.execute(a, b, TS_ENCODING);
+ }
+ }
+
@GenerateUncached
@GenerateInline(false) // footprint reduction 48 -> 30
public abstract static class CodecsEncodeToJavaBytesNode extends Node {
@@ -191,6 +232,11 @@ public abstract static class CodecsEncodeToJavaBytesNode extends Node {
byte[] encode(VirtualFrame frame, Object self, TruffleString encoding, TruffleString errors,
@Bind Node inliningTarget,
@Cached CastToTruffleStringNode castTruffleStr,
+ @Cached TruffleString.IsValidNode isValidNode,
+ @Cached TruffleString.GetCodeRangeNode getCodeRangeNode,
+ @Cached InlinedConditionProfile fastPathProfile,
+ @Cached TruffleString.SwitchEncodingNode switchEncodingNode,
+ @Cached TruffleString.CopyToByteArrayNode copyToByteArrayNode,
@Cached TruffleString.ToJavaStringNode toJavaStringNode,
@Cached TruffleString.EqualNode equalNode,
@Cached ErrorHandlers.CallEncodingErrorHandlerNode errorHandler,
@@ -198,17 +244,67 @@ byte[] encode(VirtualFrame frame, Object self, TruffleString encoding, TruffleSt
@CachedLibrary(limit = "3") PythonBufferAccessLibrary bufferLib,
@Cached CastToJavaStringNode castToJavaStringNode,
@Cached PRaiseNode raiseNode,
- @Cached NormalizeEncodingNameNode normalizeEncodingNameNode) {
+ @Cached(inline = true) CharsetLookupNode charsetLookupNode) {
TruffleString input = castTruffleStr.castKnownString(inliningTarget, self);
- String inputStr = toJavaStringNode.execute(input);
- CodingErrorAction errorAction = convertCodingErrorAction(errors, equalNode);
- TruffleString normalizedEncoding = normalizeEncodingNameNode.execute(inliningTarget, encoding);
- Charset charset = CharsetMapping.getCharsetNormalized(normalizedEncoding);
- if (charset == null) {
+ CharsetMapping.CharsetWrapper charsetWrapper = charsetLookupNode.execute(inliningTarget, encoding);
+ if (charsetWrapper == null) {
throw raiseNode.raise(inliningTarget, LookupError, ErrorMessages.UNKNOWN_ENCODING, encoding);
}
+ TruffleString.Encoding targetTStringEncoding = charsetWrapper.tStringEncoding();
+ if (fastPathProfile.profile(inliningTarget, isValidNode.execute(input, TS_ENCODING) && targetTStringEncoding != null)) {
+ byte[] ret = fastPath(input, getCodeRangeNode, switchEncodingNode, copyToByteArrayNode, targetTStringEncoding, charsetWrapper);
+ if (ret != null) {
+ return ret;
+ }
+ }
+ return slowPath(frame, encoding, errors, inliningTarget, toJavaStringNode, equalNode, errorHandler, acquireLib, bufferLib, castToJavaStringNode, raiseNode, input, charsetWrapper);
+ }
+
+ private static byte[] fastPath(TruffleString input,
+ TruffleString.GetCodeRangeNode getCodeRangeNode,
+ TruffleString.SwitchEncodingNode switchEncodingNode,
+ TruffleString.CopyToByteArrayNode copyToByteArrayNode,
+ TruffleString.Encoding targetTStringEncoding,
+ CharsetMapping.CharsetWrapper charsetWrapper) {
+ if (targetTStringEncoding == TruffleString.Encoding.US_ASCII || targetTStringEncoding == TruffleString.Encoding.ISO_8859_1) {
+ TruffleString.CodeRange codeRange = getCodeRangeNode.execute(input, TS_ENCODING);
+ if (codeRange.isSupersetOf(targetTStringEncoding == TruffleString.Encoding.US_ASCII ? TruffleString.CodeRange.LATIN_1 : TruffleString.CodeRange.BMP)) {
+ // The string contains characters that cannot be represented in ASCII / LATIN-1;
+ // defer to the slow path.
+ return null;
+ }
+ }
+ TruffleString transcoded = switchEncodingNode.execute(input, targetTStringEncoding);
+ CharsetMapping.BOM bom = charsetWrapper.bom();
+ byte[] ret = new byte[transcoded.byteLength(targetTStringEncoding) + (bom == null ? 0 : bom.bytes.length)];
+ int startIndex;
+ if (bom == null) {
+ startIndex = 0;
+ } else {
+ System.arraycopy(bom.bytes, 0, ret, 0, bom.bytes.length);
+ startIndex = bom.bytes.length;
+ }
+ copyToByteArrayNode.execute(transcoded, 0, ret, startIndex, transcoded.byteLength(targetTStringEncoding), targetTStringEncoding);
+ return ret;
+ }
+
+ @HostCompilerDirectives.InliningCutoff
+ private byte[] slowPath(VirtualFrame frame, TruffleString encoding, TruffleString errors,
+ Node inliningTarget,
+ TruffleString.ToJavaStringNode toJavaStringNode,
+ TruffleString.EqualNode equalNode,
+ ErrorHandlers.CallEncodingErrorHandlerNode errorHandler,
+ PythonBufferAcquireLibrary acquireLib,
+ PythonBufferAccessLibrary bufferLib,
+ CastToJavaStringNode castToJavaStringNode,
+ PRaiseNode raiseNode,
+ TruffleString input,
+ CharsetMapping.CharsetWrapper charsetWrapper) {
+ String inputStr = toJavaStringNode.execute(input);
+ CodingErrorAction errorAction = convertCodingErrorAction(errors, equalNode);
TruffleEncoder encoder;
ErrorHandlers.ErrorHandlerCache errorHandlerCache = new ErrorHandlers.ErrorHandlerCache();
+ Charset charset = charsetWrapper.charset();
try {
encoder = new TruffleEncoder(charset, inputStr, errorAction);
while (!encoder.encodingStep()) {
@@ -290,8 +386,12 @@ static Object decode(VirtualFrame frame, Object input, TruffleString encoding, T
@Cached("createFor($node)") InteropCallData callData,
@CachedLibrary(limit = "3") PythonBufferAcquireLibrary acquireLib,
@CachedLibrary(limit = "3") PythonBufferAccessLibrary bufferLib,
+ @Cached TruffleString.FromByteArrayNode fromByteArrayNode,
+ @Cached TruffleString.IsValidNode isValidNode,
+ @Cached TruffleString.SwitchEncodingNode switchEncodingNode,
+ @Cached InlinedConditionProfile fastPathProfile,
@Cached TruffleString.EqualNode equalNode,
- @Cached NormalizeEncodingNameNode normalizeEncodingNameNode,
+ @Cached(inline = true) CharsetLookupNode charsetLookupNode,
@Cached ErrorHandlers.CallDecodingErrorHandlerNode callDecodingErrorHandlerNode,
@Cached TruffleString.ToJavaStringNode toJavaStringNode,
@Cached InlinedBranchProfile inputReplaced,
@@ -300,16 +400,56 @@ static Object decode(VirtualFrame frame, Object input, TruffleString encoding, T
try {
int len = bufferLib.getBufferLength(buffer);
byte[] bytes = bufferLib.getInternalOrCopiedByteArray(buffer);
- CodingErrorAction errorAction = convertCodingErrorAction(errors, equalNode);
- TruffleString normalizedEncoding = normalizeEncodingNameNode.execute(inliningTarget, encoding);
- Charset charset = CharsetMapping.getCharsetForDecodingNormalized(normalizedEncoding, bytes, len);
+ CharsetMapping.CharsetWrapper charset = charsetLookupNode.execute(inliningTarget, encoding);
if (charset == null) {
throw raiseNode.raise(inliningTarget, LookupError, ErrorMessages.UNKNOWN_ENCODING, encoding);
}
+ CharsetMapping.BOM bom = charset.bom();
+ int offset = 0;
+ if (ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN && bom != null) {
+ /*
+ * JDK's charsets for UTF-16 and UTF-32 default to big endian irrespective of
+ * the platform if there is no BOM. The UTF-16-LE and UTF-32-LE charsets reject
+ * a big-endian BOM. CPython defaults to platform endianness and accepts both
+ * BOMs. To get that behavior, we peek at the start of the input: if a BOM is
+ * present, we use the UTF-16/32 encoding and let it detect the byte order;
+ * otherwise we default to UTF-16/32-LE.
+ */
+ if (charset == CharsetMapping.UTF_16LE_BOM) {
+ if (len >= 2) {
+ short first = PythonUtils.ARRAY_ACCESSOR.getShort(bytes, 0);
+ if (first == (short) 0xFFFE) {
+ charset = CharsetMapping.UTF_16BE_BOM;
+ offset = 2;
+ } else if (first == (short) 0xFEFF) {
+ offset = 2;
+ }
+ }
+ } else {
+ assert charset == CharsetMapping.UTF_32LE_BOM;
+ if (len >= 4) {
+ int first = PythonUtils.ARRAY_ACCESSOR.getInt(bytes, 0);
+ if (first == 0xFFFE0000) {
+ charset = CharsetMapping.UTF_32BE_BOM;
+ offset = 4;
+ } else if (first == 0x0000FEFF) {
+ offset = 4;
+ }
+ }
+ }
+ }
+ TruffleString.Encoding tStringEncoding = charset.tStringEncoding();
+ if (tStringEncoding != null && (len & (charset.stride() - 1)) == 0) {
+ TruffleString direct = fromByteArrayNode.execute(bytes, offset, len - offset, tStringEncoding, true);
+ if (fastPathProfile.profile(inliningTarget, isValidNode.execute(direct, tStringEncoding))) {
+ return PFactory.createTuple(language, new Object[]{switchEncodingNode.execute(direct, TS_ENCODING), len});
+ }
+ }
+ CodingErrorAction errorAction = convertCodingErrorAction(errors, equalNode);
ErrorHandlers.ErrorHandlerCache handlerCache = new ErrorHandlers.ErrorHandlerCache();
TruffleDecoder decoder;
try {
- decoder = new TruffleDecoder(normalizedEncoding, charset, bytes, len, errorAction);
+ decoder = new TruffleDecoder(charset.charset(), bytes, len, errorAction);
while (!decoder.decodingStep(finalData)) {
int pos = decoder.getInputPosition();
ErrorHandlers.DecodingErrorHandlerResult result = callDecodingErrorHandlerNode.execute(frame, inliningTarget, handlerCache, errors, encoding, input,
@@ -1367,15 +1507,13 @@ public void replace(String replacement, Charset charset) {
}
static class TruffleDecoder {
- private final TruffleString encodingName;
private final CharsetDecoder decoder;
private ByteBuffer inputBuffer;
private CharBuffer outputBuffer;
private CoderResult coderResult;
@TruffleBoundary
- public TruffleDecoder(TruffleString encodingName, Charset charset, byte[] input, int inputLen, CodingErrorAction errorAction) {
- this.encodingName = encodingName;
+ public TruffleDecoder(Charset charset, byte[] input, int inputLen, CodingErrorAction errorAction) {
this.inputBuffer = ByteBuffer.wrap(input, 0, inputLen);
this.decoder = charset.newDecoder().onMalformedInput(errorAction).onUnmappableCharacter(errorAction);
this.outputBuffer = CharBuffer.allocate((int) (inputLen * decoder.averageCharsPerByte()));
@@ -1473,8 +1611,5 @@ public void replace(int skipInput, char[] chars, int offset, int length) {
inputBuffer.position(inputBuffer.position() + skipInput);
}
- public TruffleString getEncodingName() {
- return encodingName;
- }
}
}
diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/MarshalModuleBuiltins.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/MarshalModuleBuiltins.java
index 4bf3baf235..76cb8a1a3a 100644
--- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/MarshalModuleBuiltins.java
+++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/MarshalModuleBuiltins.java
@@ -1236,12 +1236,12 @@ private void writeShortString(String v) throws IOException {
private TruffleString readShortString() {
int sz = readByteSize();
byte[] bytes = readNBytes(sz);
- return TruffleString.fromByteArrayUncached(bytes, 0, sz, Encoding.ISO_8859_1, false).switchEncodingUncached(TS_ENCODING);
+ return TruffleString.fromByteArrayWithCompactionUTF32Uncached(bytes, 0, sz, TruffleString.CompactionLevel.S1, false);
}
private Object readAscii(int sz, boolean intern) {
byte[] bytes = readNBytes(sz);
- TruffleString value = TruffleString.fromByteArrayUncached(bytes, 0, sz, Encoding.US_ASCII, false).switchEncodingUncached(TS_ENCODING);
+ TruffleString value = TruffleString.fromByteArrayWithCompactionUTF32Uncached(bytes, 0, sz, TruffleString.CompactionLevel.S1, false);
if (intern) {
return PythonUtils.internString(value);
} else {
diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/cext/PythonCextUnicodeBuiltins.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/cext/PythonCextUnicodeBuiltins.java
index ea8e21333d..4d3e669ea0 100644
--- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/cext/PythonCextUnicodeBuiltins.java
+++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/cext/PythonCextUnicodeBuiltins.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2025, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2021, 2026, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* The Universal Permissive License (UPL), Version 1.0
@@ -80,8 +80,6 @@
import static com.oracle.graal.python.nodes.StringLiterals.T_UTF8;
import static com.oracle.graal.python.nodes.util.CastToJavaIntLossyNode.castLong;
import static com.oracle.graal.python.util.PythonUtils.TS_ENCODING;
-import static com.oracle.truffle.api.strings.TruffleString.Encoding.ISO_8859_1;
-import static com.oracle.truffle.api.strings.TruffleString.Encoding.UTF_16;
import static com.oracle.truffle.api.strings.TruffleString.Encoding.UTF_16LE;
import static com.oracle.truffle.api.strings.TruffleString.Encoding.UTF_32LE;
import static com.oracle.truffle.api.strings.TruffleString.Encoding.UTF_8;
@@ -175,6 +173,7 @@
import com.oracle.truffle.api.strings.TruffleString;
import com.oracle.truffle.api.strings.TruffleString.Encoding;
import com.oracle.truffle.api.strings.TruffleString.FromNativePointerNode;
+import com.oracle.truffle.api.strings.TruffleString.FromNativePointerWithCompactionUTF32Node;
import com.oracle.truffle.api.strings.TruffleString.SwitchEncodingNode;
import com.oracle.truffle.api.strings.TruffleStringBuilder;
import com.oracle.truffle.api.strings.TruffleStringBuilderUTF32;
@@ -810,11 +809,11 @@ static Object doGeneric(Object ptr, long elements, int charSize, int isAscii,
@CApiBuiltin(ret = PyObjectTransfer, args = {Pointer, Py_ssize_t, Int}, call = Ignored)
abstract static class GraalPyPrivate_Unicode_FromUCS extends CApiTernaryBuiltinNode {
- private static Encoding encodingFromKind(Node inliningTarget, int kind, PRaiseNode raiseNode) throws PException {
+ private static TruffleString.CompactionLevel compactionLevelFromKind(Node inliningTarget, int kind, PRaiseNode raiseNode) throws PException {
return switch (kind) {
- case 1 -> ISO_8859_1;
- case 2 -> UTF_16;
- case 4 -> TS_ENCODING;
+ case 1 -> TruffleString.CompactionLevel.S1;
+ case 2 -> TruffleString.CompactionLevel.S2;
+ case 4 -> TruffleString.CompactionLevel.S4;
default -> throw raiseNode.raiseBadInternalCall(inliningTarget);
};
}
@@ -822,19 +821,13 @@ private static Encoding encodingFromKind(Node inliningTarget, int kind, PRaiseNo
@Specialization
static Object doNative(Object ptr, long byteLength, int kind,
@Bind Node inliningTarget,
- @Cached FromNativePointerNode fromNativePointerNode,
- @Cached SwitchEncodingNode switchEncodingNode,
+ @Cached FromNativePointerWithCompactionUTF32Node fromNativePointerNode,
@Cached PRaiseNode raiseNode) {
try {
int iByteLength = PInt.intValueExact(byteLength);
- Encoding srcEncoding = encodingFromKind(inliningTarget, kind, raiseNode);
- /*
- * TODO(fa): TruffleString does currently not support creating strings from UCS1 and
- * UCS2 bytes (GR-44312). Remind: UCS1 and UCS2 are actually compacted UTF-32 bytes.
- * For now, we use ISO-8859-1 and UTF-16 but that's not entirely correct.
- */
- TruffleString ts = fromNativePointerNode.execute(ptr, 0, iByteLength, srcEncoding, true);
- return PFactory.createString(PythonLanguage.get(inliningTarget), switchEncodingNode.execute(ts, TS_ENCODING));
+ TruffleString.CompactionLevel compactionLevel = compactionLevelFromKind(inliningTarget, kind, raiseNode);
+ TruffleString ts = fromNativePointerNode.execute(ptr, 0, iByteLength, compactionLevel, true);
+ return PFactory.createString(PythonLanguage.get(inliningTarget), ts);
} catch (OverflowException e) {
throw raiseNode.raise(inliningTarget, MemoryError);
}
diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/cjkcodecs/MultibytecodecModuleBuiltins.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/cjkcodecs/MultibytecodecModuleBuiltins.java
index c7fbfab82e..fc9bc375cd 100644
--- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/cjkcodecs/MultibytecodecModuleBuiltins.java
+++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/cjkcodecs/MultibytecodecModuleBuiltins.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023, 2025, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2023, 2026, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* The Universal Permissive License (UPL), Version 1.0
@@ -47,7 +47,6 @@
import static com.oracle.graal.python.runtime.exception.PythonErrorType.ValueError;
import static com.oracle.graal.python.util.PythonUtils.toTruffleStringUncached;
-import java.nio.charset.Charset;
import java.util.List;
import com.oracle.graal.python.PythonLanguage;
@@ -102,13 +101,13 @@ protected static void registerCodec(String name, int cidx, CodecType ct, int mid
PythonModule codec, PythonLanguage language) {
TruffleString tsName = toTruffleStringUncached(name);
TruffleString normalizedEncoding = CharsetMapping.normalizeUncached(tsName);
- Charset charset = CharsetMapping.getCharsetNormalized(normalizedEncoding);
+ CharsetMapping.CharsetWrapper charset = CharsetMapping.getCharsetNormalized(normalizedEncoding);
if (charset != null) {
if (cidx != -1) {
- codecs[cidx] = new MultibyteCodec(tsName, charset, ct);
+ codecs[cidx] = new MultibyteCodec(tsName, charset.charset(), ct);
}
if (midx != -1) {
- DBCSMap h = maps[midx] = new DBCSMap(name, tsName, charset, mt);
+ DBCSMap h = maps[midx] = new DBCSMap(name, tsName, charset.charset(), mt);
codec.setAttribute(toTruffleStringUncached(h.charsetMapName),
PFactory.createCapsuleJavaName(language, h, PyMultibyteCodec_CAPSULE_NAME));
}
diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/codecs/CharmapNodes.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/codecs/CharmapNodes.java
index bd6962ace1..584f6c5fb3 100644
--- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/codecs/CharmapNodes.java
+++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/codecs/CharmapNodes.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023, 2025, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2023, 2026, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* The Universal Permissive License (UPL), Version 1.0
@@ -369,15 +369,14 @@ static TruffleString decodeLatin1(VirtualFrame frame, Object data, @SuppressWarn
@Shared @Cached("createFor($node)") InteropCallData callData,
@CachedLibrary("data") PythonBufferAcquireLibrary bufferAcquireLib,
@CachedLibrary(limit = "3") @Shared PythonBufferAccessLibrary bufferLib,
- @Cached TruffleString.FromByteArrayNode fromByteArrayNode,
+ @Cached TruffleString.FromByteArrayWithCompactionUTF32Node fromByteArrayNode,
@Cached TruffleString.SwitchEncodingNode switchEncodingNode) {
// equivalent of PyUnicode_DecodeLatin1
Object dataBuffer = bufferAcquireLib.acquireReadonly(data, frame, context, context.getLanguage(inliningTarget), callData);
try {
int len = bufferLib.getBufferLength(dataBuffer);
byte[] src = bufferLib.getInternalOrCopiedByteArray(dataBuffer);
- TruffleString latin1 = fromByteArrayNode.execute(src, 0, len, TruffleString.Encoding.ISO_8859_1, true);
- return switchEncodingNode.execute(latin1, TS_ENCODING);
+ return fromByteArrayNode.execute(src, 0, len, TruffleString.CompactionLevel.S1, true);
} finally {
bufferLib.release(dataBuffer, frame, callData);
}
diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/codecs/ErrorHandlers.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/codecs/ErrorHandlers.java
index 62c6d0a47a..c7103d9d74 100644
--- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/codecs/ErrorHandlers.java
+++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/codecs/ErrorHandlers.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023, 2025, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2023, 2026, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* The Universal Permissive License (UPL), Version 1.0
@@ -364,8 +364,7 @@ static Object doEncode(PBaseException exception,
@Cached PyUnicodeEncodeOrTranslateErrorGetStartNode getStartNode,
@Cached PyUnicodeEncodeOrTranslateErrorGetEndNode getEndNode,
@Cached TruffleString.CodePointAtIndexUTF32Node codePointAtIndexNode,
- @Cached TruffleString.FromByteArrayNode fromByteArrayNode,
- @Cached TruffleString.SwitchEncodingNode switchEncodingNode) {
+ @Cached TruffleString.FromByteArrayWithCompactionUTF32Node fromByteArrayNode) {
TruffleString src = getObjectNode.execute(inliningTarget, exception);
int start = getStartNode.execute(inliningTarget, exception);
int end = getEndNode.execute(inliningTarget, exception);
@@ -378,8 +377,8 @@ static Object doEncode(PBaseException exception,
for (int i = start; i < end; ++i) {
pos = appendXmlCharRefReplacement(replacement, pos, codePointAtIndexNode.execute(src, i));
}
- TruffleString resultAscii = fromByteArrayNode.execute(replacement, Encoding.US_ASCII, false);
- return PFactory.createTuple(language, new Object[]{switchEncodingNode.execute(resultAscii, TS_ENCODING), end});
+ TruffleString resultAscii = fromByteArrayNode.execute(replacement, 0, replacement.length, TruffleString.CompactionLevel.S1, false);
+ return PFactory.createTuple(language, new Object[]{resultAscii, end});
}
@Specialization(guards = "!isEncode(inliningTarget, o, pyObjectTypeCheck)", limit = "1")
@@ -405,8 +404,7 @@ static Object doDecodeException(VirtualFrame frame, PBaseException exception,
@Cached PyUnicodeDecodeErrorGetEndNode getEndNode,
@CachedLibrary(limit = "3") PythonBufferAcquireLibrary acquireLib,
@CachedLibrary(limit = "3") PythonBufferAccessLibrary accessLib,
- @Cached @Shared TruffleString.FromByteArrayNode fromByteArrayNode,
- @Cached @Shared TruffleString.SwitchEncodingNode switchEncodingNode) {
+ @Cached @Shared TruffleString.FromByteArrayWithCompactionUTF32Node fromByteArrayNode) {
int start = getStartNode.execute(inliningTarget, exception);
int end = getEndNode.execute(inliningTarget, exception);
Object object = getObjectNode.execute(inliningTarget, exception);
@@ -424,8 +422,8 @@ static Object doDecodeException(VirtualFrame frame, PBaseException exception,
} finally {
accessLib.release(srcBuf, frame, callData);
}
- TruffleString resultAscii = fromByteArrayNode.execute(replacement, Encoding.US_ASCII, false);
- return PFactory.createTuple(language, new Object[]{switchEncodingNode.execute(resultAscii, TS_ENCODING), end});
+ TruffleString resultAscii = fromByteArrayNode.execute(replacement, 0, replacement.length, TruffleString.CompactionLevel.S1, false);
+ return PFactory.createTuple(language, new Object[]{resultAscii, end});
}
@Specialization(guards = "isEncodeOrTranslate(inliningTarget, exception, pyObjectTypeCheck)", limit = "1")
@@ -437,8 +435,7 @@ static Object doEncodeOrTranslateException(PBaseException exception,
@Cached PyUnicodeEncodeOrTranslateErrorGetStartNode getStartNode,
@Cached PyUnicodeEncodeOrTranslateErrorGetEndNode getEndNode,
@Cached TruffleString.CodePointAtIndexUTF32Node codePointAtIndexNode,
- @Cached @Shared TruffleString.FromByteArrayNode fromByteArrayNode,
- @Cached @Shared TruffleString.SwitchEncodingNode switchEncodingNode) {
+ @Cached @Shared TruffleString.FromByteArrayWithCompactionUTF32Node fromByteArrayNode) {
int start = getStartNode.execute(inliningTarget, exception);
int end = getEndNode.execute(inliningTarget, exception);
TruffleString src = getObjectNode.execute(inliningTarget, exception);
@@ -462,8 +459,8 @@ static Object doEncodeOrTranslateException(PBaseException exception,
int cp = codePointAtIndexNode.execute(src, i);
pos = BytesUtils.unicodeNonAsciiEscape(cp, pos, replacement, true);
}
- TruffleString resultAscii = fromByteArrayNode.execute(replacement, Encoding.US_ASCII, false);
- return PFactory.createTuple(language, new Object[]{switchEncodingNode.execute(resultAscii, TS_ENCODING), end});
+ TruffleString resultAscii = fromByteArrayNode.execute(replacement, 0, replacement.length, TruffleString.CompactionLevel.S1, false);
+ return PFactory.createTuple(language, new Object[]{resultAscii, end});
}
@Specialization(guards = "isNeither(inliningTarget, o, pyObjectTypeCheck)", limit = "1")
@@ -487,8 +484,7 @@ static Object doEncode(PBaseException exception,
@Cached PyUnicodeEncodeOrTranslateErrorGetStartNode getStartNode,
@Cached PyUnicodeEncodeOrTranslateErrorGetEndNode getEndNode,
@Cached TruffleString.CodePointAtIndexUTF32Node codePointAtIndexNode,
- @Cached TruffleString.FromByteArrayNode fromByteArrayNode,
- @Cached TruffleString.SwitchEncodingNode switchEncodingNode,
+ @Cached TruffleString.FromByteArrayWithCompactionUTF32Node fromByteArrayNode,
@Cached TruffleString.FromJavaStringNode fromJavaStringNode,
@Cached TruffleStringBuilder.AppendStringNode appendStringNode,
@Cached TruffleStringBuilder.AppendCodePointNode appendCodePointNode,
@@ -512,7 +508,7 @@ static Object doEncode(PBaseException exception,
appendCodePointNode.execute(tsb, '}');
} else {
int len = BytesUtils.unicodeNonAsciiEscape(cp, 0, buf, true);
- appendStringNode.execute(tsb, switchEncodingNode.execute(fromByteArrayNode.execute(buf, 0, len, Encoding.US_ASCII, true), TS_ENCODING));
+ appendStringNode.execute(tsb, fromByteArrayNode.execute(buf, 0, len, TruffleString.CompactionLevel.S1, true));
}
}
return PFactory.createTuple(language, new Object[]{toStringNode.execute(tsb), end});
diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/pickle/PPickler.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/pickle/PPickler.java
index 21d27a0e36..c02ee31098 100644
--- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/pickle/PPickler.java
+++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/pickle/PPickler.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2024, 2026, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* The Universal Permissive License (UPL), Version 1.0
@@ -1327,7 +1327,7 @@ private void saveBytes(VirtualFrame frame, PythonContext ctx, PPickler pickler,
reduceValue = createTuple(ctx.getCore().lookupType(PythonBuiltinClassType.PBytes), createTuple());
} else {
PickleState st = getGlobalState(ctx.getCore());
- final TruffleString unicodeStr = PickleUtils.decodeLatin1Strict(getBufferLibrary().getCopiedByteArray(buffer), ensureTsFromByteArray(), ensureTsSwitchEncodingNode());
+ final TruffleString unicodeStr = PickleUtils.decodeLatin1Strict(getBufferLibrary().getCopiedByteArray(buffer), ensureTsFromByteArrayWithCompaction());
reduceValue = createTuple(st.codecsEncode, createTuple(unicodeStr, LATIN1));
}
// save_reduce() will memoize the object automatically.
diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/pickle/PUnpickler.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/pickle/PUnpickler.java
index 635f4e2020..4c00d8ac08 100644
--- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/pickle/PUnpickler.java
+++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/pickle/PUnpickler.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2024, 2026, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* The Universal Permissive License (UPL), Version 1.0
@@ -871,7 +871,7 @@ private void loadInt(VirtualFrame frame, PUnpickler self) {
}
try {
- long x = PickleUtils.asciiBytesToLong(s, ensureTsParseLongNode(), ensureTsFromByteArray());
+ long x = PickleUtils.asciiBytesToLong(s, ensureTsParseLongNode(), ensureTsFromByteArrayWithCompaction());
if (s.length == 3 && (x == 0 || x == 1)) {
value = x != 0;
} else if (x == (int) x) {
@@ -900,7 +900,7 @@ private void loadLong(VirtualFrame frame, PUnpickler self) {
s[s.length - 2] = 0;
}
try {
- value = PickleUtils.asciiBytesToLong(s, ensureTsParseLongNode(), ensureTsFromByteArray());
+ value = PickleUtils.asciiBytesToLong(s, ensureTsParseLongNode(), ensureTsFromByteArrayWithCompaction());
} catch (TruffleString.NumberFormatException nfe) {
value = parseInt(s);
}
@@ -1505,7 +1505,7 @@ private void loadGet(VirtualFrame frame, PUnpickler self) {
}
int idx;
try {
- idx = PickleUtils.asciiBytesToInt(s, ensureTsParseIntNode(), ensureTsFromByteArray());
+ idx = PickleUtils.asciiBytesToInt(s, ensureTsParseIntNode(), ensureTsFromByteArrayWithCompaction());
} catch (TruffleString.NumberFormatException nfe) {
// TODO handle exception [GR-38101]
throw CompilerDirectives.shouldNotReachHere();
@@ -1572,7 +1572,7 @@ private void loadPut(VirtualFrame frame, PUnpickler self) {
Object value = self.stack.data[self.stack.size - 1];
int idx;
try {
- idx = PickleUtils.asciiBytesToInt(s, ensureTsParseIntNode(), ensureTsFromByteArray());
+ idx = PickleUtils.asciiBytesToInt(s, ensureTsParseIntNode(), ensureTsFromByteArrayWithCompaction());
} catch (TruffleString.NumberFormatException nfe) {
// TODO handle exception [GR-38101]
throw CompilerDirectives.shouldNotReachHere();
diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/pickle/PickleUtils.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/pickle/PickleUtils.java
index 683df9bc07..276a5c3d0c 100644
--- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/pickle/PickleUtils.java
+++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/pickle/PickleUtils.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2024, 2026, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* The Universal Permissive License (UPL), Version 1.0
@@ -257,21 +257,21 @@ public static int getStringSize(byte[] bytes) {
}
public static TruffleString getValidIntString(byte[] bytes) {
- return getValidIntASCIIString(bytes, TruffleString.FromByteArrayNode.getUncached());
+ return getValidIntASCIIString(bytes, TruffleString.FromByteArrayWithCompactionUTF32Node.getUncached());
}
- public static int asciiBytesToInt(byte[] bytes, TruffleString.ParseIntNode parseIntNode, TruffleString.FromByteArrayNode fromByteArrayNode)
+ public static int asciiBytesToInt(byte[] bytes, TruffleString.ParseIntNode parseIntNode, TruffleString.FromByteArrayWithCompactionUTF32Node fromByteArrayNode)
throws TruffleString.NumberFormatException {
return parseIntNode.execute(getValidIntASCIIString(bytes, fromByteArrayNode), 10);
}
- public static long asciiBytesToLong(byte[] bytes, TruffleString.ParseLongNode parseLongNode, TruffleString.FromByteArrayNode fromByteArrayNode)
+ public static long asciiBytesToLong(byte[] bytes, TruffleString.ParseLongNode parseLongNode, TruffleString.FromByteArrayWithCompactionUTF32Node fromByteArrayNode)
throws TruffleString.NumberFormatException {
return parseLongNode.execute(getValidIntASCIIString(bytes, fromByteArrayNode), 10);
}
- private static TruffleString getValidIntASCIIString(byte[] bytes, TruffleString.FromByteArrayNode fromByteArray) {
- return fromByteArray.execute(bytes, 0, getStringSize(bytes), TruffleString.Encoding.US_ASCII, true);
+ private static TruffleString getValidIntASCIIString(byte[] bytes, TruffleString.FromByteArrayWithCompactionUTF32Node fromByteArray) {
+ return fromByteArray.execute(bytes, 0, getStringSize(bytes), TruffleString.CompactionLevel.S1, true);
}
@TruffleBoundary
@@ -316,8 +316,8 @@ public static TruffleString decodeUTF8Strict(byte[] data, int len, TruffleString
return decodeStrict(data, len, fromByteArrayNode, TruffleString.Encoding.UTF_8, switchEncodingNode);
}
- public static TruffleString decodeLatin1Strict(byte[] data, TruffleString.FromByteArrayNode fromByteArrayNode, TruffleString.SwitchEncodingNode switchEncodingNode) {
- return decodeStrict(data, data.length, fromByteArrayNode, TruffleString.Encoding.ISO_8859_1, switchEncodingNode);
+ public static TruffleString decodeLatin1Strict(byte[] data, TruffleString.FromByteArrayWithCompactionUTF32Node fromByteArrayNode) {
+ return fromByteArrayNode.execute(data, 0, data.length, TruffleString.CompactionLevel.S1, true);
}
private static TruffleString decodeStrict(byte[] data, int len, TruffleString.FromByteArrayNode fromByteArrayNode, TruffleString.Encoding encoding,
diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/pickle/PicklerNodes.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/pickle/PicklerNodes.java
index e838b83e36..6e38abab05 100644
--- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/pickle/PicklerNodes.java
+++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/pickle/PicklerNodes.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2024, 2026, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* The Universal Permissive License (UPL), Version 1.0
@@ -152,6 +152,7 @@ abstract static class BasePickleNode extends Node {
@Child private BytesNodes.ToBytesNode toBytesNode;
@Child private PyObjectReprAsTruffleStringNode reprNode;
@Child private TruffleString.FromByteArrayNode tsFromByteArrayNode;
+ @Child private TruffleString.FromByteArrayWithCompactionUTF32Node tsFromByteArrayWithCompactionNode;
@Child private TruffleString.CodePointLengthNode tsCodePointLengthNode;
@Child private TruffleString.CodePointAtIndexUTF32Node tsCodePointAtIndexUTF32Node;
@Child private TruffleString.FromLongNode tsFromLongNode;
@@ -190,6 +191,14 @@ protected TruffleString.FromByteArrayNode ensureTsFromByteArray() {
return tsFromByteArrayNode;
}
+ protected TruffleString.FromByteArrayWithCompactionUTF32Node ensureTsFromByteArrayWithCompaction() {
+ if (tsFromByteArrayWithCompactionNode == null) {
+ CompilerDirectives.transferToInterpreterAndInvalidate();
+ tsFromByteArrayWithCompactionNode = insert(TruffleString.FromByteArrayWithCompactionUTF32Node.create());
+ }
+ return tsFromByteArrayWithCompactionNode;
+ }
+
protected TruffleString.CodePointLengthNode ensureTsCodePointLengthNode() {
if (tsCodePointLengthNode == null) {
CompilerDirectives.transferToInterpreterAndInvalidate();
diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/bytes/BytesNodes.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/bytes/BytesNodes.java
index 174262daa0..8daf58a4f1 100644
--- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/bytes/BytesNodes.java
+++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/bytes/BytesNodes.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018, 2025, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2018, 2026, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* The Universal Permissive License (UPL), Version 1.0
@@ -40,7 +40,6 @@
*/
package com.oracle.graal.python.builtins.objects.bytes;
-import static com.oracle.graal.python.builtins.objects.bytes.BytesUtils.createASCIIString;
import static com.oracle.graal.python.builtins.objects.bytes.BytesUtils.isSpace;
import static com.oracle.graal.python.builtins.objects.cext.structs.CFields.PyBytesObject__ob_sval;
import static com.oracle.graal.python.builtins.objects.cext.structs.CFields.PyVarObject__ob_size;
@@ -707,8 +706,7 @@ public abstract static class ByteToHexNode extends PNodeWithContext {
@Specialization(guards = "bytesPerSepGroup == 0")
static TruffleString zero(byte[] argbuf, int arglen, @SuppressWarnings("unused") byte sep, @SuppressWarnings("unused") int bytesPerSepGroup,
- @Shared @Cached TruffleString.FromByteArrayNode fromByteArrayNode,
- @Shared @Cached TruffleString.SwitchEncodingNode switchEncodingNode) {
+ @Shared @Cached TruffleString.FromByteArrayWithCompactionUTF32Node fromByteArrayNode) {
int resultlen = arglen * 2;
byte[] retbuf = new byte[resultlen];
@@ -719,14 +717,13 @@ static TruffleString zero(byte[] argbuf, int arglen, @SuppressWarnings("unused")
retbuf[j++] = BytesUtils.HEXDIGITS[c >>> 4];
retbuf[j++] = BytesUtils.HEXDIGITS[c & 0x0f];
}
- return createASCIIString(retbuf, fromByteArrayNode, switchEncodingNode);
+ return fromByteArrayNode.execute(retbuf, 0, retbuf.length, TruffleString.CompactionLevel.S1, false);
}
@Specialization(guards = "bytesPerSepGroup < 0")
static TruffleString negative(Node inliningTarget, byte[] argbuf, int arglen, byte sep, int bytesPerSepGroup,
@Shared @Cached InlinedConditionProfile earlyExit,
- @Shared @Cached TruffleString.FromByteArrayNode fromByteArrayNode,
- @Shared @Cached TruffleString.SwitchEncodingNode switchEncodingNode,
+ @Shared @Cached TruffleString.FromByteArrayWithCompactionUTF32Node fromByteArrayNode,
@Shared @Cached PRaiseNode raiseNode) {
if (earlyExit.profile(inliningTarget, arglen == 0)) {
return T_EMPTY_STRING;
@@ -741,7 +738,7 @@ static TruffleString negative(Node inliningTarget, byte[] argbuf, int arglen, by
resultlen += arglen * 2;
if (absBytesPerSepGroup >= arglen) {
- return zero(argbuf, arglen, sep, 0, fromByteArrayNode, switchEncodingNode);
+ return zero(argbuf, arglen, sep, 0, fromByteArrayNode);
}
byte[] retbuf = new byte[resultlen];
@@ -761,14 +758,13 @@ static TruffleString negative(Node inliningTarget, byte[] argbuf, int arglen, by
retbuf[j++] = BytesUtils.HEXDIGITS[c & 0x0f];
}
- return createASCIIString(retbuf, fromByteArrayNode, switchEncodingNode);
+ return fromByteArrayNode.execute(retbuf, 0, retbuf.length, TruffleString.CompactionLevel.S1, false);
}
@Specialization(guards = "absBytesPerSepGroup > 0")
static TruffleString positive(Node inliningTarget, byte[] argbuf, int arglen, byte sep, int absBytesPerSepGroup,
@Shared @Cached InlinedConditionProfile earlyExit,
- @Shared @Cached TruffleString.FromByteArrayNode fromByteArrayNode,
- @Shared @Cached TruffleString.SwitchEncodingNode switchEncodingNode,
+ @Shared @Cached TruffleString.FromByteArrayWithCompactionUTF32Node fromByteArrayNode,
@Shared @Cached PRaiseNode raiseNode) {
if (earlyExit.profile(inliningTarget, arglen == 0)) {
return T_EMPTY_STRING;
@@ -783,7 +779,7 @@ static TruffleString positive(Node inliningTarget, byte[] argbuf, int arglen, by
resultlen += arglen * 2;
if (absBytesPerSepGroup >= arglen) {
- return zero(argbuf, arglen, sep, 0, fromByteArrayNode, switchEncodingNode);
+ return zero(argbuf, arglen, sep, 0, fromByteArrayNode);
}
byte[] retbuf = new byte[resultlen];
@@ -803,7 +799,7 @@ static TruffleString positive(Node inliningTarget, byte[] argbuf, int arglen, by
retbuf[j--] = BytesUtils.HEXDIGITS[c & 0x0f];
retbuf[j--] = BytesUtils.HEXDIGITS[c >>> 4];
}
- return createASCIIString(retbuf, fromByteArrayNode, switchEncodingNode);
+ return fromByteArrayNode.execute(retbuf, 0, retbuf.length, TruffleString.CompactionLevel.S1, false);
}
}
diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/bytes/BytesUtils.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/bytes/BytesUtils.java
index 8b6c1f2bb2..9906e17073 100644
--- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/bytes/BytesUtils.java
+++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/bytes/BytesUtils.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017, 2025, Oracle and/or its affiliates.
+ * Copyright (c) 2017, 2026, Oracle and/or its affiliates.
* Copyright (c) 2014, Regents of the University of California
*
* All rights reserved.
@@ -25,14 +25,11 @@
*/
package com.oracle.graal.python.builtins.objects.bytes;
-import static com.oracle.graal.python.util.PythonUtils.TS_ENCODING;
-
import java.io.ByteArrayOutputStream;
import com.oracle.truffle.api.CompilerAsserts;
import com.oracle.truffle.api.CompilerDirectives.CompilationFinal;
import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary;
-import com.oracle.truffle.api.strings.TruffleString;
import com.oracle.truffle.api.strings.TruffleStringBuilder;
import com.oracle.truffle.api.strings.TruffleStringBuilderUTF32;
@@ -473,11 +470,6 @@ public static int digitValue(byte hexChar) {
return 37;
}
- @TruffleBoundary
- public static TruffleString createASCIIString(byte[] retbuf, TruffleString.FromByteArrayNode fromByteArrayNode, TruffleString.SwitchEncodingNode switchEncodingNode) {
- return switchEncodingNode.execute(fromByteArrayNode.execute(retbuf, TruffleString.Encoding.US_ASCII), TS_ENCODING);
- }
-
@TruffleBoundary
public static ByteArrayOutputStream createOutputStream() {
return new ByteArrayOutputStream();
diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/str/NativeStringData.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/str/NativeStringData.java
index abfd18d625..c1ca2d8d36 100644
--- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/str/NativeStringData.java
+++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/str/NativeStringData.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2025, 2026, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* The Universal Permissive License (UPL), Version 1.0
@@ -80,15 +80,14 @@ public int length() {
return storage.length();
}
- public TruffleString toTruffleString(TruffleString.FromNativePointerNode fromNativePointerNode) {
- TruffleString.Encoding encoding = switch (kind) {
- case KIND_ASCII -> TruffleString.Encoding.US_ASCII;
- case KIND_1BYTE -> TruffleString.Encoding.ISO_8859_1;
- case KIND_2BYTE -> TruffleString.Encoding.UTF_16;
- case KIND_4BYTE -> TruffleString.Encoding.UTF_32;
+ public TruffleString toTruffleString(TruffleString.FromNativePointerWithCompactionUTF32Node fromNativePointerNode) {
+ TruffleString.CompactionLevel compactionLevel = switch (kind) {
+ case KIND_ASCII, KIND_1BYTE -> TruffleString.CompactionLevel.S1;
+ case KIND_2BYTE -> TruffleString.CompactionLevel.S2;
+ case KIND_4BYTE -> TruffleString.CompactionLevel.S4;
default -> throw CompilerDirectives.shouldNotReachHere();
};
// NativeByteSequenceStorage implements asPointer
- return fromNativePointerNode.execute(storage, 0, storage.length(), encoding, false);
+ return fromNativePointerNode.execute(storage, 0, storage.length(), compactionLevel, false);
}
}
diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/str/StringBuiltins.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/str/StringBuiltins.java
index 6b7afc2514..f082764b07 100644
--- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/str/StringBuiltins.java
+++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/str/StringBuiltins.java
@@ -892,7 +892,7 @@ static TruffleString lowerAscii(TruffleString self,
@Cached TruffleString.SwitchEncodingNode switchEncodingNode,
@Cached TruffleString.ByteIndexOfCodePointSetNode indexOfCodePointSetNode,
@Cached TruffleString.CopyToByteArrayNode copyToByteArrayNode,
- @Cached TruffleString.FromByteArrayNode fromByteArrayNode) {
+ @Cached TruffleString.FromByteArrayWithCompactionUTF32Node fromByteArrayNode) {
TruffleString ascii = switchEncodingNode.execute(self, Encoding.US_ASCII);
int i = indexOfCodePointSetNode.execute(ascii, 0, ascii.byteLength(Encoding.US_ASCII), ASCII_UPPER);
if (i < 0) {
@@ -905,7 +905,7 @@ static TruffleString lowerAscii(TruffleString self,
buf[i] = (byte) (buf[i] - 'A' + 'a');
}
}
- return switchEncodingNode.execute(fromByteArrayNode.execute(buf, Encoding.US_ASCII, false), TS_ENCODING);
+ return fromByteArrayNode.execute(buf, 0, buf.length, TruffleString.CompactionLevel.S1, false);
}
@Specialization(guards = "!isAscii(self, getCodeRangeNode)")
@@ -939,7 +939,7 @@ static TruffleString upperAscii(TruffleString self,
@Cached TruffleString.SwitchEncodingNode switchEncodingNode,
@Cached TruffleString.ByteIndexOfCodePointSetNode indexOfCodePointSetNode,
@Cached TruffleString.CopyToByteArrayNode copyToByteArrayNode,
- @Cached TruffleString.FromByteArrayNode fromByteArrayNode) {
+ @Cached TruffleString.FromByteArrayWithCompactionUTF32Node fromByteArrayNode) {
TruffleString ascii = switchEncodingNode.execute(self, Encoding.US_ASCII);
int i = indexOfCodePointSetNode.execute(ascii, 0, ascii.byteLength(Encoding.US_ASCII), ASCII_LOWER);
if (i < 0) {
@@ -952,7 +952,7 @@ static TruffleString upperAscii(TruffleString self,
buf[i] = (byte) (buf[i] - 'a' + 'A');
}
}
- return switchEncodingNode.execute(fromByteArrayNode.execute(buf, Encoding.US_ASCII, false), TS_ENCODING);
+ return fromByteArrayNode.execute(buf, 0, buf.length, TruffleString.CompactionLevel.S1, false);
}
@Specialization(guards = "!isAscii(self, getCodeRangeNode)")
diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/str/StringNodes.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/str/StringNodes.java
index 1f7217c5bd..2d40036a59 100644
--- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/str/StringNodes.java
+++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/str/StringNodes.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019, 2025, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2019, 2026, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* The Universal Permissive License (UPL), Version 1.0
@@ -125,7 +125,7 @@ static TruffleString doMaterialized(PString x) {
@InliningCutoff
static TruffleString doNative(Node inliningTarget, PString x,
@Cached HiddenAttr.ReadNode readAttrNode,
- @Cached TruffleString.FromNativePointerNode fromNativePointerNode) {
+ @Cached TruffleString.FromNativePointerWithCompactionUTF32Node fromNativePointerNode) {
NativeStringData nativeData = x.getNativeStringData(inliningTarget, readAttrNode);
TruffleString materialized = nativeData.toTruffleString(fromNativePointerNode);
x.setMaterialized(materialized);
diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/struct/StructBuiltins.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/struct/StructBuiltins.java
index 48faca88dc..cef73d8aa4 100644
--- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/struct/StructBuiltins.java
+++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/struct/StructBuiltins.java
@@ -1,4 +1,4 @@
-/* Copyright (c) 2020, 2025, Oracle and/or its affiliates.
+/* Copyright (c) 2020, 2026, Oracle and/or its affiliates.
* Copyright (C) 1996-2020 Python Software Foundation
*
* Licensed under the PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2
@@ -63,7 +63,6 @@
import static com.oracle.graal.python.nodes.ErrorMessages.UNPACK_REQ_A_BUFFER_OF_N_BYTES;
import static com.oracle.graal.python.runtime.exception.PythonErrorType.StructError;
import static com.oracle.graal.python.runtime.exception.PythonErrorType.TypeError;
-import static com.oracle.graal.python.util.PythonUtils.TS_ENCODING;
import java.nio.ByteOrder;
import java.util.HashSet;
@@ -678,9 +677,9 @@ protected Object get(PStruct self) {
public abstract static class GetStructFormat extends PythonBuiltinNode {
@Specialization
protected Object get(PStruct self,
- @Cached TruffleString.FromByteArrayNode fromBytes,
- @Cached TruffleString.SwitchEncodingNode switchEncoding) {
- return switchEncoding.execute(fromBytes.execute(self.getFormat(), TruffleString.Encoding.US_ASCII), TS_ENCODING);
+ @Cached TruffleString.FromByteArrayWithCompactionUTF32Node fromBytes) {
+ byte[] format = self.getFormat();
+ return fromBytes.execute(format, 0, format.length, TruffleString.CompactionLevel.S1, false);
}
}
}
diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/lib/PyNumberLongNode.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/lib/PyNumberLongNode.java
index 831553f7ed..fe4e005d79 100644
--- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/lib/PyNumberLongNode.java
+++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/lib/PyNumberLongNode.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2025, 2026, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* The Universal Permissive License (UPL), Version 1.0
@@ -44,7 +44,6 @@
import static com.oracle.graal.python.nodes.SpecialMethodNames.J___INT__;
import static com.oracle.graal.python.nodes.SpecialMethodNames.J___TRUNC__;
import static com.oracle.graal.python.nodes.SpecialMethodNames.T___TRUNC__;
-import static com.oracle.graal.python.util.PythonUtils.TS_ENCODING;
import com.oracle.graal.python.builtins.PythonBuiltinClassType;
import com.oracle.graal.python.builtins.modules.WarningsModuleBuiltins;
@@ -241,8 +240,7 @@ public abstract static class LongFromBufferNode extends Node {
@InliningCutoff
static Object doGeneric(VirtualFrame frame, Object object, int base,
@Bind Node inliningTarget,
- @Cached TruffleString.FromByteArrayNode fromByteArrayNode,
- @Cached TruffleString.SwitchEncodingNode switchEncodingNode,
+ @Cached TruffleString.FromByteArrayWithCompactionUTF32Node fromByteArrayNode,
@Cached PyLongFromUnicodeObject fromString,
@Cached(value = "createFor($node)") InteropCallData callData,
@CachedLibrary(limit = "3") PythonBufferAcquireLibrary acquireLib,
@@ -256,8 +254,7 @@ static Object doGeneric(VirtualFrame frame, Object object, int base,
try {
byte[] bytes = bufferLib.getInternalOrCopiedByteArray(buffer);
int len = bufferLib.getBufferLength(buffer);
- TruffleString string = fromByteArrayNode.execute(bytes, 0, len, TruffleString.Encoding.US_ASCII, false);
- string = switchEncodingNode.execute(string, TS_ENCODING);
+ TruffleString string = fromByteArrayNode.execute(bytes, 0, len, TruffleString.CompactionLevel.S1, false);
return fromString.execute(inliningTarget, string, base, bytes, len);
} finally {
bufferLib.release(buffer);
diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/lib/PyObjectAsciiNode.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/lib/PyObjectAsciiNode.java
index f1d4063e88..ced7a5f676 100644
--- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/lib/PyObjectAsciiNode.java
+++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/lib/PyObjectAsciiNode.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, 2025, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2021, 2026, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* The Universal Permissive License (UPL), Version 1.0
@@ -54,7 +54,6 @@
import com.oracle.truffle.api.frame.VirtualFrame;
import com.oracle.truffle.api.nodes.Node;
import com.oracle.truffle.api.strings.TruffleString;
-import com.oracle.truffle.api.strings.TruffleString.Encoding;
import com.oracle.truffle.api.strings.TruffleStringIterator;
/**
@@ -81,8 +80,7 @@ public static TruffleString ascii(VirtualFrame frame, Node inliningTarget, Objec
@Cached TruffleString.CreateCodePointIteratorNode createCodePointIteratorNode,
@Cached TruffleStringIterator.NextNode nextNode,
@Cached TruffleString.CodePointLengthNode codePointLengthNode,
- @Cached TruffleString.FromByteArrayNode fromByteArrayNode,
- @Cached TruffleString.SwitchEncodingNode switchEncodingNode) {
+ @Cached TruffleString.FromByteArrayWithCompactionUTF32Node fromByteArrayNode) {
// TODO GR-37220: rewrite using TruffleStringBuilder?
TruffleString repr = reprNode.execute(frame, inliningTarget, obj);
if (getCodeRangeNode.execute(repr, TS_ENCODING) == TruffleString.CodeRange.ASCII) {
@@ -95,7 +93,7 @@ public static TruffleString ascii(VirtualFrame frame, Node inliningTarget, Objec
int ch = nextNode.execute(it, TS_ENCODING);
j = unicodeNonAsciiEscape(ch, j, bytes);
}
- return switchEncodingNode.execute(fromByteArrayNode.execute(bytes, 0, j, Encoding.US_ASCII, true), TS_ENCODING);
+ return fromByteArrayNode.execute(bytes, 0, j, TruffleString.CompactionLevel.S1, true);
}
@NeverDefault
diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/nodes/util/CastToTruffleStringNode.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/nodes/util/CastToTruffleStringNode.java
index b0d2b72d56..2d97f52b58 100644
--- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/nodes/util/CastToTruffleStringNode.java
+++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/nodes/util/CastToTruffleStringNode.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022, 2025, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2022, 2026, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* The Universal Permissive License (UPL), Version 1.0
@@ -72,7 +72,6 @@
import com.oracle.truffle.api.library.CachedLibrary;
import com.oracle.truffle.api.nodes.Node;
import com.oracle.truffle.api.strings.TruffleString;
-import com.oracle.truffle.api.strings.TruffleString.Encoding;
/**
* Casts a Python string to a TruffleString without coercion. ATTENTION: If the cast fails,
@@ -144,31 +143,31 @@ static TruffleString read(Object pointer,
@Cached CStructAccess.ReadPointerNode readPointer,
@Cached CStructAccess.ReadByteNode readByte,
@CachedLibrary(limit = "3") InteropLibrary lib,
- @Cached TruffleString.FromNativePointerNode fromNative,
- @Cached TruffleString.FromByteArrayNode fromBytes) {
+ @Cached TruffleString.FromNativePointerWithCompactionUTF32Node fromNative,
+ @Cached TruffleString.FromByteArrayWithCompactionUTF32Node fromBytes) {
int state = readI32.read(pointer, PyASCIIObject__state);
int kind = (state >> CFields.PyASCIIObject__state_kind_shift) & 0x7;
Object data = readPointer.read(pointer, PyUnicodeObject__data);
long length = readI64.read(pointer, PyASCIIObject__length);
- Encoding encoding;
+ TruffleString.CompactionLevel compactionLevel;
if (kind == 1) {
// isBitSet(state, PyASCIIObject__state_ascii_shift))
// ascii doesn't matter, codepoint 0-127 are the same in ascii and latin1
- encoding = Encoding.ISO_8859_1;
+ compactionLevel = TruffleString.CompactionLevel.S1;
} else if (kind == 2) {
- encoding = Encoding.UTF_16LE;
+ compactionLevel = TruffleString.CompactionLevel.S2;
} else {
assert kind == 4;
- encoding = Encoding.UTF_32LE;
+ compactionLevel = TruffleString.CompactionLevel.S4;
}
int bytes = PythonUtils.toIntError(length * kind);
if (lib.isPointer(data) || data instanceof Long) {
- return fromNative.execute(data, 0, bytes, encoding, false);
+ return fromNative.execute(data, 0, bytes, compactionLevel, false);
}
byte[] result = readByte.readByteArray(data, bytes);
- return fromBytes.execute(result, encoding, false);
+ return fromBytes.execute(result, 0, result.length, compactionLevel, false);
}
}
diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/NFIPosixSupport.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/NFIPosixSupport.java
index b1b36a4ec3..536aa98596 100644
--- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/NFIPosixSupport.java
+++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/NFIPosixSupport.java
@@ -118,6 +118,7 @@
import com.oracle.graal.python.util.FunctionWithSignature;
import com.oracle.graal.python.util.OverflowException;
import com.oracle.graal.python.util.PythonUtils;
+import com.oracle.truffle.api.ArrayUtils;
import com.oracle.truffle.api.CompilerAsserts;
import com.oracle.truffle.api.CompilerDirectives;
import com.oracle.truffle.api.CompilerDirectives.CompilationFinal;
@@ -1919,7 +1920,8 @@ public TruffleString crypt(TruffleString word, TruffleString salt,
@Shared("invoke") @Cached InvokeNativeFunction invokeNode,
@Shared("toUtf8") @Cached TruffleString.SwitchEncodingNode switchEncodingToUtf8Node,
@Shared("tsCopyBytes") @Cached TruffleString.CopyToByteArrayNode copyToByteArrayNode,
- @Shared("tsFromBytes") @Cached TruffleString.FromByteArrayNode fromByteArrayNode,
+ @Cached TruffleString.FromZeroTerminatedNativePointerNode fromZeroTerminatedNativePointerNode,
+ @Cached TruffleString.AsManagedNode asManagedNode,
@Shared("fromUtf8") @Cached TruffleString.SwitchEncodingNode switchEncodingFromUtf8Node) throws PosixException {
/*
* We don't want to link the posix library with libcrypt, because it might not be available
@@ -1963,13 +1965,9 @@ public TruffleString crypt(TruffleString word, TruffleString salt,
if (resultPtr == 0) {
throw getErrnoAndThrowPosixException(invokeNode);
}
- int len = 0;
- while (UNSAFE.getByte(resultPtr + len) != 0) {
- len++;
- }
- byte[] resultBytes = new byte[len];
- UNSAFE.copyMemory(null, resultPtr, resultBytes, Unsafe.ARRAY_BYTE_BASE_OFFSET, len);
- return createString(resultBytes, 0, resultBytes.length, false, fromByteArrayNode, switchEncodingFromUtf8Node);
+ // TODO PyUnicode_DecodeFSDefault
+ TruffleString utf8 = fromZeroTerminatedNativePointerNode.execute8Bit(resultPtr, 0, UTF_8, false);
+ return asManagedNode.execute(switchEncodingFromUtf8Node.execute(utf8, TS_ENCODING), TS_ENCODING);
}
}
@@ -2276,13 +2274,7 @@ UnixSockAddr asUnixSockAddr() {
pathBuf = PythonUtils.arrayCopyOfRange(data, pathOffset, pathOffset + linuxAddrLen);
} else {
// Regular NULL-terminated string
- int pathLen = -1;
- for (int i = pathOffset; i < data.length; i++) {
- if (data[i] == '\0') {
- pathLen = i - pathOffset;
- break;
- }
- }
+ int pathLen = ArrayUtils.indexOf(data, pathOffset, data.length, (byte) 0) - pathOffset;
assert pathLen >= 0;
pathBuf = PythonUtils.arrayCopyOfRange(data, pathOffset, pathOffset + pathLen);
}
@@ -2546,11 +2538,8 @@ private static TruffleString extractZeroTerminatedString(byte[] buffer, long lon
throw outOfMemoryPosixError();
}
int offset = (int) longOffset;
- int end = offset;
- while (end < buffer.length && buffer[end] != '\0') {
- end++;
- }
- if (end == buffer.length) {
+ int end = ArrayUtils.indexOf(buffer, offset, buffer.length, (byte) 0);
+ if (end < 0) {
throw CompilerDirectives.shouldNotReachHere("Could not find the end of the string");
}
// TODO PyUnicode_DecodeFSDefault
diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/util/CharsetMapping.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/util/CharsetMapping.java
index ce80616b39..44271cac80 100644
--- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/util/CharsetMapping.java
+++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/util/CharsetMapping.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* The Universal Permissive License (UPL), Version 1.0
@@ -43,7 +43,6 @@
import static com.oracle.graal.python.util.PythonUtils.TS_ENCODING;
import static com.oracle.graal.python.util.PythonUtils.toInternedTruffleStringUncached;
import static com.oracle.graal.python.util.PythonUtils.toTruffleStringUncached;
-import static com.oracle.graal.python.util.PythonUtils.tsLiteral;
import java.nio.ByteOrder;
import java.nio.charset.Charset;
@@ -76,45 +75,40 @@
* Utility class for mapping Python encodings to Java charsets
*/
public class CharsetMapping {
- private static final Charset UTF_32LE = new PythonUTF32CharsetWrapper(Charset.forName("UTF-32LE"), ByteOrder.LITTLE_ENDIAN);
- private static final Charset UTF_32LE_BOM = new PythonUTF32CharsetWrapper(Charset.forName("UTF-32LE-BOM"), ByteOrder.LITTLE_ENDIAN);
- private static final Charset UTF_32BE = new PythonUTF32CharsetWrapper(Charset.forName("UTF-32BE"), ByteOrder.BIG_ENDIAN);
- private static final Charset UTF_32BE_BOM = new PythonUTF32CharsetWrapper(Charset.forName("UTF-32BE-BOM"), ByteOrder.BIG_ENDIAN);
- private static final ConcurrentMap<String, Charset> JAVA_CHARSETS = new ConcurrentHashMap<>();
+
+ public enum BOM { // byte-order-mark byte sequences, one constant per UTF-16/UTF-32 byte order
+ UTF_16LE(new byte[]{(byte) 0xff, (byte) 0xfe}), // FF FE
+ UTF_16BE(new byte[]{(byte) 0xfe, (byte) 0xff}), // FE FF
+ UTF_32LE(new byte[]{(byte) 0xff, (byte) 0xfe, 0, 0}), // FF FE 00 00
+ UTF_32BE(new byte[]{0, 0, (byte) 0xfe, (byte) 0xff}); // 00 00 FE FF
+
+ public final byte[] bytes; // exposed directly and never copied, so callers must treat the array as read-only
+
+ BOM(byte[] bytes) {
+ this.bytes = bytes;
+ }
+ }
+
+ public static final CharsetWrapper UTF_16LE_BOM = new CharsetWrapper(Charset.forName("UnicodeLittle"), TruffleString.Encoding.UTF_16LE, BOM.UTF_16LE, 2); // registered as "UTF-16" on little-endian hosts
+ public static final CharsetWrapper UTF_16BE_BOM = new CharsetWrapper(StandardCharsets.UTF_16, TruffleString.Encoding.UTF_16BE, BOM.UTF_16BE, 2); // registered as "UTF-16" on big-endian hosts
+
+ public static final CharsetWrapper UTF_32LE = new CharsetWrapper(new PythonUTF32CharsetWrapper(Charset.forName("UTF-32LE"), ByteOrder.LITTLE_ENDIAN), TruffleString.Encoding.UTF_32LE, null, 4); // no BOM attached (bom == null)
+ public static final CharsetWrapper UTF_32LE_BOM = new CharsetWrapper(new PythonUTF32CharsetWrapper(Charset.forName("UTF-32LE-BOM"), ByteOrder.LITTLE_ENDIAN), TruffleString.Encoding.UTF_32LE,
+ BOM.UTF_32LE, 4); // registered as "UTF-32" on little-endian hosts
+ public static final CharsetWrapper UTF_32BE = new CharsetWrapper(new PythonUTF32CharsetWrapper(Charset.forName("UTF-32BE"), ByteOrder.BIG_ENDIAN), TruffleString.Encoding.UTF_32BE, null, 4); // no BOM attached (bom == null)
+ public static final CharsetWrapper UTF_32BE_BOM = new CharsetWrapper(new PythonUTF32CharsetWrapper(Charset.forName("UTF-32BE-BOM"), ByteOrder.BIG_ENDIAN), TruffleString.Encoding.UTF_32BE,
+ BOM.UTF_32BE, 4); // registered as "UTF-32" on big-endian hosts
+
+ private static final ConcurrentMap<String, CharsetWrapper> JAVA_CHARSETS = new ConcurrentHashMap<>(); // keyed by Java charset name; seeded by the static initializer, extended lazily by getJavaCharset
// Name maps are populated by static initializer and are immutable afterwards
private static final Map<TruffleString, String> CHARSET_NAME_MAP = new HashMap<>();
private static final Map<String, TruffleString> CHARSET_NAME_MAP_REVERSE = new HashMap<>();
- private static final TruffleString T_UTF_16_UNDERSCORE = tsLiteral("utf_16");
- private static final TruffleString T_UTF_32_UNDERSCORE = tsLiteral("utf_32");
- @TruffleBoundary
- public static Charset getCharsetNormalized(TruffleString normalizedEncoding) {
- String name = CHARSET_NAME_MAP.get(normalizedEncoding);
- if (name != null) {
- return getJavaCharset(name);
- }
- return null;
+ public record CharsetWrapper(Charset charset, TruffleString.Encoding tStringEncoding, BOM bom, int stride) { // pairs a java.nio Charset with its TruffleString encoding, optional BOM and stride; tStringEncoding and bom are null where none is registered. NOTE(review): stride looks like bytes per code unit (1/2/4) - confirm against callers
}
@TruffleBoundary
- public static Charset getCharsetForDecodingNormalized(TruffleString normalizedEncoding, byte[] bytes, int len) {
- if (ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN) {
- /*
- * JDK's charsets for UTF-16 and UTF-32 default to big endian irrespective of the
- * platform if there is no BOM. The UTF-16-LE and UTF-32-LE charsets reject big endian
- * BOM. CPython defaults to platform endian and accepts both BOMs. So, in order to get
- * the behavior we need, we have to take a peek at the possible BOM and if it has a BOM
- * use the UTF-16/32 encoding and let it detect, otherwise default to UTF-16/32-LE.
- */
- if (T_UTF_16_UNDERSCORE.equalsUncached(normalizedEncoding, TS_ENCODING) && hasUTF16BOM(bytes, len)) {
- return StandardCharsets.UTF_16;
- } else if (T_UTF_32_UNDERSCORE.equalsUncached(normalizedEncoding, TS_ENCODING)) {
- Charset charset = getUTF32CharsetForBOM(bytes, len);
- if (charset != null) {
- return charset;
- }
- }
- }
+ public static CharsetWrapper getCharsetNormalized(TruffleString normalizedEncoding) {
String name = CHARSET_NAME_MAP.get(normalizedEncoding);
if (name != null) {
return getJavaCharset(name);
@@ -122,28 +116,6 @@ public static Charset getCharsetForDecodingNormalized(TruffleString normalizedEn
return null;
}
- private static boolean hasUTF16BOM(byte[] bytes, int len) {
- if (len < 2) {
- return false;
- }
- short head = PythonUtils.ARRAY_ACCESSOR.getShort(bytes, 0);
- return head == (short) 0xFFFE || head == (short) 0xFEFF;
- }
-
- private static Charset getUTF32CharsetForBOM(byte[] bytes, int len) {
- if (len < 4) {
- return null;
- }
- int head = PythonUtils.ARRAY_ACCESSOR.getInt(bytes, 0);
- if (head == 0xFFFE0000) {
- return UTF_32BE_BOM;
- }
- if (head == 0x0000FEFF) {
- return UTF_32LE_BOM;
- }
- return null;
- }
-
@TruffleBoundary
public static TruffleString getPythonEncodingNameFromJavaName(String javaEncodingName) {
return CHARSET_NAME_MAP_REVERSE.get(javaEncodingName.toLowerCase());
@@ -188,19 +160,19 @@ public static TruffleString normalizeUncached(TruffleString encoding) {
return NormalizeEncodingNameNodeGen.getUncached().execute(null, encoding);
}
- public static Charset getJavaCharset(String name) {
+ public static CharsetWrapper getJavaCharset(String name) {
return JAVA_CHARSETS.computeIfAbsent(name, key -> {
// Important: When adding additional ICU4J charset, the implementation class needs to be
// added to reflect-config.json
if (name.equals("UTF-7") || name.equals("HZ")) {
try {
- return CharsetICU.forNameICU(name);
+ return new CharsetWrapper(CharsetICU.forNameICU(name), null, null, 1);
} catch (UnsupportedCharsetException e) {
return null;
}
} else {
try {
- return Charset.forName(name);
+ return new CharsetWrapper(Charset.forName(name), null, null, 1);
} catch (UnsupportedCharsetException e) {
return null;
}
@@ -224,12 +196,12 @@ private static void addAlias(String alias, String pythonName) {
static {
// Pre-initialize standard charset entries
- JAVA_CHARSETS.put("US-ASCII", StandardCharsets.US_ASCII);
- JAVA_CHARSETS.put("ISO-8859-1", StandardCharsets.ISO_8859_1);
- JAVA_CHARSETS.put("UTF-8", StandardCharsets.UTF_8);
- JAVA_CHARSETS.put("UTF-16BE", StandardCharsets.UTF_16BE);
- JAVA_CHARSETS.put("UTF-16LE", StandardCharsets.UTF_16LE);
- JAVA_CHARSETS.put("UTF-16", ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN ? Charset.forName("UnicodeLittle") : StandardCharsets.UTF_16);
+ JAVA_CHARSETS.put("US-ASCII", new CharsetWrapper(StandardCharsets.US_ASCII, TruffleString.Encoding.US_ASCII, null, 1));
+ JAVA_CHARSETS.put("ISO-8859-1", new CharsetWrapper(StandardCharsets.ISO_8859_1, TruffleString.Encoding.ISO_8859_1, null, 1));
+ JAVA_CHARSETS.put("UTF-8", new CharsetWrapper(StandardCharsets.UTF_8, TruffleString.Encoding.UTF_8, null, 1));
+ JAVA_CHARSETS.put("UTF-16BE", new CharsetWrapper(StandardCharsets.UTF_16BE, TruffleString.Encoding.UTF_16BE, null, 2));
+ JAVA_CHARSETS.put("UTF-16LE", new CharsetWrapper(StandardCharsets.UTF_16LE, TruffleString.Encoding.UTF_16LE, null, 2));
+ JAVA_CHARSETS.put("UTF-16", ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN ? UTF_16LE_BOM : UTF_16BE_BOM);
JAVA_CHARSETS.put("UTF-32BE", UTF_32BE);
JAVA_CHARSETS.put("UTF-32LE", UTF_32LE);
JAVA_CHARSETS.put("UTF-32", ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN ? UTF_32LE_BOM : UTF_32BE_BOM);
@@ -238,8 +210,8 @@ private static void addAlias(String alias, String pythonName) {
addMapping("raw_unicode_escape", "x-python-raw-unicode-escape");
addMapping("unicode-escape", "x-python-unicode-escape");
addMapping("unicodeescape", "x-python-unicode-escape");
- JAVA_CHARSETS.put("x-python-raw-unicode-escape", new PythonRawUnicodeEscapeCharset());
- JAVA_CHARSETS.put("x-python-unicode-escape", new PythonUnicodeEscapeCharset());
+ JAVA_CHARSETS.put("x-python-raw-unicode-escape", new CharsetWrapper(new PythonRawUnicodeEscapeCharset(), null, null, 1));
+ JAVA_CHARSETS.put("x-python-unicode-escape", new CharsetWrapper(new PythonUnicodeEscapeCharset(), null, null, 1));
addMapping("ascii", "US-ASCII");
addMapping("big5hkscs", "Big5-HKSCS");