diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/PythonFileDetector.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/PythonFileDetector.java index cfff664162..c9755764a6 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/PythonFileDetector.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/PythonFileDetector.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, 2022, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2017, 2026, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -103,11 +103,11 @@ private static Charset tryGetCharsetFromLine(String line, boolean hasBOM) { if (hasBOM && !normalizedEncoding.equalsUncached(T_UTF_UNDERSCORE_8, TS_ENCODING)) { throw new InvalidEncodingException(encoding + " with BOM"); } - Charset charset = CharsetMapping.getCharsetNormalized(normalizedEncoding); + CharsetMapping.CharsetWrapper charset = CharsetMapping.getCharsetNormalized(normalizedEncoding); if (charset == null) { throw new InvalidEncodingException(encoding); } - return charset; + return charset.charset(); } return null; } diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/BuiltinFunctions.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/BuiltinFunctions.java index b6810489fd..edc79b1f4f 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/BuiltinFunctions.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/BuiltinFunctions.java @@ -1198,7 +1198,7 @@ TruffleString sourceAsString(VirtualFrame frame, Node inliningTarget, Object sou private TruffleString doDecodeSource(Object source, TruffleString filename, byte[] bytes, int bytesLen) { Charset charset = PythonFileDetector.findEncodingStrict(bytes, bytesLen); TruffleString pythonEncoding = CharsetMapping.getPythonEncodingNameFromJavaName(charset.name()); - CodecsModuleBuiltins.TruffleDecoder decoder = new CodecsModuleBuiltins.TruffleDecoder(pythonEncoding, charset, bytes, bytesLen, CodingErrorAction.REPORT); + CodecsModuleBuiltins.TruffleDecoder decoder = new CodecsModuleBuiltins.TruffleDecoder(charset, bytes, bytesLen, CodingErrorAction.REPORT); if (!decoder.decodingStep(true)) { int pos = decoder.getInputPosition(); Object exception = CallNode.executeUncached(PythonBuiltinClassType.UnicodeDecodeError, pythonEncoding, source, pos, pos + decoder.getErrorLength(), decoder.getErrorReason()); diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/CodecsModuleBuiltins.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/CodecsModuleBuiltins.java index f8e1888cf1..68fde3ee71 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/CodecsModuleBuiltins.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/CodecsModuleBuiltins.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2025, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2026, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -72,6 +72,7 @@ import static com.oracle.graal.python.util.PythonUtils.tsLiteral; import java.nio.ByteBuffer; +import java.nio.ByteOrder; import java.nio.CharBuffer; import java.nio.charset.Charset; import java.nio.charset.CharsetDecoder; @@ -133,7 +134,9 @@ import com.oracle.graal.python.runtime.sequence.storage.ObjectSequenceStorage; import com.oracle.graal.python.util.CharsetMapping; import com.oracle.graal.python.util.CharsetMapping.NormalizeEncodingNameNode; +import com.oracle.graal.python.util.PythonUtils; import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary; +import com.oracle.truffle.api.HostCompilerDirectives; import com.oracle.truffle.api.dsl.Bind; import com.oracle.truffle.api.dsl.Cached; import com.oracle.truffle.api.dsl.Fallback; @@ -182,6 +185,44 @@ protected List> getNodeFa return CodecsModuleBuiltinsFactory.getFactories(); } + @GenerateUncached + @GenerateInline + public abstract static class CharsetLookupNode extends Node { + public abstract CharsetMapping.CharsetWrapper execute(Node inliningTarget, TruffleString name); + + @SuppressWarnings("unused") + @Specialization(guards = "name == cachedName", limit = "1") + static CharsetMapping.CharsetWrapper doCachedIdentity(TruffleString name, + @Cached("name") TruffleString cachedName, + @Cached("lookup(name)") CharsetMapping.CharsetWrapper cachedResult) { + return cachedResult; + } + + @SuppressWarnings("unused") + @Specialization(guards = "equals(name, cachedName, equalNode)", limit = "1", replaces = "doCachedIdentity") + static CharsetMapping.CharsetWrapper doCachedEqual(TruffleString name, + @Cached("name") TruffleString cachedName, + @Cached("lookup(name)") CharsetMapping.CharsetWrapper cachedResult, + @Cached TruffleString.EqualNode equalNode) { + return cachedResult; + } + + @Specialization(replaces = "doCachedEqual") + static CharsetMapping.CharsetWrapper doDynamic(Node inliningTarget, TruffleString name, + @Cached NormalizeEncodingNameNode normalizeEncodingNameNode) { + return CharsetMapping.getCharsetNormalized(normalizeEncodingNameNode.execute(inliningTarget, name)); + } + + @SuppressWarnings("unused") + static CharsetMapping.CharsetWrapper lookup(TruffleString name) { + return CharsetMapping.getCharsetNormalized(CharsetMapping.normalizeUncached(name)); + } + + static boolean equals(TruffleString a, TruffleString b, TruffleString.EqualNode equalNode) { + return equalNode.execute(a, b, TS_ENCODING); + } + } + @GenerateUncached @GenerateInline(false) // footprint reduction 48 -> 30 public abstract static class CodecsEncodeToJavaBytesNode extends Node { @@ -191,6 +232,11 @@ public abstract static class CodecsEncodeToJavaBytesNode extends Node { byte[] encode(VirtualFrame frame, Object self, TruffleString encoding, TruffleString errors, @Bind Node inliningTarget, @Cached CastToTruffleStringNode castTruffleStr, + @Cached TruffleString.IsValidNode isValidNode, + @Cached TruffleString.GetCodeRangeNode getCodeRangeNode, + @Cached InlinedConditionProfile fastPathProfile, + @Cached TruffleString.SwitchEncodingNode switchEncodingNode, + @Cached TruffleString.CopyToByteArrayNode copyToByteArrayNode, @Cached TruffleString.ToJavaStringNode toJavaStringNode, @Cached TruffleString.EqualNode equalNode, @Cached ErrorHandlers.CallEncodingErrorHandlerNode errorHandler, @@ -198,17 +244,67 @@ byte[] encode(VirtualFrame frame, Object self, TruffleString encoding, TruffleSt @CachedLibrary(limit = "3") PythonBufferAccessLibrary bufferLib, @Cached CastToJavaStringNode castToJavaStringNode, @Cached PRaiseNode raiseNode, - @Cached NormalizeEncodingNameNode normalizeEncodingNameNode) { + @Cached(inline = true) CharsetLookupNode charsetLookupNode) { TruffleString input = castTruffleStr.castKnownString(inliningTarget, self); - String inputStr = toJavaStringNode.execute(input); - CodingErrorAction errorAction = convertCodingErrorAction(errors, equalNode); - TruffleString normalizedEncoding = normalizeEncodingNameNode.execute(inliningTarget, encoding); - Charset charset = CharsetMapping.getCharsetNormalized(normalizedEncoding); - if (charset == null) { + CharsetMapping.CharsetWrapper charsetWrapper = charsetLookupNode.execute(inliningTarget, encoding); + if (charsetWrapper == null) { throw raiseNode.raise(inliningTarget, LookupError, ErrorMessages.UNKNOWN_ENCODING, encoding); } + TruffleString.Encoding targetTStringEncoding = charsetWrapper.tStringEncoding(); + if (fastPathProfile.profile(inliningTarget, isValidNode.execute(input, TS_ENCODING) && targetTStringEncoding != null)) { + byte[] ret = fastPath(input, getCodeRangeNode, switchEncodingNode, copyToByteArrayNode, targetTStringEncoding, charsetWrapper); + if (ret != null) { + return ret; + } + } + return slowPath(frame, encoding, errors, inliningTarget, toJavaStringNode, equalNode, errorHandler, acquireLib, bufferLib, castToJavaStringNode, raiseNode, input, charsetWrapper); + } + + private static byte[] fastPath(TruffleString input, + TruffleString.GetCodeRangeNode getCodeRangeNode, + TruffleString.SwitchEncodingNode switchEncodingNode, + TruffleString.CopyToByteArrayNode copyToByteArrayNode, + TruffleString.Encoding targetTStringEncoding, + CharsetMapping.CharsetWrapper charsetWrapper) { + if (targetTStringEncoding == TruffleString.Encoding.US_ASCII || targetTStringEncoding == TruffleString.Encoding.ISO_8859_1) { + TruffleString.CodeRange codeRange = getCodeRangeNode.execute(input, TS_ENCODING); + if (codeRange.isSupersetOf(targetTStringEncoding == TruffleString.Encoding.US_ASCII ? TruffleString.CodeRange.LATIN_1 : TruffleString.CodeRange.BMP)) { + // string contains characters that cannot be represented in ASCII / LATIN-1. + // defer to slow path + return null; + } + } + TruffleString transcoded = switchEncodingNode.execute(input, targetTStringEncoding); + CharsetMapping.BOM bom = charsetWrapper.bom(); + byte[] ret = new byte[transcoded.byteLength(targetTStringEncoding) + (bom == null ? 0 : bom.bytes.length)]; + int startIndex; + if (bom == null) { + startIndex = 0; + } else { + System.arraycopy(bom.bytes, 0, ret, 0, bom.bytes.length); + startIndex = bom.bytes.length; + } + copyToByteArrayNode.execute(transcoded, 0, ret, startIndex, transcoded.byteLength(targetTStringEncoding), targetTStringEncoding); + return ret; + } + + @HostCompilerDirectives.InliningCutoff + private byte[] slowPath(VirtualFrame frame, TruffleString encoding, TruffleString errors, + Node inliningTarget, + TruffleString.ToJavaStringNode toJavaStringNode, + TruffleString.EqualNode equalNode, + ErrorHandlers.CallEncodingErrorHandlerNode errorHandler, + PythonBufferAcquireLibrary acquireLib, + PythonBufferAccessLibrary bufferLib, + CastToJavaStringNode castToJavaStringNode, + PRaiseNode raiseNode, + TruffleString input, + CharsetMapping.CharsetWrapper charsetWrapper) { + String inputStr = toJavaStringNode.execute(input); + CodingErrorAction errorAction = convertCodingErrorAction(errors, equalNode); TruffleEncoder encoder; ErrorHandlers.ErrorHandlerCache errorHandlerCache = new ErrorHandlers.ErrorHandlerCache(); + Charset charset = charsetWrapper.charset(); try { encoder = new TruffleEncoder(charset, inputStr, errorAction); while (!encoder.encodingStep()) { @@ -290,8 +386,12 @@ static Object decode(VirtualFrame frame, Object input, TruffleString encoding, T @Cached("createFor($node)") InteropCallData callData, @CachedLibrary(limit = "3") PythonBufferAcquireLibrary acquireLib, @CachedLibrary(limit = "3") PythonBufferAccessLibrary bufferLib, + @Cached TruffleString.FromByteArrayNode fromByteArrayNode, + @Cached TruffleString.IsValidNode isValidNode, + @Cached TruffleString.SwitchEncodingNode switchEncodingNode, + @Cached InlinedConditionProfile fastPathProfile, @Cached TruffleString.EqualNode equalNode, - @Cached NormalizeEncodingNameNode normalizeEncodingNameNode, + @Cached(inline = true) CharsetLookupNode charsetLookupNode, @Cached ErrorHandlers.CallDecodingErrorHandlerNode callDecodingErrorHandlerNode, @Cached TruffleString.ToJavaStringNode toJavaStringNode, @Cached InlinedBranchProfile inputReplaced, @@ -300,16 +400,56 @@ static Object decode(VirtualFrame frame, Object input, TruffleString encoding, T try { int len = bufferLib.getBufferLength(buffer); byte[] bytes = bufferLib.getInternalOrCopiedByteArray(buffer); - CodingErrorAction errorAction = convertCodingErrorAction(errors, equalNode); - TruffleString normalizedEncoding = normalizeEncodingNameNode.execute(inliningTarget, encoding); - Charset charset = CharsetMapping.getCharsetForDecodingNormalized(normalizedEncoding, bytes, len); + CharsetMapping.CharsetWrapper charset = charsetLookupNode.execute(inliningTarget, encoding); if (charset == null) { throw raiseNode.raise(inliningTarget, LookupError, ErrorMessages.UNKNOWN_ENCODING, encoding); } + CharsetMapping.BOM bom = charset.bom(); + int offset = 0; + if (ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN && bom != null) { + /* + * JDK's charsets for UTF-16 and UTF-32 default to big endian irrespective of + * the platform if there is no BOM. The UTF-16-LE and UTF-32-LE charsets reject + * big endian BOM. CPython defaults to platform endian and accepts both BOMs. + * So, in order to get the behavior we need, we have to take a peek at the + * possible BOM and if it has a BOM use the UTF-16/32 encoding and let it + * detect, otherwise default to UTF-16/32-LE. + */ + if (charset == CharsetMapping.UTF_16LE_BOM) { + if (len >= 2) { + short first = PythonUtils.ARRAY_ACCESSOR.getShort(bytes, 0); + if (first == (short) 0xFFFE) { + charset = CharsetMapping.UTF_16BE_BOM; + offset = 2; + } else if (first == (short) 0xFEFF) { + offset = 2; + } + } + } else { + assert charset == CharsetMapping.UTF_32LE_BOM; + if (len >= 4) { + int first = PythonUtils.ARRAY_ACCESSOR.getInt(bytes, 0); + if (first == 0xFFFE0000) { + charset = CharsetMapping.UTF_32BE_BOM; + offset = 4; + } else if (first == 0x0000FEFF) { + offset = 4; + } + } + } + } + TruffleString.Encoding tStringEncoding = charset.tStringEncoding(); + if (tStringEncoding != null && (len & (charset.stride() - 1)) == 0) { + TruffleString direct = fromByteArrayNode.execute(bytes, offset, len - offset, tStringEncoding, true); + if (fastPathProfile.profile(inliningTarget, isValidNode.execute(direct, tStringEncoding))) { + return PFactory.createTuple(language, new Object[]{switchEncodingNode.execute(direct, TS_ENCODING), len}); + } + } + CodingErrorAction errorAction = convertCodingErrorAction(errors, equalNode); ErrorHandlers.ErrorHandlerCache handlerCache = new ErrorHandlers.ErrorHandlerCache(); TruffleDecoder decoder; try { - decoder = new TruffleDecoder(normalizedEncoding, charset, bytes, len, errorAction); + decoder = new TruffleDecoder(charset.charset(), bytes, len, errorAction); while (!decoder.decodingStep(finalData)) { int pos = decoder.getInputPosition(); ErrorHandlers.DecodingErrorHandlerResult result = callDecodingErrorHandlerNode.execute(frame, inliningTarget, handlerCache, errors, encoding, input, @@ -1367,15 +1507,13 @@ public void replace(String replacement, Charset charset) { } static class TruffleDecoder { - private final TruffleString encodingName; private final CharsetDecoder decoder; private ByteBuffer inputBuffer; private CharBuffer outputBuffer; private CoderResult coderResult; @TruffleBoundary - public TruffleDecoder(TruffleString encodingName, Charset charset, byte[] input, int inputLen, CodingErrorAction errorAction) { - this.encodingName = encodingName; + public TruffleDecoder(Charset charset, byte[] input, int inputLen, CodingErrorAction errorAction) { this.inputBuffer = ByteBuffer.wrap(input, 0, inputLen); this.decoder = charset.newDecoder().onMalformedInput(errorAction).onUnmappableCharacter(errorAction); this.outputBuffer = CharBuffer.allocate((int) (inputLen * decoder.averageCharsPerByte())); @@ -1473,8 +1611,5 @@ public void replace(int skipInput, char[] chars, int offset, int length) { inputBuffer.position(inputBuffer.position() + skipInput); } - public TruffleString getEncodingName() { - return encodingName; - } } } diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/MarshalModuleBuiltins.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/MarshalModuleBuiltins.java index 4bf3baf235..76cb8a1a3a 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/MarshalModuleBuiltins.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/MarshalModuleBuiltins.java @@ -1236,12 +1236,12 @@ private void writeShortString(String v) throws IOException { private TruffleString readShortString() { int sz = readByteSize(); byte[] bytes = readNBytes(sz); - return TruffleString.fromByteArrayUncached(bytes, 0, sz, Encoding.ISO_8859_1, false).switchEncodingUncached(TS_ENCODING); + return TruffleString.fromByteArrayWithCompactionUTF32Uncached(bytes, 0, sz, TruffleString.CompactionLevel.S1, false); } private Object readAscii(int sz, boolean intern) { byte[] bytes = readNBytes(sz); - TruffleString value = TruffleString.fromByteArrayUncached(bytes, 0, sz, Encoding.US_ASCII, false).switchEncodingUncached(TS_ENCODING); + TruffleString value = TruffleString.fromByteArrayWithCompactionUTF32Uncached(bytes, 0, sz, TruffleString.CompactionLevel.S1, false); if (intern) { return PythonUtils.internString(value); } else { diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/cext/PythonCextUnicodeBuiltins.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/cext/PythonCextUnicodeBuiltins.java index ea8e21333d..4d3e669ea0 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/cext/PythonCextUnicodeBuiltins.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/cext/PythonCextUnicodeBuiltins.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, 2025, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2021, 2026, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -80,8 +80,6 @@ import static com.oracle.graal.python.nodes.StringLiterals.T_UTF8; import static com.oracle.graal.python.nodes.util.CastToJavaIntLossyNode.castLong; import static com.oracle.graal.python.util.PythonUtils.TS_ENCODING; -import static com.oracle.truffle.api.strings.TruffleString.Encoding.ISO_8859_1; -import static com.oracle.truffle.api.strings.TruffleString.Encoding.UTF_16; import static com.oracle.truffle.api.strings.TruffleString.Encoding.UTF_16LE; import static com.oracle.truffle.api.strings.TruffleString.Encoding.UTF_32LE; import static com.oracle.truffle.api.strings.TruffleString.Encoding.UTF_8; @@ -175,6 +173,7 @@ import com.oracle.truffle.api.strings.TruffleString; import com.oracle.truffle.api.strings.TruffleString.Encoding; import com.oracle.truffle.api.strings.TruffleString.FromNativePointerNode; +import com.oracle.truffle.api.strings.TruffleString.FromNativePointerWithCompactionUTF32Node; import com.oracle.truffle.api.strings.TruffleString.SwitchEncodingNode; import com.oracle.truffle.api.strings.TruffleStringBuilder; import com.oracle.truffle.api.strings.TruffleStringBuilderUTF32; @@ -810,11 +809,11 @@ static Object doGeneric(Object ptr, long elements, int charSize, int isAscii, @CApiBuiltin(ret = PyObjectTransfer, args = {Pointer, Py_ssize_t, Int}, call = Ignored) abstract static class GraalPyPrivate_Unicode_FromUCS extends CApiTernaryBuiltinNode { - private static Encoding encodingFromKind(Node inliningTarget, int kind, PRaiseNode raiseNode) throws PException { + private static TruffleString.CompactionLevel compactionLevelFromKind(Node inliningTarget, int kind, PRaiseNode raiseNode) throws PException { return switch (kind) { - case 1 -> ISO_8859_1; - case 2 -> UTF_16; - case 4 -> TS_ENCODING; + case 1 -> TruffleString.CompactionLevel.S1; + case 2 -> TruffleString.CompactionLevel.S2; + case 4 -> TruffleString.CompactionLevel.S4; default -> throw raiseNode.raiseBadInternalCall(inliningTarget); }; } @@ -822,19 +821,13 @@ private static Encoding encodingFromKind(Node inliningTarget, int kind, PRaiseNo @Specialization static Object doNative(Object ptr, long byteLength, int kind, @Bind Node inliningTarget, - @Cached FromNativePointerNode fromNativePointerNode, - @Cached SwitchEncodingNode switchEncodingNode, + @Cached FromNativePointerWithCompactionUTF32Node fromNativePointerNode, @Cached PRaiseNode raiseNode) { try { int iByteLength = PInt.intValueExact(byteLength); - Encoding srcEncoding = encodingFromKind(inliningTarget, kind, raiseNode); - /* - * TODO(fa): TruffleString does currently not support creating strings from UCS1 and - * UCS2 bytes (GR-44312). Remind: UCS1 and UCS2 are actually compacted UTF-32 bytes. - * For now, we use ISO-8859-1 and UTF-16 but that's not entirely correct. - */ - TruffleString ts = fromNativePointerNode.execute(ptr, 0, iByteLength, srcEncoding, true); - return PFactory.createString(PythonLanguage.get(inliningTarget), switchEncodingNode.execute(ts, TS_ENCODING)); + TruffleString.CompactionLevel compactionLevel = compactionLevelFromKind(inliningTarget, kind, raiseNode); + TruffleString ts = fromNativePointerNode.execute(ptr, 0, iByteLength, compactionLevel, true); + return PFactory.createString(PythonLanguage.get(inliningTarget), ts); } catch (OverflowException e) { throw raiseNode.raise(inliningTarget, MemoryError); } diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/cjkcodecs/MultibytecodecModuleBuiltins.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/cjkcodecs/MultibytecodecModuleBuiltins.java index c7fbfab82e..fc9bc375cd 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/cjkcodecs/MultibytecodecModuleBuiltins.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/cjkcodecs/MultibytecodecModuleBuiltins.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, 2025, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2023, 2026, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -47,7 +47,6 @@ import static com.oracle.graal.python.runtime.exception.PythonErrorType.ValueError; import static com.oracle.graal.python.util.PythonUtils.toTruffleStringUncached; -import java.nio.charset.Charset; import java.util.List; import com.oracle.graal.python.PythonLanguage; @@ -102,13 +101,13 @@ protected static void registerCodec(String name, int cidx, CodecType ct, int mid PythonModule codec, PythonLanguage language) { TruffleString tsName = toTruffleStringUncached(name); TruffleString normalizedEncoding = CharsetMapping.normalizeUncached(tsName); - Charset charset = CharsetMapping.getCharsetNormalized(normalizedEncoding); + CharsetMapping.CharsetWrapper charset = CharsetMapping.getCharsetNormalized(normalizedEncoding); if (charset != null) { if (cidx != -1) { - codecs[cidx] = new MultibyteCodec(tsName, charset, ct); + codecs[cidx] = new MultibyteCodec(tsName, charset.charset(), ct); } if (midx != -1) { - DBCSMap h = maps[midx] = new DBCSMap(name, tsName, charset, mt); + DBCSMap h = maps[midx] = new DBCSMap(name, tsName, charset.charset(), mt); codec.setAttribute(toTruffleStringUncached(h.charsetMapName), PFactory.createCapsuleJavaName(language, h, PyMultibyteCodec_CAPSULE_NAME)); } diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/codecs/CharmapNodes.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/codecs/CharmapNodes.java index bd6962ace1..584f6c5fb3 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/codecs/CharmapNodes.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/codecs/CharmapNodes.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, 2025, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2023, 2026, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -369,15 +369,14 @@ static TruffleString decodeLatin1(VirtualFrame frame, Object data, @SuppressWarn @Shared @Cached("createFor($node)") InteropCallData callData, @CachedLibrary("data") PythonBufferAcquireLibrary bufferAcquireLib, @CachedLibrary(limit = "3") @Shared PythonBufferAccessLibrary bufferLib, - @Cached TruffleString.FromByteArrayNode fromByteArrayNode, + @Cached TruffleString.FromByteArrayWithCompactionUTF32Node fromByteArrayNode, @Cached TruffleString.SwitchEncodingNode switchEncodingNode) { // equivalent of PyUnicode_DecodeLatin1 Object dataBuffer = bufferAcquireLib.acquireReadonly(data, frame, context, context.getLanguage(inliningTarget), callData); try { int len = bufferLib.getBufferLength(dataBuffer); byte[] src = bufferLib.getInternalOrCopiedByteArray(dataBuffer); - TruffleString latin1 = fromByteArrayNode.execute(src, 0, len, TruffleString.Encoding.ISO_8859_1, true); - return switchEncodingNode.execute(latin1, TS_ENCODING); + return fromByteArrayNode.execute(src, 0, len, TruffleString.CompactionLevel.S1, true); } finally { bufferLib.release(dataBuffer, frame, callData); } diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/codecs/ErrorHandlers.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/codecs/ErrorHandlers.java index 62c6d0a47a..c7103d9d74 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/codecs/ErrorHandlers.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/codecs/ErrorHandlers.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, 2025, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2023, 2026, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -364,8 +364,7 @@ static Object doEncode(PBaseException exception, @Cached PyUnicodeEncodeOrTranslateErrorGetStartNode getStartNode, @Cached PyUnicodeEncodeOrTranslateErrorGetEndNode getEndNode, @Cached TruffleString.CodePointAtIndexUTF32Node codePointAtIndexNode, - @Cached TruffleString.FromByteArrayNode fromByteArrayNode, - @Cached TruffleString.SwitchEncodingNode switchEncodingNode) { + @Cached TruffleString.FromByteArrayWithCompactionUTF32Node fromByteArrayNode) { TruffleString src = getObjectNode.execute(inliningTarget, exception); int start = getStartNode.execute(inliningTarget, exception); int end = getEndNode.execute(inliningTarget, exception); @@ -378,8 +377,8 @@ static Object doEncode(PBaseException exception, for (int i = start; i < end; ++i) { pos = appendXmlCharRefReplacement(replacement, pos, codePointAtIndexNode.execute(src, i)); } - TruffleString resultAscii = fromByteArrayNode.execute(replacement, Encoding.US_ASCII, false); - return PFactory.createTuple(language, new Object[]{switchEncodingNode.execute(resultAscii, TS_ENCODING), end}); + TruffleString resultAscii = fromByteArrayNode.execute(replacement, 0, replacement.length, TruffleString.CompactionLevel.S1, false); + return PFactory.createTuple(language, new Object[]{resultAscii, end}); } @Specialization(guards = "!isEncode(inliningTarget, o, pyObjectTypeCheck)", limit = "1") @@ -405,8 +404,7 @@ static Object doDecodeException(VirtualFrame frame, PBaseException exception, @Cached PyUnicodeDecodeErrorGetEndNode getEndNode, @CachedLibrary(limit = "3") PythonBufferAcquireLibrary acquireLib, @CachedLibrary(limit = "3") PythonBufferAccessLibrary accessLib, - @Cached @Shared TruffleString.FromByteArrayNode fromByteArrayNode, - @Cached @Shared TruffleString.SwitchEncodingNode switchEncodingNode) { + @Cached @Shared TruffleString.FromByteArrayWithCompactionUTF32Node fromByteArrayNode) { int start = getStartNode.execute(inliningTarget, exception); int end = getEndNode.execute(inliningTarget, exception); Object object = getObjectNode.execute(inliningTarget, exception); @@ -424,8 +422,8 @@ static Object doDecodeException(VirtualFrame frame, PBaseException exception, } finally { accessLib.release(srcBuf, frame, callData); } - TruffleString resultAscii = fromByteArrayNode.execute(replacement, Encoding.US_ASCII, false); - return PFactory.createTuple(language, new Object[]{switchEncodingNode.execute(resultAscii, TS_ENCODING), end}); + TruffleString resultAscii = fromByteArrayNode.execute(replacement, 0, replacement.length, TruffleString.CompactionLevel.S1, false); + return PFactory.createTuple(language, new Object[]{resultAscii, end}); } @Specialization(guards = "isEncodeOrTranslate(inliningTarget, exception, pyObjectTypeCheck)", limit = "1") @@ -437,8 +435,7 @@ static Object doEncodeOrTranslateException(PBaseException exception, @Cached PyUnicodeEncodeOrTranslateErrorGetStartNode getStartNode, @Cached PyUnicodeEncodeOrTranslateErrorGetEndNode getEndNode, @Cached TruffleString.CodePointAtIndexUTF32Node codePointAtIndexNode, - @Cached @Shared TruffleString.FromByteArrayNode fromByteArrayNode, - @Cached @Shared TruffleString.SwitchEncodingNode switchEncodingNode) { + @Cached @Shared TruffleString.FromByteArrayWithCompactionUTF32Node fromByteArrayNode) { int start = getStartNode.execute(inliningTarget, exception); int end = getEndNode.execute(inliningTarget, exception); TruffleString src = getObjectNode.execute(inliningTarget, exception); @@ -462,8 +459,8 @@ static Object doEncodeOrTranslateException(PBaseException exception, int cp = codePointAtIndexNode.execute(src, i); pos = BytesUtils.unicodeNonAsciiEscape(cp, pos, replacement, true); } - TruffleString resultAscii = fromByteArrayNode.execute(replacement, Encoding.US_ASCII, false); - return PFactory.createTuple(language, new Object[]{switchEncodingNode.execute(resultAscii, TS_ENCODING), end}); + TruffleString resultAscii = fromByteArrayNode.execute(replacement, 0, replacement.length, TruffleString.CompactionLevel.S1, false); + return PFactory.createTuple(language, new Object[]{resultAscii, end}); } @Specialization(guards = "isNeither(inliningTarget, o, pyObjectTypeCheck)", limit = "1") @@ -487,8 +484,7 @@ static Object doEncode(PBaseException exception, @Cached PyUnicodeEncodeOrTranslateErrorGetStartNode getStartNode, @Cached PyUnicodeEncodeOrTranslateErrorGetEndNode getEndNode, @Cached TruffleString.CodePointAtIndexUTF32Node codePointAtIndexNode, - @Cached TruffleString.FromByteArrayNode fromByteArrayNode, - @Cached TruffleString.SwitchEncodingNode switchEncodingNode, + @Cached TruffleString.FromByteArrayWithCompactionUTF32Node fromByteArrayNode, @Cached TruffleString.FromJavaStringNode fromJavaStringNode, @Cached TruffleStringBuilder.AppendStringNode appendStringNode, @Cached TruffleStringBuilder.AppendCodePointNode appendCodePointNode, @@ -512,7 +508,7 @@ static Object doEncode(PBaseException exception, appendCodePointNode.execute(tsb, '}'); } else { int len = BytesUtils.unicodeNonAsciiEscape(cp, 0, buf, true); - appendStringNode.execute(tsb, switchEncodingNode.execute(fromByteArrayNode.execute(buf, 0, len, Encoding.US_ASCII, true), TS_ENCODING)); + appendStringNode.execute(tsb, fromByteArrayNode.execute(buf, 0, len, TruffleString.CompactionLevel.S1, true)); } } return PFactory.createTuple(language, new Object[]{toStringNode.execute(tsb), end}); diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/pickle/PPickler.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/pickle/PPickler.java index 21d27a0e36..c02ee31098 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/pickle/PPickler.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/pickle/PPickler.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2024, 2026, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -1327,7 +1327,7 @@ private void saveBytes(VirtualFrame frame, PythonContext ctx, PPickler pickler, reduceValue = createTuple(ctx.getCore().lookupType(PythonBuiltinClassType.PBytes), createTuple()); } else { PickleState st = getGlobalState(ctx.getCore()); - final TruffleString unicodeStr = PickleUtils.decodeLatin1Strict(getBufferLibrary().getCopiedByteArray(buffer), ensureTsFromByteArray(), ensureTsSwitchEncodingNode()); + final TruffleString unicodeStr = PickleUtils.decodeLatin1Strict(getBufferLibrary().getCopiedByteArray(buffer), ensureTsFromByteArrayWithCompaction()); reduceValue = createTuple(st.codecsEncode, createTuple(unicodeStr, LATIN1)); } // save_reduce() will memoize the object automatically. diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/pickle/PUnpickler.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/pickle/PUnpickler.java index 635f4e2020..4c00d8ac08 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/pickle/PUnpickler.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/pickle/PUnpickler.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2024, 2026, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -871,7 +871,7 @@ private void loadInt(VirtualFrame frame, PUnpickler self) { } try { - long x = PickleUtils.asciiBytesToLong(s, ensureTsParseLongNode(), ensureTsFromByteArray()); + long x = PickleUtils.asciiBytesToLong(s, ensureTsParseLongNode(), ensureTsFromByteArrayWithCompaction()); if (s.length == 3 && (x == 0 || x == 1)) { value = x != 0; } else if (x == (int) x) { @@ -900,7 +900,7 @@ private void loadLong(VirtualFrame frame, PUnpickler self) { s[s.length - 2] = 0; } try { - value = PickleUtils.asciiBytesToLong(s, ensureTsParseLongNode(), ensureTsFromByteArray()); + value = PickleUtils.asciiBytesToLong(s, ensureTsParseLongNode(), ensureTsFromByteArrayWithCompaction()); } catch (TruffleString.NumberFormatException nfe) { value = parseInt(s); } @@ -1505,7 +1505,7 @@ private void loadGet(VirtualFrame frame, PUnpickler self) { } int idx; try { - idx = PickleUtils.asciiBytesToInt(s, ensureTsParseIntNode(), ensureTsFromByteArray()); + idx = PickleUtils.asciiBytesToInt(s, ensureTsParseIntNode(), ensureTsFromByteArrayWithCompaction()); } catch (TruffleString.NumberFormatException nfe) { // TODO handle exception [GR-38101] throw CompilerDirectives.shouldNotReachHere(); @@ -1572,7 +1572,7 @@ private void loadPut(VirtualFrame frame, PUnpickler self) { Object value = self.stack.data[self.stack.size - 1]; int idx; try { - idx = PickleUtils.asciiBytesToInt(s, ensureTsParseIntNode(), ensureTsFromByteArray()); + idx = PickleUtils.asciiBytesToInt(s, ensureTsParseIntNode(), ensureTsFromByteArrayWithCompaction()); } catch (TruffleString.NumberFormatException nfe) { // TODO handle exception [GR-38101] throw CompilerDirectives.shouldNotReachHere(); diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/pickle/PickleUtils.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/pickle/PickleUtils.java index 683df9bc07..276a5c3d0c 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/pickle/PickleUtils.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/pickle/PickleUtils.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2024, 2026, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -257,21 +257,21 @@ public static int getStringSize(byte[] bytes) { } public static TruffleString getValidIntString(byte[] bytes) { - return getValidIntASCIIString(bytes, TruffleString.FromByteArrayNode.getUncached()); + return getValidIntASCIIString(bytes, TruffleString.FromByteArrayWithCompactionUTF32Node.getUncached()); } - public static int asciiBytesToInt(byte[] bytes, TruffleString.ParseIntNode parseIntNode, TruffleString.FromByteArrayNode fromByteArrayNode) + public static int asciiBytesToInt(byte[] bytes, TruffleString.ParseIntNode parseIntNode, TruffleString.FromByteArrayWithCompactionUTF32Node fromByteArrayNode) throws TruffleString.NumberFormatException { return parseIntNode.execute(getValidIntASCIIString(bytes, fromByteArrayNode), 10); } - public static long asciiBytesToLong(byte[] bytes, TruffleString.ParseLongNode parseLongNode, TruffleString.FromByteArrayNode fromByteArrayNode) + public static long asciiBytesToLong(byte[] bytes, TruffleString.ParseLongNode parseLongNode, TruffleString.FromByteArrayWithCompactionUTF32Node fromByteArrayNode) throws TruffleString.NumberFormatException { return parseLongNode.execute(getValidIntASCIIString(bytes, fromByteArrayNode), 10); } - private static TruffleString getValidIntASCIIString(byte[] bytes, TruffleString.FromByteArrayNode fromByteArray) { - return fromByteArray.execute(bytes, 0, getStringSize(bytes), TruffleString.Encoding.US_ASCII, true); + private static TruffleString getValidIntASCIIString(byte[] bytes, TruffleString.FromByteArrayWithCompactionUTF32Node fromByteArray) { + return fromByteArray.execute(bytes, 0, getStringSize(bytes), TruffleString.CompactionLevel.S1, true); } @TruffleBoundary @@ -316,8 +316,8 @@ public static TruffleString decodeUTF8Strict(byte[] data, int len, TruffleString return decodeStrict(data, len, fromByteArrayNode, TruffleString.Encoding.UTF_8, switchEncodingNode); } - public static TruffleString decodeLatin1Strict(byte[] data, TruffleString.FromByteArrayNode fromByteArrayNode, TruffleString.SwitchEncodingNode switchEncodingNode) { - return decodeStrict(data, data.length, fromByteArrayNode, TruffleString.Encoding.ISO_8859_1, switchEncodingNode); + public static TruffleString decodeLatin1Strict(byte[] data, TruffleString.FromByteArrayWithCompactionUTF32Node fromByteArrayNode) { + return fromByteArrayNode.execute(data, 0, data.length, TruffleString.CompactionLevel.S1, true); } private static TruffleString decodeStrict(byte[] data, int len, TruffleString.FromByteArrayNode fromByteArrayNode, TruffleString.Encoding encoding, diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/pickle/PicklerNodes.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/pickle/PicklerNodes.java index e838b83e36..6e38abab05 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/pickle/PicklerNodes.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/pickle/PicklerNodes.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2024, 2026, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -152,6 +152,7 @@ abstract static class BasePickleNode extends Node { @Child private BytesNodes.ToBytesNode toBytesNode; @Child private PyObjectReprAsTruffleStringNode reprNode; @Child private TruffleString.FromByteArrayNode tsFromByteArrayNode; + @Child private TruffleString.FromByteArrayWithCompactionUTF32Node tsFromByteArrayWithCompactionNode; @Child private TruffleString.CodePointLengthNode tsCodePointLengthNode; @Child private TruffleString.CodePointAtIndexUTF32Node tsCodePointAtIndexUTF32Node; @Child private TruffleString.FromLongNode tsFromLongNode; @@ -190,6 +191,14 @@ protected TruffleString.FromByteArrayNode ensureTsFromByteArray() { return tsFromByteArrayNode; } + protected TruffleString.FromByteArrayWithCompactionUTF32Node ensureTsFromByteArrayWithCompaction() { + if (tsFromByteArrayWithCompactionNode == null) { + CompilerDirectives.transferToInterpreterAndInvalidate(); + tsFromByteArrayWithCompactionNode = insert(TruffleString.FromByteArrayWithCompactionUTF32Node.create()); + } + return tsFromByteArrayWithCompactionNode; + } + protected TruffleString.CodePointLengthNode ensureTsCodePointLengthNode() { if (tsCodePointLengthNode == null) { CompilerDirectives.transferToInterpreterAndInvalidate(); diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/bytes/BytesNodes.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/bytes/BytesNodes.java index 174262daa0..8daf58a4f1 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/bytes/BytesNodes.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/bytes/BytesNodes.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2025, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2026, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -40,7 +40,6 @@ */ package com.oracle.graal.python.builtins.objects.bytes; -import static com.oracle.graal.python.builtins.objects.bytes.BytesUtils.createASCIIString; import static com.oracle.graal.python.builtins.objects.bytes.BytesUtils.isSpace; import static com.oracle.graal.python.builtins.objects.cext.structs.CFields.PyBytesObject__ob_sval; import static com.oracle.graal.python.builtins.objects.cext.structs.CFields.PyVarObject__ob_size; @@ -707,8 +706,7 @@ public abstract static class ByteToHexNode extends PNodeWithContext { @Specialization(guards = "bytesPerSepGroup == 0") static TruffleString zero(byte[] argbuf, int arglen, @SuppressWarnings("unused") byte sep, @SuppressWarnings("unused") int bytesPerSepGroup, - @Shared @Cached TruffleString.FromByteArrayNode fromByteArrayNode, - @Shared @Cached TruffleString.SwitchEncodingNode switchEncodingNode) { + @Shared @Cached TruffleString.FromByteArrayWithCompactionUTF32Node fromByteArrayNode) { int resultlen = arglen * 2; byte[] retbuf = new byte[resultlen]; @@ -719,14 +717,13 @@ static TruffleString zero(byte[] argbuf, int arglen, @SuppressWarnings("unused") retbuf[j++] = BytesUtils.HEXDIGITS[c >>> 4]; retbuf[j++] = BytesUtils.HEXDIGITS[c & 0x0f]; } - return createASCIIString(retbuf, fromByteArrayNode, switchEncodingNode); + return fromByteArrayNode.execute(retbuf, 0, retbuf.length, TruffleString.CompactionLevel.S1, false); } @Specialization(guards = "bytesPerSepGroup < 0") static TruffleString negative(Node inliningTarget, byte[] argbuf, int arglen, byte sep, int bytesPerSepGroup, @Shared @Cached InlinedConditionProfile earlyExit, - @Shared @Cached TruffleString.FromByteArrayNode fromByteArrayNode, - @Shared @Cached TruffleString.SwitchEncodingNode switchEncodingNode, + @Shared @Cached TruffleString.FromByteArrayWithCompactionUTF32Node fromByteArrayNode, @Shared @Cached PRaiseNode raiseNode) { if (earlyExit.profile(inliningTarget, arglen == 0)) { return T_EMPTY_STRING; @@ -741,7 +738,7 @@ static TruffleString negative(Node inliningTarget, byte[] argbuf, int arglen, by resultlen += arglen * 2; if (absBytesPerSepGroup >= arglen) { - return zero(argbuf, arglen, sep, 0, fromByteArrayNode, switchEncodingNode); + return zero(argbuf, arglen, sep, 0, fromByteArrayNode); } byte[] retbuf = new byte[resultlen]; @@ -761,14 +758,13 @@ static TruffleString negative(Node inliningTarget, byte[] argbuf, int arglen, by retbuf[j++] = BytesUtils.HEXDIGITS[c & 0x0f]; } - return createASCIIString(retbuf, fromByteArrayNode, switchEncodingNode); + return fromByteArrayNode.execute(retbuf, 0, retbuf.length, TruffleString.CompactionLevel.S1, false); } @Specialization(guards = "absBytesPerSepGroup > 0") static TruffleString positive(Node inliningTarget, byte[] argbuf, int arglen, byte sep, int absBytesPerSepGroup, @Shared @Cached InlinedConditionProfile earlyExit, - @Shared @Cached TruffleString.FromByteArrayNode fromByteArrayNode, - @Shared @Cached TruffleString.SwitchEncodingNode switchEncodingNode, + @Shared @Cached TruffleString.FromByteArrayWithCompactionUTF32Node fromByteArrayNode, @Shared @Cached PRaiseNode raiseNode) { if (earlyExit.profile(inliningTarget, arglen == 0)) { return T_EMPTY_STRING; @@ -783,7 +779,7 @@ static TruffleString positive(Node inliningTarget, byte[] argbuf, int arglen, by resultlen += arglen * 2; if (absBytesPerSepGroup >= arglen) { - return zero(argbuf, arglen, sep, 0, fromByteArrayNode, switchEncodingNode); + return zero(argbuf, arglen, sep, 0, fromByteArrayNode); } byte[] retbuf = new byte[resultlen]; @@ -803,7 +799,7 @@ static TruffleString positive(Node inliningTarget, byte[] argbuf, int arglen, by retbuf[j--] = BytesUtils.HEXDIGITS[c & 0x0f]; retbuf[j--] = BytesUtils.HEXDIGITS[c >>> 4]; } - return createASCIIString(retbuf, fromByteArrayNode, switchEncodingNode); + return fromByteArrayNode.execute(retbuf, 0, retbuf.length, TruffleString.CompactionLevel.S1, false); } } diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/bytes/BytesUtils.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/bytes/BytesUtils.java index 8b6c1f2bb2..9906e17073 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/bytes/BytesUtils.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/bytes/BytesUtils.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, 2025, Oracle and/or its affiliates. + * Copyright (c) 2017, 2026, Oracle and/or its affiliates. * Copyright (c) 2014, Regents of the University of California * * All rights reserved. @@ -25,14 +25,11 @@ */ package com.oracle.graal.python.builtins.objects.bytes; -import static com.oracle.graal.python.util.PythonUtils.TS_ENCODING; - import java.io.ByteArrayOutputStream; import com.oracle.truffle.api.CompilerAsserts; import com.oracle.truffle.api.CompilerDirectives.CompilationFinal; import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary; -import com.oracle.truffle.api.strings.TruffleString; import com.oracle.truffle.api.strings.TruffleStringBuilder; import com.oracle.truffle.api.strings.TruffleStringBuilderUTF32; @@ -473,11 +470,6 @@ public static int digitValue(byte hexChar) { return 37; } - @TruffleBoundary - public static TruffleString createASCIIString(byte[] retbuf, TruffleString.FromByteArrayNode fromByteArrayNode, TruffleString.SwitchEncodingNode switchEncodingNode) { - return switchEncodingNode.execute(fromByteArrayNode.execute(retbuf, TruffleString.Encoding.US_ASCII), TS_ENCODING); - } - @TruffleBoundary public static ByteArrayOutputStream createOutputStream() { return new ByteArrayOutputStream(); diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/str/NativeStringData.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/str/NativeStringData.java index abfd18d625..c1ca2d8d36 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/str/NativeStringData.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/str/NativeStringData.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2025, 2026, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -80,15 +80,14 @@ public int length() { return storage.length(); } - public TruffleString toTruffleString(TruffleString.FromNativePointerNode fromNativePointerNode) { - TruffleString.Encoding encoding = switch (kind) { - case KIND_ASCII -> TruffleString.Encoding.US_ASCII; - case KIND_1BYTE -> TruffleString.Encoding.ISO_8859_1; - case KIND_2BYTE -> TruffleString.Encoding.UTF_16; - case KIND_4BYTE -> TruffleString.Encoding.UTF_32; + public TruffleString toTruffleString(TruffleString.FromNativePointerWithCompactionUTF32Node fromNativePointerNode) { + TruffleString.CompactionLevel compactionLevel = switch (kind) { + case KIND_ASCII, KIND_1BYTE -> TruffleString.CompactionLevel.S1; + case KIND_2BYTE -> TruffleString.CompactionLevel.S2; + case KIND_4BYTE -> TruffleString.CompactionLevel.S4; default -> throw CompilerDirectives.shouldNotReachHere(); }; // NativeByteSequenceStorage implements asPointer - return fromNativePointerNode.execute(storage, 0, storage.length(), encoding, false); + return fromNativePointerNode.execute(storage, 0, storage.length(), compactionLevel, false); } } diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/str/StringBuiltins.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/str/StringBuiltins.java index 6b7afc2514..f082764b07 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/str/StringBuiltins.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/str/StringBuiltins.java @@ -892,7 +892,7 @@ static TruffleString lowerAscii(TruffleString self, @Cached TruffleString.SwitchEncodingNode switchEncodingNode, @Cached TruffleString.ByteIndexOfCodePointSetNode indexOfCodePointSetNode, @Cached TruffleString.CopyToByteArrayNode copyToByteArrayNode, - @Cached TruffleString.FromByteArrayNode fromByteArrayNode) { + @Cached TruffleString.FromByteArrayWithCompactionUTF32Node fromByteArrayNode) { TruffleString ascii = switchEncodingNode.execute(self, Encoding.US_ASCII); int i = indexOfCodePointSetNode.execute(ascii, 0, ascii.byteLength(Encoding.US_ASCII), ASCII_UPPER); if (i < 0) { @@ -905,7 +905,7 @@ static TruffleString lowerAscii(TruffleString self, buf[i] = (byte) (buf[i] - 'A' + 'a'); } } - return switchEncodingNode.execute(fromByteArrayNode.execute(buf, Encoding.US_ASCII, false), TS_ENCODING); + return fromByteArrayNode.execute(buf, 0, buf.length, TruffleString.CompactionLevel.S1, false); } @Specialization(guards = "!isAscii(self, getCodeRangeNode)") @@ -939,7 +939,7 @@ static TruffleString upperAscii(TruffleString self, @Cached TruffleString.SwitchEncodingNode switchEncodingNode, @Cached TruffleString.ByteIndexOfCodePointSetNode indexOfCodePointSetNode, @Cached TruffleString.CopyToByteArrayNode copyToByteArrayNode, - @Cached TruffleString.FromByteArrayNode fromByteArrayNode) { + @Cached TruffleString.FromByteArrayWithCompactionUTF32Node fromByteArrayNode) { TruffleString ascii = switchEncodingNode.execute(self, Encoding.US_ASCII); int i = indexOfCodePointSetNode.execute(ascii, 0, ascii.byteLength(Encoding.US_ASCII), ASCII_LOWER); if (i < 0) { @@ -952,7 +952,7 @@ static TruffleString upperAscii(TruffleString self, buf[i] = (byte) (buf[i] - 'a' + 'A'); } } - return switchEncodingNode.execute(fromByteArrayNode.execute(buf, Encoding.US_ASCII, false), TS_ENCODING); + return fromByteArrayNode.execute(buf, 0, buf.length, TruffleString.CompactionLevel.S1, false); } @Specialization(guards = "!isAscii(self, getCodeRangeNode)") diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/str/StringNodes.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/str/StringNodes.java index 1f7217c5bd..2d40036a59 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/str/StringNodes.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/str/StringNodes.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, 2025, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2019, 2026, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -125,7 +125,7 @@ static TruffleString doMaterialized(PString x) { @InliningCutoff static TruffleString doNative(Node inliningTarget, PString x, @Cached HiddenAttr.ReadNode readAttrNode, - @Cached TruffleString.FromNativePointerNode fromNativePointerNode) { + @Cached TruffleString.FromNativePointerWithCompactionUTF32Node fromNativePointerNode) { NativeStringData nativeData = x.getNativeStringData(inliningTarget, readAttrNode); TruffleString materialized = nativeData.toTruffleString(fromNativePointerNode); x.setMaterialized(materialized); diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/struct/StructBuiltins.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/struct/StructBuiltins.java index 48faca88dc..cef73d8aa4 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/struct/StructBuiltins.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/objects/struct/StructBuiltins.java @@ -1,4 +1,4 @@ -/* Copyright (c) 2020, 2025, Oracle and/or its affiliates. +/* Copyright (c) 2020, 2026, Oracle and/or its affiliates. * Copyright (C) 1996-2020 Python Software Foundation * * Licensed under the PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2 @@ -63,7 +63,6 @@ import static com.oracle.graal.python.nodes.ErrorMessages.UNPACK_REQ_A_BUFFER_OF_N_BYTES; import static com.oracle.graal.python.runtime.exception.PythonErrorType.StructError; import static com.oracle.graal.python.runtime.exception.PythonErrorType.TypeError; -import static com.oracle.graal.python.util.PythonUtils.TS_ENCODING; import java.nio.ByteOrder; import java.util.HashSet; @@ -678,9 +677,9 @@ protected Object get(PStruct self) { public abstract static class GetStructFormat extends PythonBuiltinNode { @Specialization protected Object get(PStruct self, - @Cached TruffleString.FromByteArrayNode fromBytes, - @Cached TruffleString.SwitchEncodingNode switchEncoding) { - return switchEncoding.execute(fromBytes.execute(self.getFormat(), TruffleString.Encoding.US_ASCII), TS_ENCODING); + @Cached TruffleString.FromByteArrayWithCompactionUTF32Node fromBytes) { + byte[] format = self.getFormat(); + return fromBytes.execute(format, 0, format.length, TruffleString.CompactionLevel.S1, false); } } } diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/lib/PyNumberLongNode.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/lib/PyNumberLongNode.java index 831553f7ed..fe4e005d79 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/lib/PyNumberLongNode.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/lib/PyNumberLongNode.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2025, 2026, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -44,7 +44,6 @@ import static com.oracle.graal.python.nodes.SpecialMethodNames.J___INT__; import static com.oracle.graal.python.nodes.SpecialMethodNames.J___TRUNC__; import static com.oracle.graal.python.nodes.SpecialMethodNames.T___TRUNC__; -import static com.oracle.graal.python.util.PythonUtils.TS_ENCODING; import com.oracle.graal.python.builtins.PythonBuiltinClassType; import com.oracle.graal.python.builtins.modules.WarningsModuleBuiltins; @@ -241,8 +240,7 @@ public abstract static class LongFromBufferNode extends Node { @InliningCutoff static Object doGeneric(VirtualFrame frame, Object object, int base, @Bind Node inliningTarget, - @Cached TruffleString.FromByteArrayNode fromByteArrayNode, - @Cached TruffleString.SwitchEncodingNode switchEncodingNode, + @Cached TruffleString.FromByteArrayWithCompactionUTF32Node fromByteArrayNode, @Cached PyLongFromUnicodeObject fromString, @Cached(value = "createFor($node)") InteropCallData callData, @CachedLibrary(limit = "3") PythonBufferAcquireLibrary acquireLib, @@ -256,8 +254,7 @@ static Object doGeneric(VirtualFrame frame, Object object, int base, try { byte[] bytes = bufferLib.getInternalOrCopiedByteArray(buffer); int len = bufferLib.getBufferLength(buffer); - TruffleString string = fromByteArrayNode.execute(bytes, 0, len, TruffleString.Encoding.US_ASCII, false); - string = switchEncodingNode.execute(string, TS_ENCODING); + TruffleString string = fromByteArrayNode.execute(bytes, 0, len, TruffleString.CompactionLevel.S1, false); return fromString.execute(inliningTarget, string, base, bytes, len); } finally { bufferLib.release(buffer); diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/lib/PyObjectAsciiNode.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/lib/PyObjectAsciiNode.java index f1d4063e88..ced7a5f676 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/lib/PyObjectAsciiNode.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/lib/PyObjectAsciiNode.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, 2025, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2021, 2026, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -54,7 +54,6 @@ import com.oracle.truffle.api.frame.VirtualFrame; import com.oracle.truffle.api.nodes.Node; import com.oracle.truffle.api.strings.TruffleString; -import com.oracle.truffle.api.strings.TruffleString.Encoding; import com.oracle.truffle.api.strings.TruffleStringIterator; /** @@ -81,8 +80,7 @@ public static TruffleString ascii(VirtualFrame frame, Node inliningTarget, Objec @Cached TruffleString.CreateCodePointIteratorNode createCodePointIteratorNode, @Cached TruffleStringIterator.NextNode nextNode, @Cached TruffleString.CodePointLengthNode codePointLengthNode, - @Cached TruffleString.FromByteArrayNode fromByteArrayNode, - @Cached TruffleString.SwitchEncodingNode switchEncodingNode) { + @Cached TruffleString.FromByteArrayWithCompactionUTF32Node fromByteArrayNode) { // TODO GR-37220: rewrite using TruffleStringBuilder? TruffleString repr = reprNode.execute(frame, inliningTarget, obj); if (getCodeRangeNode.execute(repr, TS_ENCODING) == TruffleString.CodeRange.ASCII) { @@ -95,7 +93,7 @@ public static TruffleString ascii(VirtualFrame frame, Node inliningTarget, Objec int ch = nextNode.execute(it, TS_ENCODING); j = unicodeNonAsciiEscape(ch, j, bytes); } - return switchEncodingNode.execute(fromByteArrayNode.execute(bytes, 0, j, Encoding.US_ASCII, true), TS_ENCODING); + return fromByteArrayNode.execute(bytes, 0, j, TruffleString.CompactionLevel.S1, true); } @NeverDefault diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/nodes/util/CastToTruffleStringNode.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/nodes/util/CastToTruffleStringNode.java index b0d2b72d56..2d97f52b58 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/nodes/util/CastToTruffleStringNode.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/nodes/util/CastToTruffleStringNode.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, 2025, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2022, 2026, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -72,7 +72,6 @@ import com.oracle.truffle.api.library.CachedLibrary; import com.oracle.truffle.api.nodes.Node; import com.oracle.truffle.api.strings.TruffleString; -import com.oracle.truffle.api.strings.TruffleString.Encoding; /** * Casts a Python string to a TruffleString without coercion. ATTENTION: If the cast fails, @@ -144,31 +143,31 @@ static TruffleString read(Object pointer, @Cached CStructAccess.ReadPointerNode readPointer, @Cached CStructAccess.ReadByteNode readByte, @CachedLibrary(limit = "3") InteropLibrary lib, - @Cached TruffleString.FromNativePointerNode fromNative, - @Cached TruffleString.FromByteArrayNode fromBytes) { + @Cached TruffleString.FromNativePointerWithCompactionUTF32Node fromNative, + @Cached TruffleString.FromByteArrayWithCompactionUTF32Node fromBytes) { int state = readI32.read(pointer, PyASCIIObject__state); int kind = (state >> CFields.PyASCIIObject__state_kind_shift) & 0x7; Object data = readPointer.read(pointer, PyUnicodeObject__data); long length = readI64.read(pointer, PyASCIIObject__length); - Encoding encoding; + TruffleString.CompactionLevel compactionLevel; if (kind == 1) { // isBitSet(state, PyASCIIObject__state_ascii_shift)) // ascii doesn't matter, codepoint 0-127 are the same in ascii and latin1 - encoding = Encoding.ISO_8859_1; + compactionLevel = TruffleString.CompactionLevel.S1; } else if (kind == 2) { - encoding = Encoding.UTF_16LE; + compactionLevel = TruffleString.CompactionLevel.S2; } else { assert kind == 4; - encoding = Encoding.UTF_32LE; + compactionLevel = TruffleString.CompactionLevel.S4; } int bytes = PythonUtils.toIntError(length * kind); if (lib.isPointer(data) || data instanceof Long) { - return fromNative.execute(data, 0, bytes, encoding, false); + return fromNative.execute(data, 0, bytes, compactionLevel, false); } byte[] result = readByte.readByteArray(data, bytes); - return fromBytes.execute(result, encoding, false); + return fromBytes.execute(result, 0, result.length, compactionLevel, false); } } diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/NFIPosixSupport.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/NFIPosixSupport.java index b1b36a4ec3..536aa98596 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/NFIPosixSupport.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/runtime/NFIPosixSupport.java @@ -118,6 +118,7 @@ import com.oracle.graal.python.util.FunctionWithSignature; import com.oracle.graal.python.util.OverflowException; import com.oracle.graal.python.util.PythonUtils; +import com.oracle.truffle.api.ArrayUtils; import com.oracle.truffle.api.CompilerAsserts; import com.oracle.truffle.api.CompilerDirectives; import com.oracle.truffle.api.CompilerDirectives.CompilationFinal; @@ -1919,7 +1920,8 @@ public TruffleString crypt(TruffleString word, TruffleString salt, @Shared("invoke") @Cached InvokeNativeFunction invokeNode, @Shared("toUtf8") @Cached TruffleString.SwitchEncodingNode switchEncodingToUtf8Node, @Shared("tsCopyBytes") @Cached TruffleString.CopyToByteArrayNode copyToByteArrayNode, - @Shared("tsFromBytes") @Cached TruffleString.FromByteArrayNode fromByteArrayNode, + @Cached TruffleString.FromZeroTerminatedNativePointerNode fromZeroTerminatedNativePointerNode, + @Cached TruffleString.AsManagedNode asManagedNode, @Shared("fromUtf8") @Cached TruffleString.SwitchEncodingNode switchEncodingFromUtf8Node) throws PosixException { /* * We don't want to link the posix library with libcrypt, because it might not be available @@ -1963,13 +1965,9 @@ public TruffleString crypt(TruffleString word, TruffleString salt, if (resultPtr == 0) { throw getErrnoAndThrowPosixException(invokeNode); } - int len = 0; - while (UNSAFE.getByte(resultPtr + len) != 0) { - len++; - } - byte[] resultBytes = new byte[len]; - UNSAFE.copyMemory(null, resultPtr, resultBytes, Unsafe.ARRAY_BYTE_BASE_OFFSET, len); - return createString(resultBytes, 0, resultBytes.length, false, fromByteArrayNode, switchEncodingFromUtf8Node); + // TODO PyUnicode_DecodeFSDefault + TruffleString utf8 = fromZeroTerminatedNativePointerNode.execute8Bit(resultPtr, 0, UTF_8, false); + return asManagedNode.execute(switchEncodingFromUtf8Node.execute(utf8, TS_ENCODING), TS_ENCODING); } } @@ -2276,13 +2274,7 @@ UnixSockAddr asUnixSockAddr() { pathBuf = PythonUtils.arrayCopyOfRange(data, pathOffset, pathOffset + linuxAddrLen); } else { // Regular NULL-terminated string - int pathLen = -1; - for (int i = pathOffset; i < data.length; i++) { - if (data[i] == '\0') { - pathLen = i - pathOffset; - break; - } - } + int pathLen = ArrayUtils.indexOf(data, pathOffset, data.length, (byte) 0) - pathOffset; assert pathLen >= 0; pathBuf = PythonUtils.arrayCopyOfRange(data, pathOffset, pathOffset + pathLen); } @@ -2546,11 +2538,8 @@ private static TruffleString extractZeroTerminatedString(byte[] buffer, long lon throw outOfMemoryPosixError(); } int offset = (int) longOffset; - int end = offset; - while (end < buffer.length && buffer[end] != '\0') { - end++; - } - if (end == buffer.length) { + int end = ArrayUtils.indexOf(buffer, offset, buffer.length, (byte) 0); + if (end < 0) { throw CompilerDirectives.shouldNotReachHere("Could not find the end of the string"); } // TODO PyUnicode_DecodeFSDefault diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/util/CharsetMapping.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/util/CharsetMapping.java index ce80616b39..44271cac80 100644 --- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/util/CharsetMapping.java +++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/util/CharsetMapping.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * The Universal Permissive License (UPL), Version 1.0 @@ -43,7 +43,6 @@ import static com.oracle.graal.python.util.PythonUtils.TS_ENCODING; import static com.oracle.graal.python.util.PythonUtils.toInternedTruffleStringUncached; import static com.oracle.graal.python.util.PythonUtils.toTruffleStringUncached; -import static com.oracle.graal.python.util.PythonUtils.tsLiteral; import java.nio.ByteOrder; import java.nio.charset.Charset; @@ -76,45 +75,40 @@ * Utility class for mapping Python encodings to Java charsets */ public class CharsetMapping { - private static final Charset UTF_32LE = new PythonUTF32CharsetWrapper(Charset.forName("UTF-32LE"), ByteOrder.LITTLE_ENDIAN); - private static final Charset UTF_32LE_BOM = new PythonUTF32CharsetWrapper(Charset.forName("UTF-32LE-BOM"), ByteOrder.LITTLE_ENDIAN); - private static final Charset UTF_32BE = new PythonUTF32CharsetWrapper(Charset.forName("UTF-32BE"), ByteOrder.BIG_ENDIAN); - private static final Charset UTF_32BE_BOM = new PythonUTF32CharsetWrapper(Charset.forName("UTF-32BE-BOM"), ByteOrder.BIG_ENDIAN); - private static final ConcurrentMap JAVA_CHARSETS = new ConcurrentHashMap<>(); + + public enum BOM { + UTF_16LE(new byte[]{(byte) 0xff, (byte) 0xfe}), + UTF_16BE(new byte[]{(byte) 0xfe, (byte) 0xff}), + UTF_32LE(new byte[]{(byte) 0xff, (byte) 0xfe, 0, 0}), + UTF_32BE(new byte[]{0, 0, (byte) 0xfe, (byte) 0xff}); + + public final byte[] bytes; + + BOM(byte[] bytes) { + this.bytes = bytes; + } + } + + public static final CharsetWrapper UTF_16LE_BOM = new CharsetWrapper(Charset.forName("UnicodeLittle"), TruffleString.Encoding.UTF_16LE, BOM.UTF_16LE, 2); + public static final CharsetWrapper UTF_16BE_BOM = new CharsetWrapper(StandardCharsets.UTF_16, TruffleString.Encoding.UTF_16BE, BOM.UTF_16BE, 2); + + public static final CharsetWrapper UTF_32LE = new CharsetWrapper(new PythonUTF32CharsetWrapper(Charset.forName("UTF-32LE"), ByteOrder.LITTLE_ENDIAN), TruffleString.Encoding.UTF_32LE, null, 4); + public static final CharsetWrapper UTF_32LE_BOM = new CharsetWrapper(new PythonUTF32CharsetWrapper(Charset.forName("UTF-32LE-BOM"), ByteOrder.LITTLE_ENDIAN), TruffleString.Encoding.UTF_32LE, + BOM.UTF_32LE, 4); + public static final CharsetWrapper UTF_32BE = new CharsetWrapper(new PythonUTF32CharsetWrapper(Charset.forName("UTF-32BE"), ByteOrder.BIG_ENDIAN), TruffleString.Encoding.UTF_32BE, null, 4); + public static final CharsetWrapper UTF_32BE_BOM = new CharsetWrapper(new PythonUTF32CharsetWrapper(Charset.forName("UTF-32BE-BOM"), ByteOrder.BIG_ENDIAN), TruffleString.Encoding.UTF_32BE, + BOM.UTF_32BE, 4); + + private static final ConcurrentMap JAVA_CHARSETS = new ConcurrentHashMap<>(); // Name maps are populated by static initializer and are immutable afterwards private static final Map CHARSET_NAME_MAP = new HashMap<>(); private static final Map CHARSET_NAME_MAP_REVERSE = new HashMap<>(); - private static final TruffleString T_UTF_16_UNDERSCORE = tsLiteral("utf_16"); - private static final TruffleString T_UTF_32_UNDERSCORE = tsLiteral("utf_32"); - @TruffleBoundary - public static Charset getCharsetNormalized(TruffleString normalizedEncoding) { - String name = CHARSET_NAME_MAP.get(normalizedEncoding); - if (name != null) { - return getJavaCharset(name); - } - return null; + public record CharsetWrapper(Charset charset, TruffleString.Encoding tStringEncoding, BOM bom, int stride) { } @TruffleBoundary - public static Charset getCharsetForDecodingNormalized(TruffleString normalizedEncoding, byte[] bytes, int len) { - if (ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN) { - /* - * JDK's charsets for UTF-16 and UTF-32 default to big endian irrespective of the - * platform if there is no BOM. The UTF-16-LE and UTF-32-LE charsets reject big endian - * BOM. CPython defaults to platform endian and accepts both BOMs. So, in order to get - * the behavior we need, we have to take a peek at the possible BOM and if it has a BOM - * use the UTF-16/32 encoding and let it detect, otherwise default to UTF-16/32-LE. - */ - if (T_UTF_16_UNDERSCORE.equalsUncached(normalizedEncoding, TS_ENCODING) && hasUTF16BOM(bytes, len)) { - return StandardCharsets.UTF_16; - } else if (T_UTF_32_UNDERSCORE.equalsUncached(normalizedEncoding, TS_ENCODING)) { - Charset charset = getUTF32CharsetForBOM(bytes, len); - if (charset != null) { - return charset; - } - } - } + public static CharsetWrapper getCharsetNormalized(TruffleString normalizedEncoding) { String name = CHARSET_NAME_MAP.get(normalizedEncoding); if (name != null) { return getJavaCharset(name); @@ -122,28 +116,6 @@ public static Charset getCharsetForDecodingNormalized(TruffleString normalizedEn return null; } - private static boolean hasUTF16BOM(byte[] bytes, int len) { - if (len < 2) { - return false; - } - short head = PythonUtils.ARRAY_ACCESSOR.getShort(bytes, 0); - return head == (short) 0xFFFE || head == (short) 0xFEFF; - } - - private static Charset getUTF32CharsetForBOM(byte[] bytes, int len) { - if (len < 4) { - return null; - } - int head = PythonUtils.ARRAY_ACCESSOR.getInt(bytes, 0); - if (head == 0xFFFE0000) { - return UTF_32BE_BOM; - } - if (head == 0x0000FEFF) { - return UTF_32LE_BOM; - } - return null; - } - @TruffleBoundary public static TruffleString getPythonEncodingNameFromJavaName(String javaEncodingName) { return CHARSET_NAME_MAP_REVERSE.get(javaEncodingName.toLowerCase()); @@ -188,19 +160,19 @@ public static TruffleString normalizeUncached(TruffleString encoding) { return NormalizeEncodingNameNodeGen.getUncached().execute(null, encoding); } - public static Charset getJavaCharset(String name) { + public static CharsetWrapper getJavaCharset(String name) { return JAVA_CHARSETS.computeIfAbsent(name, key -> { // Important: When adding additional ICU4J charset, the implementation class needs to be // added to reflect-config.json if (name.equals("UTF-7") || name.equals("HZ")) { try { - return CharsetICU.forNameICU(name); + return new CharsetWrapper(CharsetICU.forNameICU(name), null, null, 1); } catch (UnsupportedCharsetException e) { return null; } } else { try { - return Charset.forName(name); + return new CharsetWrapper(Charset.forName(name), null, null, 1); } catch (UnsupportedCharsetException e) { return null; } @@ -224,12 +196,12 @@ private static void addAlias(String alias, String pythonName) { static { // Pre-initialize standard charset entries - JAVA_CHARSETS.put("US-ASCII", StandardCharsets.US_ASCII); - JAVA_CHARSETS.put("ISO-8859-1", StandardCharsets.ISO_8859_1); - JAVA_CHARSETS.put("UTF-8", StandardCharsets.UTF_8); - JAVA_CHARSETS.put("UTF-16BE", StandardCharsets.UTF_16BE); - JAVA_CHARSETS.put("UTF-16LE", StandardCharsets.UTF_16LE); - JAVA_CHARSETS.put("UTF-16", ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN ? Charset.forName("UnicodeLittle") : StandardCharsets.UTF_16); + JAVA_CHARSETS.put("US-ASCII", new CharsetWrapper(StandardCharsets.US_ASCII, TruffleString.Encoding.US_ASCII, null, 1)); + JAVA_CHARSETS.put("ISO-8859-1", new CharsetWrapper(StandardCharsets.ISO_8859_1, TruffleString.Encoding.ISO_8859_1, null, 1)); + JAVA_CHARSETS.put("UTF-8", new CharsetWrapper(StandardCharsets.UTF_8, TruffleString.Encoding.UTF_8, null, 1)); + JAVA_CHARSETS.put("UTF-16BE", new CharsetWrapper(StandardCharsets.UTF_16BE, TruffleString.Encoding.UTF_16BE, null, 2)); + JAVA_CHARSETS.put("UTF-16LE", new CharsetWrapper(StandardCharsets.UTF_16LE, TruffleString.Encoding.UTF_16LE, null, 2)); + JAVA_CHARSETS.put("UTF-16", ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN ? UTF_16LE_BOM : UTF_16BE_BOM); JAVA_CHARSETS.put("UTF-32BE", UTF_32BE); JAVA_CHARSETS.put("UTF-32LE", UTF_32LE); JAVA_CHARSETS.put("UTF-32", ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN ? UTF_32LE_BOM : UTF_32BE_BOM); @@ -238,8 +210,8 @@ private static void addAlias(String alias, String pythonName) { addMapping("raw_unicode_escape", "x-python-raw-unicode-escape"); addMapping("unicode-escape", "x-python-unicode-escape"); addMapping("unicodeescape", "x-python-unicode-escape"); - JAVA_CHARSETS.put("x-python-raw-unicode-escape", new PythonRawUnicodeEscapeCharset()); - JAVA_CHARSETS.put("x-python-unicode-escape", new PythonUnicodeEscapeCharset()); + JAVA_CHARSETS.put("x-python-raw-unicode-escape", new CharsetWrapper(new PythonRawUnicodeEscapeCharset(), null, null, 1)); + JAVA_CHARSETS.put("x-python-unicode-escape", new CharsetWrapper(new PythonUnicodeEscapeCharset(), null, null, 1)); addMapping("ascii", "US-ASCII"); addMapping("big5hkscs", "Big5-HKSCS");