Skip to content

Commit 4ae7a9d

Browse files
committed
lib: use utf8 fast path for streaming TextDecoder
1 parent e155415 commit 4ae7a9d

File tree

3 files changed

+122
-43
lines changed

3 files changed

+122
-43
lines changed

lib/internal/encoding.js

Lines changed: 51 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,11 +28,13 @@ const kHandle = Symbol('handle');
2828
const kFlags = Symbol('flags');
2929
const kEncoding = Symbol('encoding');
3030
const kDecoder = Symbol('decoder');
31+
const kChunk = Symbol('chunk');
3132
const kFatal = Symbol('kFatal');
3233
const kUTF8FastPath = Symbol('kUTF8FastPath');
3334
const kIgnoreBOM = Symbol('kIgnoreBOM');
3435

3536
const { isSinglebyteEncoding, createSinglebyteDecoder } = require('internal/encoding/single-byte');
37+
const { unfinishedBytesUtf8, mergePrefixUtf8 } = require('internal/encoding/util');
3638

3739
const {
3840
getConstructorOf,
@@ -447,9 +449,11 @@ class TextDecoder {
447449
this[kUTF8FastPath] = false;
448450
this[kHandle] = undefined;
449451
this[kSingleByte] = undefined; // Does not care about streaming or BOM
452+
this[kChunk] = null; // A copy of previous streaming tail or null
450453

451454
if (enc === 'utf-8') {
452455
this[kUTF8FastPath] = true;
456+
this[kBOMSeen] = false;
453457
} else if (isSinglebyteEncoding(enc)) {
454458
this[kSingleByte] = createSinglebyteDecoder(enc, this[kFatal]);
455459
} else {
@@ -483,8 +487,53 @@ class TextDecoder {
483487

484488
const stream = options?.stream;
485489
if (this[kUTF8FastPath]) {
486-
if (!stream) return decodeUTF8(input, this[kIgnoreBOM], this[kFatal]);
487-
this[kUTF8FastPath] = false;
490+
const chunk = this[kChunk];
491+
const ignoreBom = this[kIgnoreBOM] || this[kBOMSeen];
492+
if (!stream) {
493+
this[kBOMSeen] = false;
494+
if (!chunk) return decodeUTF8(input, ignoreBom, this[kFatal]);
495+
}
496+
497+
let u = parseInput(input);
498+
if (u.length === 0 && stream) return ''; // no state change
499+
let prefix;
500+
if (chunk) {
501+
const merged = mergePrefixUtf8(u, this[kChunk]);
502+
if (u.length < 3) {
503+
u = merged; // Might be unfinished, but fully consumed old u
504+
} else {
505+
prefix = merged; // Stops at complete chunk
506+
const add = prefix.length - this[kChunk].length;
507+
if (add > 0) u = u.subarray(add);
508+
}
509+
510+
this[kChunk] = null;
511+
}
512+
513+
if (stream) {
514+
const trail = unfinishedBytesUtf8(u, u.length);
515+
if (trail > 0) {
516+
this[kChunk] = new FastBuffer(u.subarray(-trail)); // copy
517+
if (!prefix && trail === u.length) return ''; // No further state change
518+
u = u.subarray(0, -trail);
519+
}
520+
}
521+
522+
try {
523+
const res = (prefix ? decodeUTF8(prefix, ignoreBom, this[kFatal]) : '') +
524+
decodeUTF8(u, ignoreBom || prefix, this[kFatal]);
525+
526+
// "BOM seen" is set on the current decode call only if it did not error,
527+
// in "serialize I/O queue" after decoding
528+
// We don't get here if we had no complete data to process,
529+
// and we don't want BOM processing after that if streaming
530+
if (stream) this[kBOMSeen] = true;
531+
return res;
532+
} catch (e) {
533+
this[kChunk] = null; // Reset unfinished chunk on errors
534+
// The correct way per spec seems to be not destroying the decoder state (aka BOM here) in stream mode
535+
throw e;
536+
}
488537
}
489538

490539
this.#prepareConverter();

lib/internal/encoding/util.js

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
// From https://npmjs.com/package/@exodus/bytes
2+
// Copyright Exodus Movement. Licensed under MIT License.
3+
4+
'use strict';
5+
6+
const {
7+
Uint8Array,
8+
} = primordials;
9+
10+
// Count how many bytes at the end of `u` (considering only the first `len`
// bytes) form the start of a UTF-8 codepoint that could still be completed
// by more data. Returns 0-3; 0 means the data ends on a codepoint boundary,
// or ends in bytes that no further input could turn into a valid codepoint.
function unfinishedBytesUtf8(u, len) {
  // Step back over at most two continuation bytes (0b10xxxxxx); an
  // unfinished sequence has at most lead + 2 continuations (4-byte char).
  let trailing = 0;
  while (trailing < 2 && trailing < len &&
         (u[len - trailing - 1] & 0xc0) === 0x80) {
    trailing++;
  }
  if (trailing === len) return 0; // No room left for a lead byte.

  const lead = u[len - trailing - 1];
  // Valid lead bytes are 0xc2-0xf4; anything else cannot start a sequence.
  if (lead < 0xc2 || lead > 0xf4) return 0;

  // A bare lead byte is always exactly one unfinished byte; this also
  // covers every unfinished 2-byte sequence.
  if (trailing === 0) return 1;

  // A 2-byte lead followed by a continuation is already complete, as is a
  // 3-byte-or-shorter lead followed by two continuations.
  if (lead < 0xe0) return 0;
  if (lead < 0xf0 && trailing >= 2) return 0;

  // For 3- and 4-byte leads the first continuation byte has a restricted
  // range (rejects overlong encodings, UTF-16 surrogates, and codepoints
  // above U+10FFFF), per the well-formed UTF-8 byte sequence table.
  let lower = 0x80;
  let upper = 0xbf;
  if (lead === 0xe0) lower = 0xa0;
  else if (lead === 0xf0) lower = 0x90;
  if (lead === 0xed) upper = 0x9f;
  else if (lead === 0xf4) upper = 0x8f;

  const first = u[len - trailing];
  if (first < lower || first > upper) return 0; // Invalid, not unfinished.
  return trailing + 1;
}

// Combine a leftover prefix `chunk` (1-3 unfinished bytes carried over from
// the previous streaming call) with new input `u`, returning the prefix to
// decode separately. When `u` has fewer than 3 bytes the result is the full
// concatenation (which may itself still be unfinished); otherwise the
// returned prefix always ends on a codepoint boundary.
function mergePrefixUtf8(u, chunk) {
  if (u.length === 0) return chunk;

  if (u.length < 3) {
    // Too little new data to be sure the sequence completes; just append.
    const joined = new Uint8Array(chunk.length + u.length);
    joined.set(chunk, 0);
    joined.set(u, chunk.length);
    return joined;
  }

  // Copy the old tail plus the first 3 new bytes (enough to finish any
  // codepoint) into a scratch buffer so the prefix can be decoded alone,
  // without allocating a merged copy of all of `u`.
  const scratch = new Uint8Array(chunk.length + 3);
  scratch.set(chunk, 0);
  scratch.set(u.subarray(0, 3), chunk.length);

  // Find the smallest number of new bytes at which the combined data ends
  // on a codepoint boundary, or at which the unfinished tail lies entirely
  // inside `u` (so nothing from `chunk` is pending anymore).
  for (let taken = 1; taken <= 3; taken++) {
    const pending = unfinishedBytesUtf8(scratch, chunk.length + taken); // 0-3
    if (pending > taken) continue;
    // Always reached by taken === 3, so the loop cannot fall through.
    const consumed = taken - pending; // 0-3 bytes of `u` joining the prefix
    return consumed > 0 ? scratch.subarray(0, chunk.length + consumed) : chunk;
  }
}
// Exposed for lib/internal/encoding.js (TextDecoder UTF-8 streaming path).
module.exports = { unfinishedBytesUtf8, mergePrefixUtf8 };

test/parallel/test-whatwg-encoding-custom-textdecoder.js

Lines changed: 12 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -80,20 +80,8 @@ assert(TextDecoder);
8080

8181
['unicode-1-1-utf-8', 'utf8', 'utf-8'].forEach((i) => {
8282
const dec = new TextDecoder(i, { fatal: true });
83-
if (common.hasIntl) {
84-
dec.decode(buf.slice(0, 8), { stream: true });
85-
dec.decode(buf.slice(8));
86-
} else {
87-
assert.throws(
88-
() => {
89-
dec.decode(buf.slice(0, 8), { stream: true });
90-
},
91-
{
92-
code: 'ERR_NO_ICU',
93-
name: 'TypeError',
94-
message: '"fatal" option is not supported on Node.js compiled without ICU'
95-
});
96-
}
83+
dec.decode(buf.slice(0, 8), { stream: true });
84+
dec.decode(buf.slice(8));
9785
});
9886

9987
// Test TextDecoder, label undefined, options null
@@ -122,33 +110,16 @@ if (common.hasIntl) {
122110
// Test TextDecoder inspect with hidden fields
123111
{
124112
const dec = new TextDecoder('utf-8', { ignoreBOM: true });
125-
if (common.hasIntl) {
126-
assert.strictEqual(
127-
util.inspect(dec, { showHidden: true }),
128-
'TextDecoder {\n' +
129-
' encoding: \'utf-8\',\n' +
130-
' fatal: false,\n' +
131-
' ignoreBOM: true,\n' +
132-
' Symbol(flags): 4,\n' +
133-
' Symbol(handle): undefined\n' +
134-
'}'
135-
);
136-
} else {
137-
dec.decode(Uint8Array.of(0), { stream: true });
138-
assert.strictEqual(
139-
util.inspect(dec, { showHidden: true }),
140-
'TextDecoder {\n' +
141-
" encoding: 'utf-8',\n" +
142-
' fatal: false,\n' +
143-
' ignoreBOM: true,\n' +
144-
' Symbol(flags): 4,\n' +
145-
' Symbol(handle): StringDecoder {\n' +
146-
" encoding: 'utf8',\n" +
147-
' Symbol(kNativeDecoder): <Buffer 00 00 00 00 00 00 01>\n' +
148-
' }\n' +
149-
'}'
150-
);
151-
}
113+
assert.strictEqual(
114+
util.inspect(dec, { showHidden: true }),
115+
'TextDecoder {\n' +
116+
' encoding: \'utf-8\',\n' +
117+
' fatal: false,\n' +
118+
' ignoreBOM: true,\n' +
119+
' Symbol(flags): 4,\n' +
120+
' Symbol(handle): undefined\n' +
121+
'}'
122+
);
152123
}
153124

154125

0 commit comments

Comments
 (0)