Skip to content

Commit 97a81a8

Browse files
committed
src: optimize utf-8 byte length calculation using simdutf
1 parent f6464c5 commit 97a81a8

File tree

1 file changed

+51
-40
lines changed

1 file changed

+51
-40
lines changed

src/node_buffer.cc

Lines changed: 51 additions & 40 deletions
Original file line number | Diff line number | Diff line change
@@ -761,9 +761,38 @@ void StringWrite(const FunctionCallbackInfo<Value>& args) {
761761
void SlowByteLengthUtf8(const FunctionCallbackInfo<Value>& args) {
762762
CHECK(args[0]->IsString());
763763

764-
// Fast case: avoid StringBytes on UTF8 string. Jump to v8.
765-
size_t result = args[0].As<String>()->Utf8LengthV2(args.GetIsolate());
766-
args.GetReturnValue().Set(static_cast<uint64_t>(result));
764+
Isolate* isolate = args.GetIsolate();
765+
Local<String> source = args[0].As<String>();
766+
767+
static constexpr int kSmallStringThreshold = 128;
768+
if (source->Length() <= kSmallStringThreshold) {
769+
size_t result = source->Utf8LengthV2(isolate);
770+
args.GetReturnValue().Set(static_cast<uint64_t>(result));
771+
return;
772+
}
773+
774+
String::ValueView view(isolate, source);
775+
size_t length = view.length();
776+
size_t utf8_length;
777+
778+
if (view.is_one_byte()) {
779+
auto data = reinterpret_cast<const char*>(view.data8());
780+
simdutf::result result = simdutf::validate_ascii_with_errors(data, length);
781+
if (result.error == simdutf::SUCCESS) {
782+
utf8_length = length; // Pure ASCII, length stays the same
783+
} else {
784+
utf8_length = simdutf::utf8_length_from_latin1(data, length);
785+
}
786+
} else {
787+
auto data = reinterpret_cast<const char16_t*>(view.data16());
788+
if (simdutf::validate_utf16(data, length)) {
789+
utf8_length = simdutf::utf8_length_from_utf16(data, length);
790+
} else {
791+
utf8_length = source->Utf8LengthV2(isolate);
792+
}
793+
}
794+
795+
args.GetReturnValue().Set(static_cast<uint64_t>(utf8_length));
767796
}
768797

769798
uint32_t FastByteLengthUtf8(
@@ -776,49 +805,31 @@ uint32_t FastByteLengthUtf8(
776805
CHECK(sourceValue->IsString());
777806
Local<String> sourceStr = sourceValue.As<String>();
778807

779-
if (!sourceStr->IsExternalOneByte()) {
808+
// For short inputs, use V8's path - function call overhead not worth it
809+
static constexpr int kSmallStringThreshold = 128;
810+
if (sourceStr->Length() <= kSmallStringThreshold) {
780811
return sourceStr->Utf8LengthV2(isolate);
781812
}
782-
auto source = sourceStr->GetExternalOneByteStringResource();
783-
// For short inputs, the function call overhead to simdutf is maybe
784-
// not worth it, reserve simdutf for long strings.
785-
if (source->length() > 128) {
786-
return simdutf::utf8_length_from_latin1(source->data(), source->length());
787-
}
788-
789-
uint32_t length = source->length();
790-
const auto input = reinterpret_cast<const uint8_t*>(source->data());
791-
792-
uint32_t answer = length;
793-
uint32_t i = 0;
794813

795-
auto pop = [](uint64_t v) {
796-
return static_cast<size_t>(((v >> 7) & UINT64_C(0x0101010101010101)) *
797-
UINT64_C(0x0101010101010101) >>
798-
56);
799-
};
814+
// For large strings, use simdutf with String::ValueView for direct access
815+
// This is ~6x faster for large strings
816+
String::ValueView view(isolate, sourceStr);
817+
size_t length = view.length();
800818

801-
for (; i + 32 <= length; i += 32) {
802-
uint64_t v;
803-
memcpy(&v, input + i, 8);
804-
answer += pop(v);
805-
memcpy(&v, input + i + 8, 8);
806-
answer += pop(v);
807-
memcpy(&v, input + i + 16, 8);
808-
answer += pop(v);
809-
memcpy(&v, input + i + 24, 8);
810-
answer += pop(v);
811-
}
812-
for (; i + 8 <= length; i += 8) {
813-
uint64_t v;
814-
memcpy(&v, input + i, 8);
815-
answer += pop(v);
816-
}
817-
for (; i + 1 <= length; i += 1) {
818-
answer += input[i] >> 7;
819+
if (view.is_one_byte()) {
820+
auto data = reinterpret_cast<const char*>(view.data8());
821+
simdutf::result result = simdutf::validate_ascii_with_errors(data, length);
822+
if (result.error == simdutf::SUCCESS) {
823+
return length; // Pure ASCII, length stays the same
824+
}
825+
return simdutf::utf8_length_from_latin1(data, length);
819826
}
820827

821-
return answer;
828+
auto data = reinterpret_cast<const char16_t*>(view.data16());
829+
if (simdutf::validate_utf16(data, length)) {
830+
return simdutf::utf8_length_from_utf16(data, length);
831+
}
832+
return sourceStr->Utf8LengthV2(isolate);
822833
}
823834

824835
static CFunction fast_byte_length_utf8(CFunction::Make(FastByteLengthUtf8));

0 commit comments

Comments
 (0)