diff --git a/Core/GameEngine/Source/Common/System/AsciiString.cpp b/Core/GameEngine/Source/Common/System/AsciiString.cpp
index 533ccd5ccf3..214eabb7192 100644
--- a/Core/GameEngine/Source/Common/System/AsciiString.cpp
+++ b/Core/GameEngine/Source/Common/System/AsciiString.cpp
@@ -45,6 +45,7 @@
#include "PreRTS.h" // This must go first in EVERY cpp file in the GameEngine
#include "Common/CriticalSection.h"
+#include "utf8.h"
// -----------------------------------------------------
@@ -137,8 +138,8 @@ void AsciiString::ensureUniqueBufferOfSize(int numCharsNeeded, Bool preserveData
// TheSuperHackers @fix Mauller 04/04/2025 Replace strcpy with safer memmove as memory regions can overlap when part of string is copied to itself
DEBUG_ASSERTCRASH(usableNumChars <= strlen(strToCopy), ("strToCopy is too small"));
memmove(m_data->peek(), strToCopy, usableNumChars);
- m_data->peek()[usableNumChars] = 0;
}
+ m_data->peek()[usableNumChars] = 0;
if (strToCat)
strcat(m_data->peek(), strToCat);
return;
@@ -166,8 +167,8 @@ void AsciiString::ensureUniqueBufferOfSize(int numCharsNeeded, Bool preserveData
{
DEBUG_ASSERTCRASH(usableNumChars <= strlen(strToCopy), ("strToCopy is too small"));
strncpy(newData->peek(), strToCopy, usableNumChars);
- newData->peek()[usableNumChars] = 0;
}
+ newData->peek()[usableNumChars] = 0;
if (strToCat)
strcat(newData->peek(), strToCat);
@@ -272,11 +273,19 @@ char* AsciiString::getBufferForRead(Int len)
+// Replaces the contents of this string with the UTF-8 encoding of stringSrc.
+// The string ends up empty when stringSrc is empty or when the conversion fails.
void AsciiString::translate(const UnicodeString& stringSrc)
{
validate();
- /// @todo srj put in a real translation here; this will only work for 7-bit ascii
- clear();
- Int len = stringSrc.getLength();
- for (Int i = 0; i < len; i++)
- concat((char)stringSrc.getCharAt(i));
+	// TheSuperHackers @fix bobtista 02/04/2026 Implement UTF-8 conversion replacing 7-bit ASCII only implementation
+	const WideChar* src = stringSrc.str();
+	const size_t srcLen = wcslen(src);
+	// Byte count of the UTF-8 form without terminator; 0 means empty input or a failed measurement.
+	const size_t len = Get_Utf8_Len(src, srcLen);
+	if (len == 0)
+	{
+		clear();
+	}
+	else
+	{
+		// len + 1 leaves room for the null terminator that Unicode_To_Utf8 appends
+		// whenever the destination is larger than the converted output.
+		ensureUniqueBufferOfSize((Int)len + 1, false, nullptr, nullptr);
+		if (Unicode_To_Utf8(peek(), len + 1, src, srcLen) == 0)
+			clear();
+	}
+	// Single exit: the closing validate() now also runs for the empty/failed case.
validate();
}
diff --git a/Core/GameEngine/Source/Common/System/UnicodeString.cpp b/Core/GameEngine/Source/Common/System/UnicodeString.cpp
index 386778d321b..82339b58985 100644
--- a/Core/GameEngine/Source/Common/System/UnicodeString.cpp
+++ b/Core/GameEngine/Source/Common/System/UnicodeString.cpp
@@ -45,6 +45,7 @@
#include "PreRTS.h" // This must go first in EVERY cpp file in the GameEngine
#include "Common/CriticalSection.h"
+#include "utf8.h"
// -----------------------------------------------------
@@ -88,8 +89,8 @@ void UnicodeString::ensureUniqueBufferOfSize(int numCharsNeeded, Bool preserveDa
// TheSuperHackers @fix Mauller 04/04/2025 Replace wcscpy with safer memmove as memory regions can overlap when part of string is copied to itself
DEBUG_ASSERTCRASH(usableNumChars <= wcslen(strToCopy), ("strToCopy is too small"));
memmove(m_data->peek(), strToCopy, usableNumChars * sizeof(WideChar));
- m_data->peek()[usableNumChars] = 0;
}
+ m_data->peek()[usableNumChars] = 0;
if (strToCat)
wcscat(m_data->peek(), strToCat);
return;
@@ -117,8 +118,8 @@ void UnicodeString::ensureUniqueBufferOfSize(int numCharsNeeded, Bool preserveDa
{
DEBUG_ASSERTCRASH(usableNumChars <= wcslen(strToCopy), ("strToCopy is too small"));
wcsncpy(newData->peek(), strToCopy, usableNumChars);
- newData->peek()[usableNumChars] = 0;
}
+ newData->peek()[usableNumChars] = 0;
if (strToCat)
wcscat(newData->peek(), strToCat);
@@ -221,11 +222,19 @@ WideChar* UnicodeString::getBufferForRead(Int len)
+// Replaces the contents of this string with the wide-character form of the UTF-8 text in stringSrc.
+// The string ends up empty when stringSrc is empty or when the conversion fails.
void UnicodeString::translate(const AsciiString& stringSrc)
{
validate();
- /// @todo srj put in a real translation here; this will only work for 7-bit ascii
- clear();
- Int len = stringSrc.getLength();
- for (Int i = 0; i < len; i++)
- concat((WideChar)stringSrc.getCharAt(i));
+	// TheSuperHackers @fix bobtista 02/04/2026 Implement UTF-8 conversion replacing 7-bit ASCII only implementation
+	const char* src = stringSrc.str();
+	const size_t srcLen = strlen(src);
+	// Element count of the wide form without terminator; 0 means empty input or a failed measurement.
+	const size_t len = Get_Unicode_Len(src, srcLen);
+	if (len == 0)
+	{
+		clear();
+	}
+	else
+	{
+		// len + 1 leaves room for the null terminator that Utf8_To_Unicode appends
+		// whenever the destination is larger than the converted output.
+		ensureUniqueBufferOfSize((Int)len + 1, false, nullptr, nullptr);
+		if (Utf8_To_Unicode(peek(), len + 1, src, srcLen) == 0)
+			clear();
+	}
+	// Single exit: the closing validate() now also runs for the empty/failed case.
validate();
}
diff --git a/Core/GameEngine/Source/GameNetwork/GameSpy/Thread/ThreadUtils.cpp b/Core/GameEngine/Source/GameNetwork/GameSpy/Thread/ThreadUtils.cpp
index c7252e2cabb..d6dce7b4b56 100644
--- a/Core/GameEngine/Source/GameNetwork/GameSpy/Thread/ThreadUtils.cpp
+++ b/Core/GameEngine/Source/GameNetwork/GameSpy/Thread/ThreadUtils.cpp
@@ -28,18 +28,24 @@
#include "PreRTS.h" // This must go first in EVERY cpp file in the GameEngine
+#include "utf8.h"
+
//-------------------------------------------------------------------------
+// TheSuperHackers @refactor bobtista 02/04/2026 Use WWLib UTF-8 functions instead of raw Win32 API calls
std::wstring MultiByteToWideCharSingleLine( const char *orig )
{
- Int len = strlen(orig);
- WideChar *dest = NEW WideChar[len+1];
-
- MultiByteToWideChar(CP_UTF8, 0, orig, -1, dest, len);
+ size_t srcLen = strlen(orig);
+ size_t len = Get_Unicode_Len(orig, srcLen);
+ if (len == 0)
+ return std::wstring();
+ std::wstring ret;
+ ret.resize(len);
+ Utf8_To_Unicode(&ret[0], len, orig, srcLen);
WideChar *c = nullptr;
do
{
- c = wcschr(dest, L'\n');
+ c = wcschr(&ret[0], L'\n');
if (c)
{
*c = L' ';
@@ -48,7 +54,7 @@ std::wstring MultiByteToWideCharSingleLine( const char *orig )
while ( c != nullptr );
do
{
- c = wcschr(dest, L'\r');
+ c = wcschr(&ret[0], L'\r');
if (c)
{
*c = L' ';
@@ -56,24 +62,18 @@ std::wstring MultiByteToWideCharSingleLine( const char *orig )
}
while ( c != nullptr );
- dest[len] = 0;
- std::wstring ret = dest;
- delete[] dest;
return ret;
}
+// Converts a null-terminated wide string to a UTF-8 encoded std::string.
+// Returns an empty string when the input is empty or cannot be converted.
std::string WideCharStringToMultiByte( const WideChar *orig )
{
+	const size_t srcLen = wcslen(orig);
+	const size_t len = Get_Utf8_Len(orig, srcLen);
+	if (len == 0)
+		return std::string();
std::string ret;
- Int len = WideCharToMultiByte( CP_UTF8, 0, orig, wcslen(orig), nullptr, 0, nullptr, nullptr ) + 1;
- if (len > 0)
- {
- char *dest = NEW char[len];
- WideCharToMultiByte( CP_UTF8, 0, orig, -1, dest, len, nullptr, nullptr );
- dest[len-1] = 0;
- ret = dest;
- delete[] dest;
- }
+	ret.resize(len);
+	// Shrink to the number of bytes actually written: on success this is a no-op, on failure
+	// (return value 0) it yields an empty string instead of len embedded null characters.
+	ret.resize(Unicode_To_Utf8(&ret[0], len, orig, srcLen));
return ret;
}
diff --git a/Core/Libraries/Source/WWVegas/WWLib/CMakeLists.txt b/Core/Libraries/Source/WWVegas/WWLib/CMakeLists.txt
index 228506fee0e..ab42d64714b 100644
--- a/Core/Libraries/Source/WWVegas/WWLib/CMakeLists.txt
+++ b/Core/Libraries/Source/WWVegas/WWLib/CMakeLists.txt
@@ -133,6 +133,8 @@ set(WWLIB_SRC
trim.cpp
trim.h
uarray.h
+ utf8.cpp
+ utf8.h
vector.cpp
Vector.h
visualc.h
diff --git a/Core/Libraries/Source/WWVegas/WWLib/utf8.cpp b/Core/Libraries/Source/WWVegas/WWLib/utf8.cpp
new file mode 100644
index 00000000000..fb8c6157ba4
--- /dev/null
+++ b/Core/Libraries/Source/WWVegas/WWLib/utf8.cpp
@@ -0,0 +1,144 @@
+/*
+** Command & Conquer Generals Zero Hour(tm)
+** Copyright 2026 TheSuperHackers
+**
+** This program is free software: you can redistribute it and/or modify
+** it under the terms of the GNU General Public License as published by
+** the Free Software Foundation, either version 3 of the License, or
+** (at your option) any later version.
+**
+** This program is distributed in the hope that it will be useful,
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+** GNU General Public License for more details.
+**
+** You should have received a copy of the GNU General Public License
+** along with this program. If not, see <https://www.gnu.org/licenses/>.
+*/
+
+#include "always.h"
+#include "utf8.h"
+
+#include
+
+#ifdef _WIN32
+#include
+
+// Returns true when c is a UTF-8 continuation byte, i.e. its two most significant bits are 10.
+static bool Is_Trail_Byte(char c)
+{
+	const unsigned char byte = static_cast<unsigned char>(c);
+	return (byte >> 6) == 0x02;
+}
+
+// Maps a UTF-8 lead byte to the length in bytes of the sequence it introduces.
+// Yields 0 for bytes that cannot start a sequence (continuation bytes and 0xF8..0xFF).
+size_t Utf8_Num_Bytes(char lead)
+{
+	const unsigned char b = static_cast<unsigned char>(lead);
+	if (b < 0x80)
+		return 1; // 0xxxxxxx
+	if (b < 0xC0)
+		return 0; // 10xxxxxx is a continuation byte, never a lead
+	if (b < 0xE0)
+		return 2; // 110xxxxx
+	if (b < 0xF0)
+		return 3; // 1110xxxx
+	if (b < 0xF8)
+		return 4; // 11110xxx
+	return 0;
+}
+
+// Counts the bytes at the end of str[0..length) that do not belong to a complete UTF-8
+// sequence, so a caller can drop them (for example after cutting a string at a byte limit).
+// Returns 0 when the string is empty or ends exactly on a complete sequence.
+// Only the final sequence is inspected; the rest of the string is not validated.
+size_t Utf8_Trailing_Invalid_Bytes(const char* str, size_t length)
+{
+	if (length == 0)
+		return 0;
+
+	// Step backwards over continuation bytes to reach the byte that should lead the last sequence.
+	size_t i = length;
+	while (i > 0 && Is_Trail_Byte(str[i - 1]))
+		--i;
+
+	// The whole input consists of continuation bytes: none of it is usable.
+	if (i == 0)
+		return length;
+
+	const size_t claimed = Utf8_Num_Bytes(str[i - 1]);
+	const size_t actual = length - (i - 1);
+
+	// Invalid lead byte, or a sequence that was cut short: the lead and everything after it is invalid.
+	if (claimed == 0 || claimed > actual)
+		return actual;
+
+	// The last sequence is complete. Any continuation bytes beyond it are stray, but the
+	// sequence itself is valid and must not be counted (e.g. "a\x80" has 1 invalid byte, not 2).
+	return actual - claimed;
+}
+
+// Validates a null-terminated string as UTF-8 by measuring it with strlen and
+// delegating to the length-based overload.
+bool Utf8_Validate(const char* str)
+{
+	const size_t length = strlen(str);
+	return Utf8_Validate(str, length);
+}
+
+// Returns true when the first length bytes of str are well-formed UTF-8 as defined by RFC 3629:
+// every sequence has a valid lead byte, the right number of continuation bytes, is not an
+// overlong encoding, does not encode a UTF-16 surrogate and does not exceed U+10FFFF.
+// An empty range (length == 0) is valid.
+bool Utf8_Validate(const char* str, size_t length)
+{
+	const unsigned char* s = (const unsigned char*)str;
+	size_t i = 0;
+	while (i < length)
+	{
+		const size_t bytes = Utf8_Num_Bytes(str[i]);
+		if (bytes == 0)
+			return false;
+		// The sequence announced by the lead byte must fit inside the range.
+		if (i + bytes > length)
+			return false;
+		for (size_t j = 1; j < bytes; ++j)
+		{
+			if (!Is_Trail_Byte(str[i + j]))
+				return false;
+		}
+		// Reject overlong encodings per RFC 3629
+		if (bytes == 2 && s[i] < 0xC2)
+			return false;
+		if (bytes == 3 && s[i] == 0xE0 && s[i + 1] < 0xA0)
+			return false;
+		if (bytes == 4 && s[i] == 0xF0 && s[i + 1] < 0x90)
+			return false;
+		// Reject UTF-16 surrogate code points U+D800..U+DFFF (ED A0..BF xx), prohibited by RFC 3629
+		if (bytes == 3 && s[i] == 0xED && s[i + 1] > 0x9F)
+			return false;
+		// Reject codepoints above U+10FFFF
+		if (bytes == 4 && s[i] > 0xF4)
+			return false;
+		if (bytes == 4 && s[i] == 0xF4 && s[i + 1] > 0x8F)
+			return false;
+		i += bytes;
+	}
+	return true;
+}
+
+// Measures how many bytes the UTF-8 form of the srcLen wide characters at src occupies,
+// by calling WideCharToMultiByte without an output buffer. No terminator is included.
+// Any non-positive result from the API is reported as 0.
+size_t Get_Utf8_Len(const wchar_t* src, size_t srcLen)
+{
+	const int required = WideCharToMultiByte(CP_UTF8, 0, src, (int)srcLen, nullptr, 0, nullptr, nullptr);
+	if (required <= 0)
+		return 0;
+	return (size_t)required;
+}
+
+// Measures how many wchar_t elements the wide form of the srcLen UTF-8 bytes at src occupies,
+// by calling MultiByteToWideChar without an output buffer. No terminator is included.
+// Any non-positive result from the API is reported as 0.
+size_t Get_Unicode_Len(const char* src, size_t srcLen)
+{
+	const int required = MultiByteToWideChar(CP_UTF8, 0, src, (int)srcLen, nullptr, 0);
+	if (required <= 0)
+		return 0;
+	return (size_t)required;
+}
+
+// Converts srcLen wide characters from src into UTF-8 bytes stored in dest.
+// destLen is the total number of bytes that may be written to dest. A null terminator is
+// appended only when the converted output is shorter than destLen.
+// Returns the number of bytes produced, or 0 when destLen is 0 or the conversion fails;
+// on a failed conversion dest[0] is set to '\0'.
+size_t Unicode_To_Utf8(char* dest, size_t destLen, const wchar_t* src, size_t srcLen)
+{
+	if (destLen == 0)
+		return 0;
+
+	const int count = WideCharToMultiByte(CP_UTF8, 0, src, (int)srcLen, dest, (int)destLen, nullptr, nullptr);
+	if (count != 0)
+	{
+		const size_t produced = (size_t)count;
+		// Terminate only when there is a spare element after the output (not on an exact fit).
+		if (produced < destLen)
+			dest[produced] = '\0';
+		return produced;
+	}
+
+	// Conversion failed: hand back an empty string.
+	dest[0] = '\0';
+	return 0;
+}
+
+// Converts srcLen UTF-8 bytes from src into wide characters stored in dest.
+// destLen is the total number of wchar_t elements that may be written to dest. A null
+// terminator is appended only when the converted output is shorter than destLen.
+// Returns the number of elements produced, or 0 when destLen is 0 or the conversion fails;
+// on a failed conversion dest[0] is set to L'\0'.
+size_t Utf8_To_Unicode(wchar_t* dest, size_t destLen, const char* src, size_t srcLen)
+{
+	if (destLen == 0)
+		return 0;
+
+	const int count = MultiByteToWideChar(CP_UTF8, 0, src, (int)srcLen, dest, (int)destLen);
+	if (count != 0)
+	{
+		const size_t produced = (size_t)count;
+		// Terminate only when there is a spare element after the output (not on an exact fit).
+		if (produced < destLen)
+			dest[produced] = L'\0';
+		return produced;
+	}
+
+	// Conversion failed: hand back an empty string.
+	dest[0] = L'\0';
+	return 0;
+}
+
+#else
+#error "Not implemented"
+#endif
diff --git a/Core/Libraries/Source/WWVegas/WWLib/utf8.h b/Core/Libraries/Source/WWVegas/WWLib/utf8.h
new file mode 100644
index 00000000000..433d01b3022
--- /dev/null
+++ b/Core/Libraries/Source/WWVegas/WWLib/utf8.h
@@ -0,0 +1,57 @@
+/*
+** Command & Conquer Generals Zero Hour(tm)
+** Copyright 2026 TheSuperHackers
+**
+** This program is free software: you can redistribute it and/or modify
+** it under the terms of the GNU General Public License as published by
+** the Free Software Foundation, either version 3 of the License, or
+** (at your option) any later version.
+**
+** This program is distributed in the hope that it will be useful,
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+** GNU General Public License for more details.
+**
+** You should have received a copy of the GNU General Public License
+** along with this program. If not, see <https://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include
+#include
+
+// Returns the number of bytes in a UTF-8 character based on its lead byte.
+// Returns 0 if the lead byte is invalid.
+size_t Utf8_Num_Bytes(char lead);
+
+// Returns the number of invalid bytes at the end of the string due to an
+// incomplete multi-byte sequence. Returns 0 if the string ends on a complete sequence.
+size_t Utf8_Trailing_Invalid_Bytes(const char* str, size_t length);
+
+// Returns true if the string (null-terminated, or exactly the first length bytes of str) is valid UTF-8, false otherwise.
+bool Utf8_Validate(const char* str);
+bool Utf8_Validate(const char* str, size_t length);
+
+// Returns the number of bytes needed for the UTF-8 representation of srcLen wide
+// characters from src, not counting a null terminator. Returns 0 on failure or if srcLen is 0.
+size_t Get_Utf8_Len(const wchar_t* src, size_t srcLen);
+
+// Returns the number of wchar_t elements needed for the wide character representation
+// of srcLen bytes from the UTF-8 string src, not counting a null terminator.
+// Returns 0 on failure or if srcLen is 0.
+size_t Get_Unicode_Len(const char* src, size_t srcLen);
+
+// Converts srcLen wide characters from src to UTF-8.
+// destLen is the total capacity of dest in bytes; pass the UTF-8 length plus one if a null terminator is required.
+// Returns the number of bytes written on success, or 0 on failure.
+// Writes a null terminator if destLen > bytes written. Does not write one if destLen
+// equals bytes written (exact fit). On failure, dest[0] is set to '\0' if destLen > 0.
+size_t Unicode_To_Utf8(char* dest, size_t destLen, const wchar_t* src, size_t srcLen);
+
+// Converts srcLen bytes from the UTF-8 string src to wide characters.
+// destLen is the total capacity of dest in wchar_t elements; pass the converted length plus one if a null terminator is required.
+// Returns the number of wchar_t elements written on success, or 0 on failure.
+// Writes a null terminator if destLen > elements written. Does not write one if destLen
+// equals elements written (exact fit). On failure, dest[0] is set to L'\0' if destLen > 0.
+size_t Utf8_To_Unicode(wchar_t* dest, size_t destLen, const char* src, size_t srcLen);