diff --git a/Core/GameEngine/Source/Common/System/AsciiString.cpp b/Core/GameEngine/Source/Common/System/AsciiString.cpp
index 533ccd5ccf3..214eabb7192 100644
--- a/Core/GameEngine/Source/Common/System/AsciiString.cpp
+++ b/Core/GameEngine/Source/Common/System/AsciiString.cpp
@@ -45,6 +45,7 @@
 #include "PreRTS.h"	// This must go first in EVERY cpp file in the GameEngine
 
 #include "Common/CriticalSection.h"
+#include "utf8.h"
 
 
 // -----------------------------------------------------
@@ -137,8 +138,8 @@ void AsciiString::ensureUniqueBufferOfSize(int numCharsNeeded, Bool preserveData
 			// TheSuperHackers @fix Mauller 04/04/2025 Replace strcpy with safer memmove as memory regions can overlap when part of string is copied to itself
 			DEBUG_ASSERTCRASH(usableNumChars <= strlen(strToCopy), ("strToCopy is too small"));
 			memmove(m_data->peek(), strToCopy, usableNumChars);
-			m_data->peek()[usableNumChars] = 0;
 		}
+		m_data->peek()[usableNumChars] = 0;
 		if (strToCat)
 			strcat(m_data->peek(), strToCat);
 		return;
@@ -166,8 +167,8 @@ void AsciiString::ensureUniqueBufferOfSize(int numCharsNeeded, Bool preserveData
 	{
 		DEBUG_ASSERTCRASH(usableNumChars <= strlen(strToCopy), ("strToCopy is too small"));
 		strncpy(newData->peek(), strToCopy, usableNumChars);
-		newData->peek()[usableNumChars] = 0;
 	}
+	newData->peek()[usableNumChars] = 0;
 	if (strToCat)
 		strcat(newData->peek(), strToCat);
 
@@ -272,11 +273,19 @@ char* AsciiString::getBufferForRead(Int len)
 void AsciiString::translate(const UnicodeString& stringSrc)
 {
 	validate();
-	/// @todo srj put in a real translation here; this will only work for 7-bit ascii
-	clear();
-	Int len = stringSrc.getLength();
-	for (Int i = 0; i < len; i++)
-		concat((char)stringSrc.getCharAt(i));
+	// TheSuperHackers @fix bobtista 02/04/2026 Implement UTF-8 conversion replacing 7-bit ASCII only implementation
+	const WideChar* src = stringSrc.str();
+	size_t srcLen = wcslen(src);
+	size_t len = Get_Utf8_Len(src, srcLen);
+	if (len == 0)
+	{
+		clear();
+		return;
+	}
+	ensureUniqueBufferOfSize((Int)len + 1, false, nullptr, nullptr);
+	char* buf = peek();
+	if (Unicode_To_Utf8(buf, len + 1, src, srcLen) == 0)
+		clear();
 	validate();
 }
 
diff --git a/Core/GameEngine/Source/Common/System/UnicodeString.cpp b/Core/GameEngine/Source/Common/System/UnicodeString.cpp
index 386778d321b..82339b58985 100644
--- a/Core/GameEngine/Source/Common/System/UnicodeString.cpp
+++ b/Core/GameEngine/Source/Common/System/UnicodeString.cpp
@@ -45,6 +45,7 @@
 #include "PreRTS.h"	// This must go first in EVERY cpp file in the GameEngine
 
 #include "Common/CriticalSection.h"
+#include "utf8.h"
 
 
 // -----------------------------------------------------
@@ -88,8 +89,8 @@ void UnicodeString::ensureUniqueBufferOfSize(int numCharsNeeded, Bool preserveDa
 			// TheSuperHackers @fix Mauller 04/04/2025 Replace wcscpy with safer memmove as memory regions can overlap when part of string is copied to itself
 			DEBUG_ASSERTCRASH(usableNumChars <= wcslen(strToCopy), ("strToCopy is too small"));
 			memmove(m_data->peek(), strToCopy, usableNumChars * sizeof(WideChar));
-			m_data->peek()[usableNumChars] = 0;
 		}
+		m_data->peek()[usableNumChars] = 0;
 		if (strToCat)
 			wcscat(m_data->peek(), strToCat);
 		return;
@@ -117,8 +118,8 @@ void UnicodeString::ensureUniqueBufferOfSize(int numCharsNeeded, Bool preserveDa
 	{
 		DEBUG_ASSERTCRASH(usableNumChars <= wcslen(strToCopy), ("strToCopy is too small"));
 		wcsncpy(newData->peek(), strToCopy, usableNumChars);
-		newData->peek()[usableNumChars] = 0;
 	}
+	newData->peek()[usableNumChars] = 0;
 	if (strToCat)
 		wcscat(newData->peek(), strToCat);
 
@@ -221,11 +222,19 @@ WideChar* UnicodeString::getBufferForRead(Int len)
 void UnicodeString::translate(const AsciiString& stringSrc)
 {
 	validate();
-	/// @todo srj put in a real translation here; this will only work for 7-bit ascii
-	clear();
-	Int len = stringSrc.getLength();
-	for (Int i = 0; i < len; i++)
-		concat((WideChar)stringSrc.getCharAt(i));
+	// TheSuperHackers @fix bobtista 02/04/2026 Implement UTF-8 conversion replacing 7-bit ASCII only implementation
+	const char* src = stringSrc.str();
+	size_t srcLen = strlen(src);
+	size_t len = Get_Unicode_Len(src, srcLen);
+	if (len == 0)
+	{
+		clear();
+		return;
+	}
+	ensureUniqueBufferOfSize((Int)len + 1, false, nullptr, nullptr);
+	WideChar* buf = peek();
+	if (Utf8_To_Unicode(buf, len + 1, src, srcLen) == 0)
+		clear();
 	validate();
 }
 
diff --git a/Core/GameEngine/Source/GameNetwork/GameSpy/Thread/ThreadUtils.cpp b/Core/GameEngine/Source/GameNetwork/GameSpy/Thread/ThreadUtils.cpp
index c7252e2cabb..d6dce7b4b56 100644
--- a/Core/GameEngine/Source/GameNetwork/GameSpy/Thread/ThreadUtils.cpp
+++ b/Core/GameEngine/Source/GameNetwork/GameSpy/Thread/ThreadUtils.cpp
@@ -28,18 +28,24 @@
 
 #include "PreRTS.h"	// This must go first in EVERY cpp file in the GameEngine
 
+#include "utf8.h"
+
 //-------------------------------------------------------------------------
 
+// TheSuperHackers @refactor bobtista 02/04/2026 Use WWLib UTF-8 functions instead of raw Win32 API calls
 std::wstring MultiByteToWideCharSingleLine( const char *orig )
 {
-	Int len = strlen(orig);
-	WideChar *dest = NEW WideChar[len+1];
-
-	MultiByteToWideChar(CP_UTF8, 0, orig, -1, dest, len);
+	size_t srcLen = strlen(orig);
+	size_t len = Get_Unicode_Len(orig, srcLen);
+	if (len == 0)
+		return std::wstring();
+	std::wstring ret;
+	ret.resize(len);
+	ret.resize(Utf8_To_Unicode(&ret[0], len, orig, srcLen)); // keep only what was converted; empty on failure
 	WideChar *c = nullptr;
 	do
 	{
-		c = wcschr(dest, L'\n');
+		c = wcschr(&ret[0], L'\n');
 		if (c)
 		{
 			*c = L' ';
@@ -48,7 +54,7 @@ std::wstring MultiByteToWideCharSingleLine( const char *orig )
 	while ( c != nullptr );
 	do
 	{
-		c = wcschr(dest, L'\r');
+		c = wcschr(&ret[0], L'\r');
 		if (c)
 		{
 			*c = L' ';
@@ -56,24 +62,18 @@ std::wstring MultiByteToWideCharSingleLine( const char *orig )
 	}
 	while ( c != nullptr );
 
-	dest[len] = 0;
-	std::wstring ret = dest;
-	delete[] dest;
 	return ret;
 }
 
 std::string WideCharStringToMultiByte( const WideChar *orig )
 {
+	size_t srcLen = wcslen(orig);
+	size_t len = Get_Utf8_Len(orig, srcLen);
+	if (len == 0)
+		return std::string();
 	std::string ret;
-	Int len = WideCharToMultiByte( CP_UTF8, 0, orig, wcslen(orig), nullptr, 0, nullptr, nullptr ) + 1;
-	if (len > 0)
-	{
-		char *dest = NEW char[len];
-		WideCharToMultiByte( CP_UTF8, 0, orig, -1, dest, len, nullptr, nullptr );
-		dest[len-1] = 0;
-		ret = dest;
-		delete[] dest;
-	}
+	ret.resize(len);
+	ret.resize(Unicode_To_Utf8(&ret[0], len, orig, srcLen)); // keep only what was converted; empty on failure
 	return ret;
 }
 
diff --git a/Core/Libraries/Source/WWVegas/WWLib/CMakeLists.txt b/Core/Libraries/Source/WWVegas/WWLib/CMakeLists.txt
index 228506fee0e..ab42d64714b 100644
--- a/Core/Libraries/Source/WWVegas/WWLib/CMakeLists.txt
+++ b/Core/Libraries/Source/WWVegas/WWLib/CMakeLists.txt
@@ -133,6 +133,8 @@ set(WWLIB_SRC
     trim.cpp
     trim.h
     uarray.h
+    utf8.cpp
+    utf8.h
     vector.cpp
     Vector.h
     visualc.h
diff --git a/Core/Libraries/Source/WWVegas/WWLib/utf8.cpp b/Core/Libraries/Source/WWVegas/WWLib/utf8.cpp
new file mode 100644
index 00000000000..fb8c6157ba4
--- /dev/null
+++ b/Core/Libraries/Source/WWVegas/WWLib/utf8.cpp
@@ -0,0 +1,147 @@
+/*
+**	Command & Conquer Generals Zero Hour(tm)
+**	Copyright 2026 TheSuperHackers
+**
+**	This program is free software: you can redistribute it and/or modify
+**	it under the terms of the GNU General Public License as published by
+**	the Free Software Foundation, either version 3 of the License, or
+**	(at your option) any later version.
+**
+**	This program is distributed in the hope that it will be useful,
+**	but WITHOUT ANY WARRANTY; without even the implied warranty of
+**	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+**	GNU General Public License for more details.
+**
+**	You should have received a copy of the GNU General Public License
+**	along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "always.h"
+#include "utf8.h"
+
+#include <string.h>
+
+#ifdef _WIN32
+#include <windows.h>
+
+static bool Is_Trail_Byte(char c)
+{
+	return (c & 0xC0) == 0x80;
+}
+
+size_t Utf8_Num_Bytes(char lead)
+{
+	if ((lead & 0x80) == 0x00) return 1;
+	if ((lead & 0xE0) == 0xC0) return 2;
+	if ((lead & 0xF0) == 0xE0) return 3;
+	if ((lead & 0xF8) == 0xF0) return 4;
+	return 0;
+}
+
+size_t Utf8_Trailing_Invalid_Bytes(const char* str, size_t length)
+{
+	if (length == 0)
+		return 0;
+
+	size_t i = length;
+	while (i > 0 && Is_Trail_Byte(str[i - 1]))
+		--i;
+
+	if (i == 0)
+		return length;
+
+	size_t claimed = Utf8_Num_Bytes(str[i - 1]);
+	size_t actual = length - (i - 1);
+
+	if (claimed == 0 || claimed != actual)
+		return actual;
+
+	return 0;
+}
+
+bool Utf8_Validate(const char* str)
+{
+	return Utf8_Validate(str, strlen(str));
+}
+
+bool Utf8_Validate(const char* str, size_t length)
+{
+	const unsigned char* s = (const unsigned char*)str;
+	size_t i = 0;
+	while (i < length)
+	{
+		size_t bytes = Utf8_Num_Bytes(str[i]);
+		if (bytes == 0)
+			return false;
+		if (i + bytes > length)
+			return false;
+		for (size_t j = 1; j < bytes; ++j)
+		{
+			if (!Is_Trail_Byte(str[i + j]))
+				return false;
+		}
+		// Reject overlong encodings per RFC 3629
+		if (bytes == 2 && s[i] < 0xC2)
+			return false;
+		if (bytes == 3 && s[i] == 0xE0 && s[i + 1] < 0xA0)
+			return false;
+		if (bytes == 4 && s[i] == 0xF0 && s[i + 1] < 0x90)
+			return false;
+		// Reject UTF-16 surrogate code points U+D800..U+DFFF (ED A0..BF xx) per RFC 3629
+		if (bytes == 3 && s[i] == 0xED && s[i + 1] > 0x9F)
+			return false;
+		// Reject codepoints above U+10FFFF
+		if (bytes == 4 && s[i] > 0xF4)
+			return false;
+		if (bytes == 4 && s[i] == 0xF4 && s[i + 1] > 0x8F)
+			return false;
+		i += bytes;
+	}
+	return true;
+}
+
+size_t Get_Utf8_Len(const wchar_t* src, size_t srcLen)
+{
+	int bytes = WideCharToMultiByte(CP_UTF8, 0, src, (int)srcLen, nullptr, 0, nullptr, nullptr);
+	return (bytes > 0) ? (size_t)bytes : 0;
+}
+
+size_t Get_Unicode_Len(const char* src, size_t srcLen)
+{
+	int wchars = MultiByteToWideChar(CP_UTF8, 0, src, (int)srcLen, nullptr, 0);
+	return (wchars > 0) ? (size_t)wchars : 0;
+}
+
+size_t Unicode_To_Utf8(char* dest, size_t destLen, const wchar_t* src, size_t srcLen)
+{
+	if (destLen == 0)
+		return 0;
+	int written = WideCharToMultiByte(CP_UTF8, 0, src, (int)srcLen, dest, (int)destLen, nullptr, nullptr);
+	if (written == 0)
+	{
+		dest[0] = '\0';
+		return 0;
+	}
+	if ((size_t)written < destLen)
+		dest[written] = '\0';
+	return (size_t)written;
+}
+
+size_t Utf8_To_Unicode(wchar_t* dest, size_t destLen, const char* src, size_t srcLen)
+{
+	if (destLen == 0)
+		return 0;
+	int written = MultiByteToWideChar(CP_UTF8, 0, src, (int)srcLen, dest, (int)destLen);
+	if (written == 0)
+	{
+		dest[0] = L'\0';
+		return 0;
+	}
+	if ((size_t)written < destLen)
+		dest[written] = L'\0';
+	return (size_t)written;
+}
+
+#else
+#error "Not implemented"
+#endif
diff --git a/Core/Libraries/Source/WWVegas/WWLib/utf8.h b/Core/Libraries/Source/WWVegas/WWLib/utf8.h
new file mode 100644
index 00000000000..433d01b3022
--- /dev/null
+++ b/Core/Libraries/Source/WWVegas/WWLib/utf8.h
@@ -0,0 +1,57 @@
+/*
+**	Command & Conquer Generals Zero Hour(tm)
+**	Copyright 2026 TheSuperHackers
+**
+**	This program is free software: you can redistribute it and/or modify
+**	it under the terms of the GNU General Public License as published by
+**	the Free Software Foundation, either version 3 of the License, or
+**	(at your option) any later version.
+**
+**	This program is distributed in the hope that it will be useful,
+**	but WITHOUT ANY WARRANTY; without even the implied warranty of
+**	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+**	GNU General Public License for more details.
+**
+**	You should have received a copy of the GNU General Public License
+**	along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include <stddef.h>
+#include <wchar.h>
+
+// Returns the number of bytes in a UTF-8 character based on its lead byte.
+// Returns 0 if the lead byte is invalid.
+size_t Utf8_Num_Bytes(char lead);
+
+// Returns the number of invalid bytes at the end of the string due to an
+// incomplete multi-byte sequence. Returns 0 if the string ends on a complete sequence.
+size_t Utf8_Trailing_Invalid_Bytes(const char* str, size_t length);
+
+// Returns true if the string is valid UTF-8, false otherwise. The one-argument form reads up to the null terminator.
+bool Utf8_Validate(const char* str);
+bool Utf8_Validate(const char* str, size_t length);
+
+// Returns the number of bytes needed for the UTF-8 representation of srcLen wide
+// characters from src, not counting a null terminator. Returns 0 on failure or if srcLen is 0.
+size_t Get_Utf8_Len(const wchar_t* src, size_t srcLen);
+
+// Returns the number of wchar_t elements needed for the wide character representation
+// of srcLen bytes from the UTF-8 string src, not counting a null terminator.
+// Returns 0 on failure or if srcLen is 0.
+size_t Get_Unicode_Len(const char* src, size_t srcLen);
+
+// Converts srcLen wide characters from src to UTF-8.
+// destLen is the total capacity of dest in bytes; pass Get_Utf8_Len() + 1 to guarantee null termination.
+// Returns the number of bytes written on success, or 0 on failure.
+// Writes a null terminator if destLen > bytes written. Does not write one if destLen
+// equals bytes written (exact fit). On failure, dest[0] is set to '\0' if destLen > 0.
+size_t Unicode_To_Utf8(char* dest, size_t destLen, const wchar_t* src, size_t srcLen);
+
+// Converts srcLen bytes from the UTF-8 string src to wide characters.
+// destLen is the total capacity of dest in wchar_t elements; pass Get_Unicode_Len() + 1 to guarantee null termination.
+// Returns the number of wchar_t elements written on success, or 0 on failure.
+// Writes a null terminator if destLen > elements written. Does not write one if destLen
+// equals elements written (exact fit). On failure, dest[0] is set to L'\0' if destLen > 0.
+size_t Utf8_To_Unicode(wchar_t* dest, size_t destLen, const char* src, size_t srcLen);