From 9327bdff471e09a1c4d8c2eb1f603db3b5ab095a Mon Sep 17 00:00:00 2001
From: Chad Ongstad
Date: Mon, 16 Feb 2026 17:47:54 +0000
Subject: [PATCH] Fixed bug causing character data corruption in direct path load on non-UTF-8 databases
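
Direct path load writes column data straight into the data blocks on disk,
bypassing the character set conversion that the server performs for normal
SQL execution, so character data must already be encoded in the database
character set. The thin-mode stream message sent the UTF-8 bytes produced by
the Arrow conversion layer unchanged, which corrupted character data on
databases whose character set is not UTF-8. For example, with a WE8MSWIN1252
(cp1252) database character set, "é" was stored as its two UTF-8 bytes rather
than the single cp1252 byte; the load stream now performs that conversion:

    >>> "Café".encode("utf-8")                            # bytes sent before
    b'Caf\xc3\xa9'
    >>> "Café".encode("utf-8").decode().encode("cp1252")  # bytes sent now
    b'Caf\xe9'

VARCHAR, CHAR and LONG values with CS_FORM_IMPLICIT are now re-encoded into
the database character set using a new mapping of Oracle character set IDs to
Python codec names, and ERR_DB_CS_NOT_SUPPORTED is raised for database
character sets that have no known codec. The prepare message also keeps the
original character set form for CLOB columns instead of always treating them
as CS_FORM_NCHAR.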

---
 .gitignore                                    |  1 +
 src/oracledb/errors.py                        |  5 ++
 .../thin/messages/direct_path_load_stream.pyx | 29 +++++++
 .../thin/messages/direct_path_prepare.pyx     |  2 +-
 src/oracledb/impl/thin/utils.pyx              | 77 +++++++++++++++++++
 tests/test_9600_direct_path_load.py           | 39 ++++++++++
 tests/test_9700_direct_path_load_async.py     | 41 ++++++++++
 7 files changed, 193 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 1339436c..8cea107b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,3 +14,4 @@ tests/ext/config.ini
 samples/sample.csv
 samples/sample.parquet
 samples/notebooks/testwrite.csv
+*.so
diff --git a/src/oracledb/errors.py b/src/oracledb/errors.py
index 44d406f7..9cee461e 100644
--- a/src/oracledb/errors.py
+++ b/src/oracledb/errors.py
@@ -306,6 +306,7 @@ def _raise_not_supported(feature: str) -> None:
 ERR_UNSUPPORTED_INBAND_NOTIFICATION = 3008
 ERR_SELF_BIND_NOT_SUPPORTED = 3009
 ERR_SERVER_VERSION_NOT_SUPPORTED = 3010
+ERR_DB_CS_NOT_SUPPORTED = 3011
 ERR_NCHAR_CS_NOT_SUPPORTED = 3012
 ERR_UNSUPPORTED_PYTHON_TYPE_FOR_DB_TYPE = 3013
 ERR_LOB_OF_WRONG_TYPE = 3014
@@ -821,6 +822,10 @@ def _raise_not_supported(feature: str) -> None:
     ERR_NAMED_TIMEZONE_NOT_SUPPORTED: (
         "named time zones are not supported in thin mode"
     ),
+    ERR_DB_CS_NOT_SUPPORTED: (
+        "database character set id {charset_id} is not supported by "
+        "python-oracledb for direct path loading in thin mode"
+    ),
     ERR_NCHAR_CS_NOT_SUPPORTED: (
         "national character set id {charset_id} is not supported by "
         "python-oracledb in thin mode"
diff --git a/src/oracledb/impl/thin/messages/direct_path_load_stream.pyx b/src/oracledb/impl/thin/messages/direct_path_load_stream.pyx
index d2a7fb60..daba79a7 100644
--- a/src/oracledb/impl/thin/messages/direct_path_load_stream.pyx
+++ b/src/oracledb/impl/thin/messages/direct_path_load_stream.pyx
@@ -285,6 +285,9 @@ cdef class DirectPathLoadStreamMessage(Message):
             PieceBuffer buf
             OracleData data
             ssize_t col_num
+            uint8_t ora_type_num
+            str db_charset_encoding
+            bytes temp_bytes
 
         # create buffer used for writing column data
         buf = PieceBuffer.__new__(PieceBuffer)
@@ -294,6 +297,13 @@
         all_rows = manager._get_all_rows()
         arrays = manager._get_arrow_arrays()
 
+        # determine if character set conversion is needed; direct path load
+        # writes data directly to disk, bypassing the server's character set
+        # conversion, so data must be encoded in the database character set
+        db_charset_encoding = _get_db_charset_encoding(
+            self.conn_impl._protocol._caps.charset_id
+        )
+
         # calculate pieces
         for row_num in range(manager.num_rows):
             overall_row_num = manager.offset + row_num
@@ -311,6 +321,25 @@
                 col = convert_arrow_to_oracle_data(
                     metadata, &data, array_impl, overall_row_num
                 )
+
+                # for character data with CS_FORM_IMPLICIT, re-encode from
+                # UTF-8 to the database character set if needed
+                if db_charset_encoding is not None and not data.is_null:
+                    ora_type_num = metadata.dbtype._ora_type_num
+                    if ora_type_num in (ORA_TYPE_NUM_VARCHAR,
+                                        ORA_TYPE_NUM_CHAR,
+                                        ORA_TYPE_NUM_LONG) \
+                            and metadata.dbtype._csfrm == CS_FORM_IMPLICIT:
+                        temp_bytes = data.buffer.as_raw_bytes.ptr[
+                            :data.buffer.as_raw_bytes.num_bytes
+                        ].decode().encode(db_charset_encoding)
+                        cpython.PyBytes_AsStringAndSize(
+                            temp_bytes,
+                            &data.buffer.as_raw_bytes.ptr,
+                            &data.buffer.as_raw_bytes.num_bytes
+                        )
+                        col = temp_bytes
+
                 buf.add_column_value(self.conn_impl, metadata, &data, col,
                                      self.current_row_num)
             buf.finish_row()
diff --git a/src/oracledb/impl/thin/messages/direct_path_prepare.pyx b/src/oracledb/impl/thin/messages/direct_path_prepare.pyx
index fe5502e4..4e8f0413 100644
--- a/src/oracledb/impl/thin/messages/direct_path_prepare.pyx
+++ b/src/oracledb/impl/thin/messages/direct_path_prepare.pyx
@@ -66,7 +66,7 @@ cdef class DirectPathPrepareMessage(Message):
             metadata = Message._process_metadata(self, buf)
             if metadata.dbtype._ora_type_num == ORA_TYPE_NUM_CLOB:
                 metadata.dbtype = DbType._from_ora_type_and_csfrm(
-                    ORA_TYPE_NUM_LONG, CS_FORM_NCHAR
+                    ORA_TYPE_NUM_LONG, metadata.dbtype._csfrm
                 )
             elif metadata.dbtype._ora_type_num == ORA_TYPE_NUM_BLOB:
                 metadata.dbtype = DbType._from_ora_type_and_csfrm(
diff --git a/src/oracledb/impl/thin/utils.pyx b/src/oracledb/impl/thin/utils.pyx
index 0695d542..385f2a90 100644
--- a/src/oracledb/impl/thin/utils.pyx
+++ b/src/oracledb/impl/thin/utils.pyx
@@ -100,6 +100,83 @@ cdef int _check_cryptography() except -1:
         errors._raise_err(errors.ERR_NO_CRYPTOGRAPHY_PACKAGE,
                           str(CRYPTOGRAPHY_IMPORT_ERROR))
 
+# Mapping of Oracle character set IDs to Python encoding names. Used by
+# direct path loading to encode character data in the database character set.
+# For normal SQL execution the server performs the conversion, but for direct
+# path loading the data is written directly to disk.
+cdef dict ORACLE_CHARSET_TO_PYTHON_ENCODING = {
+
+    # ASCII
+    1: "ascii",             # US7ASCII
+
+    # ISO 8859 series
+    31: "iso-8859-1",       # WE8ISO8859P1
+    32: "iso-8859-2",       # EE8ISO8859P2
+    33: "iso-8859-3",       # SE8ISO8859P3
+    34: "iso-8859-4",       # NEE8ISO8859P4
+    35: "iso-8859-5",       # CL8ISO8859P5
+    36: "iso-8859-6",       # AR8ISO8859P6
+    37: "iso-8859-7",       # EL8ISO8859P7
+    38: "iso-8859-8",       # IW8ISO8859P8
+    39: "iso-8859-9",       # WE8ISO8859P9
+    40: "iso-8859-10",      # NE8ISO8859P10
+    41: "tis-620",          # TH8TISASCII
+    46: "iso-8859-15",      # WE8ISO8859P15
+    47: "iso-8859-13",      # BLT8ISO8859P13
+
+    # Windows code pages
+    170: "cp1250",          # EE8MSWIN1250
+    171: "cp1251",          # CL8MSWIN1251
+    172: "cp1253",          # EL8MSWIN1253
+    173: "cp1254",          # TR8MSWIN1254
+    174: "cp1255",          # IW8MSWIN1255
+    175: "cp1256",          # AR8MSWIN1256
+    176: "cp1257",          # BLT8MSWIN1257
+    177: "cp1258",          # VN8MSWIN1258
+    178: "cp1252",          # WE8MSWIN1252
+
+    # DOS / PC code pages
+    351: "cp850",           # WE8PC850
+    354: "cp437",           # US8PC437
+    368: "cp866",           # RU8PC866
+    382: "cp852",           # EE8PC852
+
+    # East Asian multi-byte
+    829: "big5",            # ZHT16BIG5
+    830: "euc_kr",          # KO16KSC5601
+    831: "euc_jp",          # JA16EUC
+    832: "cp932",           # JA16SJIS
+    833: "cp932",           # JA16SJISTILDE
+    834: "euc_jp",          # JA16EUCTILDE
+    846: "gbk",             # ZHS16GBK
+    850: "big5hkscs",       # ZHT16HKSCS
+    852: "cp949",           # KO16MSWIN949
+    854: "cp950",           # ZHT16MSWIN950
+    870: "gb18030",         # ZHS32GB18030
+
+    # Unicode
+    871: "utf-8",           # UTF8 (CESU-8)
+    873: "utf-8",           # AL32UTF8
+    2000: "utf-16-be",      # AL16UTF16
+}
+
+
+cdef str _get_db_charset_encoding(uint16_t charset_id):
+    """
+    Returns the Python encoding name for the given Oracle character set ID,
+    or None if the character set is UTF-8 (no conversion needed).
+    """
+    cdef str encoding
+    if charset_id == TNS_CHARSET_UTF8:
+        return None
+    encoding = ORACLE_CHARSET_TO_PYTHON_ENCODING.get(charset_id)
+    if encoding is None:
+        errors._raise_err(errors.ERR_DB_CS_NOT_SUPPORTED,
+                          charset_id=charset_id)
+    if encoding == "utf-8":
+        return None
+    return encoding
+
 
 def init_thin_impl(package):
     """
diff --git a/tests/test_9600_direct_path_load.py b/tests/test_9600_direct_path_load.py
index c437b5f9..65a077e1 100644
--- a/tests/test_9600_direct_path_load.py
+++ b/tests/test_9600_direct_path_load.py
@@ -612,3 +612,42 @@ def test_9621(empty_tab, conn, test_env):
         data=df,
     )
     _verify_data_frame(conn, df, column_names, test_env)
+
+def test_9622(empty_tab, disable_fetch_lobs, conn, test_env):
+    "9622 - test direct path load with non-ASCII characters"
+    column_names = ["Id", "FirstName", "City", "LongData"]
+    data = [
+        (1, "Café", "Zürich", "Ñoño résumé"),
+        (2, "naïve", "São Paulo", "El niño está aquí"),
+        (3, "Ärger", "Malmö", "Ça fait déjà vu"),
+    ]
+    conn.direct_path_load(
+        schema_name=test_env.main_user,
+        table_name=TABLE_NAME,
+        column_names=column_names,
+        data=data,
+    )
+    _verify_data(conn, data, column_names)
+
+
+def test_9623(empty_tab, disable_fetch_lobs, conn, test_env):
+    "9623 - test direct path load with non-ASCII characters using data frame"
+    column_names = ["Id", "FirstName", "City", "LongData"]
+    data = {
+        "Id": [1, 2, 3],
+        "FirstName": ["Café", "naïve", "Ärger"],
+        "City": ["Zürich", "São Paulo", "Malmö"],
+        "LongData": [
+            "Ñoño résumé",
+            "El niño está aquí",
+            "Ça fait déjà vu",
+        ],
+    }
+    df = pandas.DataFrame(data)
+    conn.direct_path_load(
+        schema_name=test_env.main_user,
+        table_name=TABLE_NAME,
+        column_names=column_names,
+        data=df,
+    )
+    _verify_data_frame(conn, df, column_names, test_env)
diff --git a/tests/test_9700_direct_path_load_async.py b/tests/test_9700_direct_path_load_async.py
index 812c455d..6dad285e 100644
--- a/tests/test_9700_direct_path_load_async.py
+++ b/tests/test_9700_direct_path_load_async.py
@@ -616,3 +616,44 @@ async def test_9721(empty_tab, async_conn, test_env):
             data=df,
         )
     await _verify_data_frame(async_conn, df, column_names, test_env)
+
+async def test_9722(empty_tab, disable_fetch_lobs, async_conn, test_env):
+    "9722 - test direct path load with non-ASCII characters"
+    column_names = ["Id", "FirstName", "City", "LongData"]
+    data = [
+        (1, "Café", "Zürich", "Ñoño résumé"),
+        (2, "naïve", "São Paulo", "El niño está aquí"),
+        (3, "Ärger", "Malmö", "Ça fait déjà vu"),
+    ]
+    async with test_env.get_connection_async() as other_conn:
+        await other_conn.direct_path_load(
+            schema_name=test_env.main_user,
+            table_name=TABLE_NAME,
+            column_names=column_names,
+            data=data,
+        )
+    await _verify_data(async_conn, data, column_names)
+
+
+async def test_9723(empty_tab, disable_fetch_lobs, async_conn, test_env):
+    "9723 - test direct path load with non-ASCII characters using data frame"
+    column_names = ["Id", "FirstName", "City", "LongData"]
+    data = {
+        "Id": [1, 2, 3],
+        "FirstName": ["Café", "naïve", "Ärger"],
+        "City": ["Zürich", "São Paulo", "Malmö"],
+        "LongData": [
+            "Ñoño résumé",
+            "El niño está aquí",
+            "Ça fait déjà vu",
+        ],
+    }
+    df = pandas.DataFrame(data)
+    async with test_env.get_connection_async() as other_conn:
+        await other_conn.direct_path_load(
+            schema_name=test_env.main_user,
+            table_name=TABLE_NAME,
+            column_names=column_names,
+            data=df,
+        )
+    await _verify_data_frame(async_conn, df, column_names, test_env)
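
A quick way to check whether a particular database was affected (any database
character set other than AL32UTF8), sketched here with placeholder credentials
and DSN:

    import oracledb

    conn = oracledb.connect(user="hr", password="...", dsn="localhost/orclpdb1")
    with conn.cursor() as cursor:
        cursor.execute(
            "select value from nls_database_parameters "
            "where parameter = 'NLS_CHARACTERSET'"
        )
        print(cursor.fetchone()[0])  # e.g. WE8MSWIN1252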