From 9327bdff471e09a1c4d8c2eb1f603db3b5ab095a Mon Sep 17 00:00:00 2001
From: Chad Ongstad
Date: Mon, 16 Feb 2026 17:47:54 +0000
Subject: [PATCH] Fixed bug causing character data corruption in direct path load on non-UTF-8 databases
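
Direct path load writes column data straight into the data blocks on disk,
bypassing the character set conversion that the server performs for normal
SQL execution, so character data must already be encoded in the database
character set. The thin-mode stream message sent the UTF-8 bytes produced by
the Arrow conversion layer unchanged, which corrupted character data on
databases whose character set is not UTF-8. For example, with a WE8MSWIN1252
(cp1252) database character set, "é" was stored as its two UTF-8 bytes rather
than the single cp1252 byte; the load stream now performs that conversion:

    >>> "Café".encode("utf-8")                            # bytes sent before
    b'Caf\xc3\xa9'
    >>> "Café".encode("utf-8").decode().encode("cp1252")  # bytes sent now
    b'Caf\xe9'

VARCHAR, CHAR and LONG values with CS_FORM_IMPLICIT are now re-encoded into
the database character set using a new mapping of Oracle character set IDs to
Python codec names, and ERR_DB_CS_NOT_SUPPORTED is raised for database
character sets that have no known codec. The prepare message also keeps the
original character set form for CLOB columns instead of always treating them
as CS_FORM_NCHAR.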

---
 .gitignore                                    |  1 +
 src/oracledb/errors.py                        |  5 ++
 .../thin/messages/direct_path_load_stream.pyx | 29 +++++++
 .../thin/messages/direct_path_prepare.pyx     |  2 +-
 src/oracledb/impl/thin/utils.pyx              | 77 +++++++++++++++++++
 tests/test_9600_direct_path_load.py           | 39 ++++++++++
 tests/test_9700_direct_path_load_async.py     | 41 ++++++++++
 7 files changed, 193 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 1339436c..8cea107b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,3 +14,4 @@ tests/ext/config.ini
 samples/sample.csv
 samples/sample.parquet
 samples/notebooks/testwrite.csv
+*.so
diff --git a/src/oracledb/errors.py b/src/oracledb/errors.py
index 44d406f7..9cee461e 100644
--- a/src/oracledb/errors.py
+++ b/src/oracledb/errors.py
@@ -306,6 +306,7 @@ def _raise_not_supported(feature: str) -> None:
 ERR_UNSUPPORTED_INBAND_NOTIFICATION = 3008
 ERR_SELF_BIND_NOT_SUPPORTED = 3009
 ERR_SERVER_VERSION_NOT_SUPPORTED = 3010
+ERR_DB_CS_NOT_SUPPORTED = 3011
 ERR_NCHAR_CS_NOT_SUPPORTED = 3012
 ERR_UNSUPPORTED_PYTHON_TYPE_FOR_DB_TYPE = 3013
 ERR_LOB_OF_WRONG_TYPE = 3014
@@ -821,6 +822,10 @@ def _raise_not_supported(feature: str) -> None:
     ERR_NAMED_TIMEZONE_NOT_SUPPORTED: (
         "named time zones are not supported in thin mode"
     ),
+    ERR_DB_CS_NOT_SUPPORTED: (
+        "database character set id {charset_id} is not supported by "
+        "python-oracledb for direct path loading in thin mode"
+    ),
     ERR_NCHAR_CS_NOT_SUPPORTED: (
         "national character set id {charset_id} is not supported by "
         "python-oracledb in thin mode"
diff --git a/src/oracledb/impl/thin/messages/direct_path_load_stream.pyx b/src/oracledb/impl/thin/messages/direct_path_load_stream.pyx
index d2a7fb60..daba79a7 100644
--- a/src/oracledb/impl/thin/messages/direct_path_load_stream.pyx
+++ b/src/oracledb/impl/thin/messages/direct_path_load_stream.pyx
@@ -285,6 +285,9 @@ cdef class DirectPathLoadStreamMessage(Message):
             PieceBuffer buf
             OracleData data
             ssize_t col_num
+            uint8_t ora_type_num
+            str db_charset_encoding
+            bytes temp_bytes
 
         # create buffer used for writing column data
         buf = PieceBuffer.__new__(PieceBuffer)
@@ -294,6 +297,13 @@
         all_rows = manager._get_all_rows()
         arrays = manager._get_arrow_arrays()
 
+        # determine if character set conversion is needed; direct path load
+        # writes data directly to disk, bypassing the server's character set
+        # conversion, so data must be encoded in the database character set
+        db_charset_encoding = _get_db_charset_encoding(
+            self.conn_impl._protocol._caps.charset_id
+        )
+
         # calculate pieces
         for row_num in range(manager.num_rows):
             overall_row_num = manager.offset + row_num
@@ -311,6 +321,25 @@
                 col = convert_arrow_to_oracle_data(
                     metadata, &data, array_impl, overall_row_num
                 )
+
+                # for character data with CS_FORM_IMPLICIT, re-encode from
+                # UTF-8 to the database character set if needed
+                if db_charset_encoding is not None and not data.is_null:
+                    ora_type_num = metadata.dbtype._ora_type_num
+                    if ora_type_num in (ORA_TYPE_NUM_VARCHAR,
+                                        ORA_TYPE_NUM_CHAR,
+                                        ORA_TYPE_NUM_LONG) \
+                            and metadata.dbtype._csfrm == CS_FORM_IMPLICIT:
+                        temp_bytes = data.buffer.as_raw_bytes.ptr[
+                            :data.buffer.as_raw_bytes.num_bytes
+                        ].decode().encode(db_charset_encoding)
+                        cpython.PyBytes_AsStringAndSize(
+                            temp_bytes,
+                            &data.buffer.as_raw_bytes.ptr,
+                            &data.buffer.as_raw_bytes.num_bytes
+                        )
+                        col = temp_bytes
+
                 buf.add_column_value(self.conn_impl, metadata, &data, col,
                                      self.current_row_num)
             buf.finish_row()
diff --git a/src/oracledb/impl/thin/messages/direct_path_prepare.pyx b/src/oracledb/impl/thin/messages/direct_path_prepare.pyx
index fe5502e4..4e8f0413 100644
--- a/src/oracledb/impl/thin/messages/direct_path_prepare.pyx
+++ b/src/oracledb/impl/thin/messages/direct_path_prepare.pyx
@@ -66,7 +66,7 @@ cdef class DirectPathPrepareMessage(Message):
             metadata = Message._process_metadata(self, buf)
             if metadata.dbtype._ora_type_num == ORA_TYPE_NUM_CLOB:
                 metadata.dbtype = DbType._from_ora_type_and_csfrm(
-                    ORA_TYPE_NUM_LONG, CS_FORM_NCHAR
+                    ORA_TYPE_NUM_LONG, metadata.dbtype._csfrm
                 )
             elif metadata.dbtype._ora_type_num == ORA_TYPE_NUM_BLOB:
                 metadata.dbtype = DbType._from_ora_type_and_csfrm(
diff --git a/src/oracledb/impl/thin/utils.pyx b/src/oracledb/impl/thin/utils.pyx
index 0695d542..385f2a90 100644
--- a/src/oracledb/impl/thin/utils.pyx
+++ b/src/oracledb/impl/thin/utils.pyx
@@ -100,6 +100,83 @@ cdef int _check_cryptography() except -1:
         errors._raise_err(errors.ERR_NO_CRYPTOGRAPHY_PACKAGE,
                           str(CRYPTOGRAPHY_IMPORT_ERROR))
 
+# Mapping of Oracle character set IDs to Python encoding names. Used by
+# direct path loading to encode character data in the database character set.
+# For normal SQL execution the server performs the conversion, but for direct
+# path loading the data is written directly to disk.
+cdef dict ORACLE_CHARSET_TO_PYTHON_ENCODING = {
+
+    # ASCII
+    1: "ascii",             # US7ASCII
+
+    # ISO 8859 series
+    31: "iso-8859-1",       # WE8ISO8859P1
+    32: "iso-8859-2",       # EE8ISO8859P2
+    33: "iso-8859-3",       # SE8ISO8859P3
+    34: "iso-8859-4",       # NEE8ISO8859P4
+    35: "iso-8859-5",       # CL8ISO8859P5
+    36: "iso-8859-6",       # AR8ISO8859P6
+    37: "iso-8859-7",       # EL8ISO8859P7
+    38: "iso-8859-8",       # IW8ISO8859P8
+    39: "iso-8859-9",       # WE8ISO8859P9
+    40: "iso-8859-10",      # NE8ISO8859P10
+    41: "tis-620",          # TH8TISASCII
+    46: "iso-8859-15",      # WE8ISO8859P15
+    47: "iso-8859-13",      # BLT8ISO8859P13
+
+    # Windows code pages
+    170: "cp1250",          # EE8MSWIN1250
+    171: "cp1251",          # CL8MSWIN1251
+    172: "cp1253",          # EL8MSWIN1253
+    173: "cp1254",          # TR8MSWIN1254
+    174: "cp1255",          # IW8MSWIN1255
+    175: "cp1256",          # AR8MSWIN1256
+    176: "cp1257",          # BLT8MSWIN1257
+    177: "cp1258",          # VN8MSWIN1258
+    178: "cp1252",          # WE8MSWIN1252
+
+    # DOS / PC code pages
+    351: "cp850",           # WE8PC850
+    354: "cp437",           # US8PC437
+    368: "cp866",           # RU8PC866
+    382: "cp852",           # EE8PC852
+
+    # East Asian multi-byte
+    829: "big5",            # ZHT16BIG5
+    830: "euc_kr",          # KO16KSC5601
+    831: "euc_jp",          # JA16EUC
+    832: "cp932",           # JA16SJIS
+    833: "cp932",           # JA16SJISTILDE
+    834: "euc_jp",          # JA16EUCTILDE
+    846: "gbk",             # ZHS16GBK
+    850: "big5hkscs",       # ZHT16HKSCS
+    852: "cp949",           # KO16MSWIN949
+    854: "cp950",           # ZHT16MSWIN950
+    870: "gb18030",         # ZHS32GB18030
+
+    # Unicode
+    871: "utf-8",           # UTF8 (CESU-8)
+    873: "utf-8",           # AL32UTF8
+    2000: "utf-16-be",      # AL16UTF16
+}
+
+
+cdef str _get_db_charset_encoding(uint16_t charset_id):
+    """
+    Returns the Python encoding name for the given Oracle character set ID,
+    or None if the character set is UTF-8 (no conversion needed).
+    """
+    cdef str encoding
+    if charset_id == TNS_CHARSET_UTF8:
+        return None
+    encoding = ORACLE_CHARSET_TO_PYTHON_ENCODING.get(charset_id)
+    if encoding is None:
+        errors._raise_err(errors.ERR_DB_CS_NOT_SUPPORTED,
+                          charset_id=charset_id)
+    if encoding == "utf-8":
+        return None
+    return encoding
+
 
 def init_thin_impl(package):
     """
diff --git a/tests/test_9600_direct_path_load.py b/tests/test_9600_direct_path_load.py
index c437b5f9..65a077e1 100644
--- a/tests/test_9600_direct_path_load.py
+++ b/tests/test_9600_direct_path_load.py
@@ -612,3 +612,42 @@ def test_9621(empty_tab, conn, test_env):
         data=df,
     )
     _verify_data_frame(conn, df, column_names, test_env)
+
+def test_9622(empty_tab, disable_fetch_lobs, conn, test_env):
+    "9622 - test direct path load with non-ASCII characters"
+    column_names = ["Id", "FirstName", "City", "LongData"]
+    data = [
+        (1, "Café", "Zürich", "Ñoño résumé"),
+        (2, "naïve", "São Paulo", "El niño está aquí"),
+        (3, "Ärger", "Malmö", "Ça fait déjà vu"),
+    ]
+    conn.direct_path_load(
+        schema_name=test_env.main_user,
+        table_name=TABLE_NAME,
+        column_names=column_names,
+        data=data,
+    )
+    _verify_data(conn, data, column_names)
+
+
+def test_9623(empty_tab, disable_fetch_lobs, conn, test_env):
+    "9623 - test direct path load with non-ASCII characters using data frame"
+    column_names = ["Id", "FirstName", "City", "LongData"]
+    data = {
+        "Id": [1, 2, 3],
+        "FirstName": ["Café", "naïve", "Ärger"],
+        "City": ["Zürich", "São Paulo", "Malmö"],
+        "LongData": [
+            "Ñoño résumé",
+            "El niño está aquí",
+            "Ça fait déjà vu",
+        ],
+    }
+    df = pandas.DataFrame(data)
+    conn.direct_path_load(
+        schema_name=test_env.main_user,
+        table_name=TABLE_NAME,
+        column_names=column_names,
+        data=df,
+    )
+    _verify_data_frame(conn, df, column_names, test_env)
diff --git a/tests/test_9700_direct_path_load_async.py b/tests/test_9700_direct_path_load_async.py
index 812c455d..6dad285e 100644
--- a/tests/test_9700_direct_path_load_async.py
+++ b/tests/test_9700_direct_path_load_async.py
@@ -616,3 +616,44 @@ async def test_9721(empty_tab, async_conn, test_env):
             data=df,
         )
     await _verify_data_frame(async_conn, df, column_names, test_env)
+
+async def test_9722(empty_tab, disable_fetch_lobs, async_conn, test_env):
+    "9722 - test direct path load with non-ASCII characters"
+    column_names = ["Id", "FirstName", "City", "LongData"]
+    data = [
+        (1, "Café", "Zürich", "Ñoño résumé"),
+        (2, "naïve", "São Paulo", "El niño está aquí"),
+        (3, "Ärger", "Malmö", "Ça fait déjà vu"),
+    ]
+    async with test_env.get_connection_async() as other_conn:
+        await other_conn.direct_path_load(
+            schema_name=test_env.main_user,
+            table_name=TABLE_NAME,
+            column_names=column_names,
+            data=data,
+        )
+    await _verify_data(async_conn, data, column_names)
+
+
+async def test_9723(empty_tab, disable_fetch_lobs, async_conn, test_env):
+    "9723 - test direct path load with non-ASCII characters using data frame"
+    column_names = ["Id", "FirstName", "City", "LongData"]
+    data = {
+        "Id": [1, 2, 3],
+        "FirstName": ["Café", "naïve", "Ärger"],
+        "City": ["Zürich", "São Paulo", "Malmö"],
+        "LongData": [
+            "Ñoño résumé",
+            "El niño está aquí",
+            "Ça fait déjà vu",
+        ],
+    }
+    df = pandas.DataFrame(data)
+    async with test_env.get_connection_async() as other_conn:
+        await other_conn.direct_path_load(
+            schema_name=test_env.main_user,
+            table_name=TABLE_NAME,
+            column_names=column_names,
+            data=df,
+        )
+    await _verify_data_frame(async_conn, df, column_names, test_env)
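
A quick way to check whether a particular database was affected (any database
character set other than AL32UTF8), sketched here with placeholder credentials
and DSN:

    import oracledb

    conn = oracledb.connect(user="hr", password="...", dsn="localhost/orclpdb1")
    with conn.cursor() as cursor:
        cursor.execute(
            "select value from nls_database_parameters "
            "where parameter = 'NLS_CHARACTERSET'"
        )
        print(cursor.fetchone()[0])  # e.g. WE8MSWIN1252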