Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@ tests/ext/config.ini
samples/sample.csv
samples/sample.parquet
samples/notebooks/testwrite.csv
*.so
5 changes: 5 additions & 0 deletions src/oracledb/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,7 @@ def _raise_not_supported(feature: str) -> None:
ERR_UNSUPPORTED_INBAND_NOTIFICATION = 3008
ERR_SELF_BIND_NOT_SUPPORTED = 3009
ERR_SERVER_VERSION_NOT_SUPPORTED = 3010
ERR_DB_CS_NOT_SUPPORTED = 3011
ERR_NCHAR_CS_NOT_SUPPORTED = 3012
ERR_UNSUPPORTED_PYTHON_TYPE_FOR_DB_TYPE = 3013
ERR_LOB_OF_WRONG_TYPE = 3014
Expand Down Expand Up @@ -821,6 +822,10 @@ def _raise_not_supported(feature: str) -> None:
ERR_NAMED_TIMEZONE_NOT_SUPPORTED: (
"named time zones are not supported in thin mode"
),
ERR_DB_CS_NOT_SUPPORTED: (
"database character set id {charset_id} is not supported by "
"python-oracledb for direct path loading in thin mode"
),
ERR_NCHAR_CS_NOT_SUPPORTED: (
"national character set id {charset_id} is not supported by "
"python-oracledb in thin mode"
Expand Down
29 changes: 29 additions & 0 deletions src/oracledb/impl/thin/messages/direct_path_load_stream.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,9 @@ cdef class DirectPathLoadStreamMessage(Message):
PieceBuffer buf
OracleData data
ssize_t col_num
uint8_t ora_type_num
str db_charset_encoding
bytes temp_bytes

# create buffer used for writing column data
buf = PieceBuffer.__new__(PieceBuffer)
Expand All @@ -294,6 +297,13 @@ cdef class DirectPathLoadStreamMessage(Message):
all_rows = manager._get_all_rows()
arrays = manager._get_arrow_arrays()

# determine if character set conversion is needed; direct path load
# writes data directly to disk, bypassing the server's character set
# conversion, so data must be encoded in the database character set
db_charset_encoding = _get_db_charset_encoding(
self.conn_impl._protocol._caps.charset_id
)

# calculate pieces
for row_num in range(manager.num_rows):
overall_row_num = manager.offset + row_num
Expand All @@ -311,6 +321,25 @@ cdef class DirectPathLoadStreamMessage(Message):
col = convert_arrow_to_oracle_data(
metadata, &data, array_impl, <int64_t> overall_row_num
)

# for character data with CS_FORM_IMPLICIT, re-encode from
# UTF-8 to the database character set if needed
if db_charset_encoding is not None and not data.is_null:
ora_type_num = metadata.dbtype._ora_type_num
if ora_type_num in (ORA_TYPE_NUM_VARCHAR,
ORA_TYPE_NUM_CHAR,
ORA_TYPE_NUM_LONG) \
and metadata.dbtype._csfrm == CS_FORM_IMPLICIT:
temp_bytes = data.buffer.as_raw_bytes.ptr[
:data.buffer.as_raw_bytes.num_bytes
].decode().encode(db_charset_encoding)
cpython.PyBytes_AsStringAndSize(
temp_bytes,
<char**> &data.buffer.as_raw_bytes.ptr,
&data.buffer.as_raw_bytes.num_bytes
)
col = temp_bytes

buf.add_column_value(self.conn_impl, metadata, &data, col,
self.current_row_num)
buf.finish_row()
Expand Down
2 changes: 1 addition & 1 deletion src/oracledb/impl/thin/messages/direct_path_prepare.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ cdef class DirectPathPrepareMessage(Message):
metadata = Message._process_metadata(self, buf)
if metadata.dbtype._ora_type_num == ORA_TYPE_NUM_CLOB:
metadata.dbtype = DbType._from_ora_type_and_csfrm(
ORA_TYPE_NUM_LONG, CS_FORM_NCHAR
ORA_TYPE_NUM_LONG, metadata.dbtype._csfrm
)
elif metadata.dbtype._ora_type_num == ORA_TYPE_NUM_BLOB:
metadata.dbtype = DbType._from_ora_type_and_csfrm(
Expand Down
77 changes: 77 additions & 0 deletions src/oracledb/impl/thin/utils.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,83 @@ cdef int _check_cryptography() except -1:
errors._raise_err(errors.ERR_NO_CRYPTOGRAPHY_PACKAGE,
str(CRYPTOGRAPHY_IMPORT_ERROR))

# Mapping of Oracle character set IDs to Python encoding names. Used by
# direct path loading to encode character data in the database character set.
# For normal SQL execution the server performs the conversion, but for direct
# path loading the data is written directly to disk.
# Any database character set id missing from this table causes
# ERR_DB_CS_NOT_SUPPORTED to be raised (see _get_db_charset_encoding below).
cdef dict ORACLE_CHARSET_TO_PYTHON_ENCODING = {

    # ASCII
    1: "ascii", # US7ASCII

    # ISO 8859 series
    31: "iso-8859-1", # WE8ISO8859P1
    32: "iso-8859-2", # EE8ISO8859P2
    33: "iso-8859-3", # SE8ISO8859P3
    34: "iso-8859-4", # NEE8ISO8859P4
    35: "iso-8859-5", # CL8ISO8859P5
    36: "iso-8859-6", # AR8ISO8859P6
    37: "iso-8859-7", # EL8ISO8859P7
    38: "iso-8859-8", # IW8ISO8859P8
    39: "iso-8859-9", # WE8ISO8859P9
    40: "iso-8859-10", # NE8ISO8859P10
    41: "tis-620", # TH8TISASCII
    46: "iso-8859-15", # WE8ISO8859P15
    47: "iso-8859-13", # BLT8ISO8859P13

    # Windows code pages
    170: "cp1250", # EE8MSWIN1250
    171: "cp1251", # CL8MSWIN1251
    172: "cp1253", # EL8MSWIN1253
    173: "cp1254", # TR8MSWIN1254
    174: "cp1255", # IW8MSWIN1255
    175: "cp1256", # AR8MSWIN1256
    176: "cp1257", # BLT8MSWIN1257
    177: "cp1258", # VN8MSWIN1258
    178: "cp1252", # WE8MSWIN1252

    # DOS / PC code pages
    351: "cp850", # WE8PC850
    354: "cp437", # US8PC437
    368: "cp866", # RU8PC866
    382: "cp852", # EE8PC852

    # East Asian multi-byte
    829: "big5", # ZHT16BIG5
    830: "euc_kr", # KO16KSC5601
    831: "euc_jp", # JA16EUC
    832: "cp932", # JA16SJIS
    833: "cp932", # JA16SJISTILDE
    834: "euc_jp", # JA16EUCTILDE
    846: "gbk", # ZHS16GBK
    850: "big5hkscs", # ZHT16HKSCS
    852: "euc_kr", # KO16MSWIN949
    854: "big5", # ZHT16MSWIN950
    870: "gb18030", # ZHS32GB18030

    # Unicode
    # NOTE(review): Oracle UTF8 (871) is CESU-8, which encodes supplementary
    # characters differently from Python's utf-8 codec — confirm
    # supplementary-plane data round-trips correctly for this charset id.
    871: "utf-8", # UTF8 (CESU-8)
    873: "utf-8", # AL32UTF8
    # NOTE(review): AL16UTF16 is normally a *national* character set —
    # presumably unreachable as a database character set; verify.
    2000: "utf-16-be", # AL16UTF16
}


cdef str _get_db_charset_encoding(uint16_t charset_id):
    """
    Map an Oracle character set id to the name of the Python codec used to
    encode character data for direct path loading. Returns None when no
    re-encoding is required (the database character set is UTF-8); raises
    ERR_DB_CS_NOT_SUPPORTED when the id has no known Python equivalent.
    """
    cdef str codec_name
    if charset_id != TNS_CHARSET_UTF8:
        codec_name = ORACLE_CHARSET_TO_PYTHON_ENCODING.get(charset_id)
        if codec_name is None:
            # unknown character set: refuse rather than write corrupt data
            errors._raise_err(errors.ERR_DB_CS_NOT_SUPPORTED,
                              charset_id=charset_id)
        if codec_name != "utf-8":
            return codec_name
    return None


def init_thin_impl(package):
"""
Expand Down
39 changes: 39 additions & 0 deletions tests/test_9600_direct_path_load.py
Original file line number Diff line number Diff line change
Expand Up @@ -612,3 +612,42 @@ def test_9621(empty_tab, conn, test_env):
data=df,
)
_verify_data_frame(conn, df, column_names, test_env)

def test_9622(empty_tab, disable_fetch_lobs, conn, test_env):
    "9622 - test direct path load with non-ASCII characters"
    # rows contain accented characters that exercise character set
    # conversion during direct path load
    names = ["Id", "FirstName", "City", "LongData"]
    rows = [
        (1, "Café", "Zürich", "Ñoño résumé"),
        (2, "naïve", "São Paulo", "El niño está aquí"),
        (3, "Ärger", "Malmö", "Ça fait déjà vu"),
    ]
    conn.direct_path_load(
        schema_name=test_env.main_user,
        table_name=TABLE_NAME,
        column_names=names,
        data=rows,
    )
    _verify_data(conn, rows, names)


def test_9623(empty_tab, disable_fetch_lobs, conn, test_env):
    "9623 - test direct path load with non-ASCII characters using data frame"
    # same non-ASCII content as the tuple-based test, supplied as a
    # pandas DataFrame instead
    names = ["Id", "FirstName", "City", "LongData"]
    frame = pandas.DataFrame(
        {
            "Id": [1, 2, 3],
            "FirstName": ["Café", "naïve", "Ärger"],
            "City": ["Zürich", "São Paulo", "Malmö"],
            "LongData": [
                "Ñoño résumé",
                "El niño está aquí",
                "Ça fait déjà vu",
            ],
        }
    )
    conn.direct_path_load(
        schema_name=test_env.main_user,
        table_name=TABLE_NAME,
        column_names=names,
        data=frame,
    )
    _verify_data_frame(conn, frame, names, test_env)
41 changes: 41 additions & 0 deletions tests/test_9700_direct_path_load_async.py
Original file line number Diff line number Diff line change
Expand Up @@ -616,3 +616,44 @@ async def test_9721(empty_tab, async_conn, test_env):
data=df,
)
await _verify_data_frame(async_conn, df, column_names, test_env)

async def test_9722(empty_tab, disable_fetch_lobs, async_conn, test_env):
    "9722 - test direct path load with non-ASCII characters"
    # rows contain accented characters that exercise character set
    # conversion during direct path load
    names = ["Id", "FirstName", "City", "LongData"]
    rows = [
        (1, "Café", "Zürich", "Ñoño résumé"),
        (2, "naïve", "São Paulo", "El niño está aquí"),
        (3, "Ärger", "Malmö", "Ça fait déjà vu"),
    ]
    # load through a separate connection, then verify through async_conn
    async with test_env.get_connection_async() as other_conn:
        await other_conn.direct_path_load(
            schema_name=test_env.main_user,
            table_name=TABLE_NAME,
            column_names=names,
            data=rows,
        )
    await _verify_data(async_conn, rows, names)


async def test_9723(empty_tab, disable_fetch_lobs, async_conn, test_env):
    "9723 - test direct path load with non-ASCII characters using data frame"
    # same non-ASCII content as the tuple-based test, supplied as a
    # pandas DataFrame instead
    names = ["Id", "FirstName", "City", "LongData"]
    frame = pandas.DataFrame(
        {
            "Id": [1, 2, 3],
            "FirstName": ["Café", "naïve", "Ärger"],
            "City": ["Zürich", "São Paulo", "Malmö"],
            "LongData": [
                "Ñoño résumé",
                "El niño está aquí",
                "Ça fait déjà vu",
            ],
        }
    )
    # load through a separate connection, then verify through async_conn
    async with test_env.get_connection_async() as other_conn:
        await other_conn.direct_path_load(
            schema_name=test_env.main_user,
            table_name=TABLE_NAME,
            column_names=names,
            data=frame,
        )
    await _verify_data_frame(async_conn, frame, names, test_env)