diff --git a/cpp/cmake/modules/FindDepsArrowAdapter.cmake b/cpp/cmake/modules/FindDepsArrowAdapter.cmake index c9c797acd..da367b597 100644 --- a/cpp/cmake/modules/FindDepsArrowAdapter.cmake +++ b/cpp/cmake/modules/FindDepsArrowAdapter.cmake @@ -3,3 +3,25 @@ cmake_minimum_required(VERSION 3.7.2) # ARROW find_package(Arrow REQUIRED) include_directories(${ARROW_INCLUDE_DIR}) + +# Resolve Arrow link targets based on platform and vcpkg configuration. +# Sets CSP_ARROW_LINK_LIBS for use in target_link_libraries(). +# On Windows with vcpkg, also applies the ws2_32.dll fix and defines ARROW_STATIC. +if(WIN32) + if(CSP_USE_VCPKG) + set(CSP_ARROW_LINK_LIBS Arrow::arrow_static) + add_compile_definitions(ARROW_STATIC) + else() + # Until we manage to get the fix for ws2_32.dll in arrow-16 into conda, manually fix the error here + get_target_property(LINK_LIBS Arrow::arrow_shared INTERFACE_LINK_LIBRARIES) + string(REPLACE "ws2_32.dll" "ws2_32" FIXED_LINK_LIBS "${LINK_LIBS}") + set_target_properties(Arrow::arrow_shared PROPERTIES INTERFACE_LINK_LIBRARIES "${FIXED_LINK_LIBS}") + set(CSP_ARROW_LINK_LIBS arrow_shared) + endif() +else() + if(CSP_USE_VCPKG) + set(CSP_ARROW_LINK_LIBS arrow_static) + else() + set(CSP_ARROW_LINK_LIBS arrow) + endif() +endif() diff --git a/cpp/cmake/modules/FindDepsParquetAdapter.cmake b/cpp/cmake/modules/FindDepsParquetAdapter.cmake index d932139b2..f694b5ae3 100644 --- a/cpp/cmake/modules/FindDepsParquetAdapter.cmake +++ b/cpp/cmake/modules/FindDepsParquetAdapter.cmake @@ -1,13 +1,29 @@ cmake_minimum_required(VERSION 3.7.2) -# ARROW -find_package(Arrow REQUIRED) -include_directories(${ARROW_INCLUDE_DIR}) +# ARROW (reuse FindDepsArrowAdapter for find_package + link target resolution) +find_package(DepsArrowAdapter REQUIRED) # PARQUET find_package(Parquet REQUIRED) include_directories(${PARQUET_INCLUDE_DIR}) +# Resolve Parquet link targets based on platform and vcpkg configuration. +# Sets CSP_PARQUET_LINK_LIBS for use in target_link_libraries(). +if(WIN32) + if(CSP_USE_VCPKG) + set(CSP_PARQUET_LINK_LIBS Parquet::parquet_static) + add_compile_definitions(PARQUET_STATIC) + else() + set(CSP_PARQUET_LINK_LIBS parquet_shared) + endif() +else() + if(CSP_USE_VCPKG) + set(CSP_PARQUET_LINK_LIBS parquet_static) + else() + set(CSP_PARQUET_LINK_LIBS parquet) + endif() +endif() + # Other deps find_package(Thrift REQUIRED) find_package(Brotli REQUIRED) diff --git a/cpp/csp/adapters/CMakeLists.txt b/cpp/csp/adapters/CMakeLists.txt index 41a929dcf..67dc0025a 100644 --- a/cpp/csp/adapters/CMakeLists.txt +++ b/cpp/csp/adapters/CMakeLists.txt @@ -11,4 +11,5 @@ if(CSP_BUILD_WS_CLIENT_ADAPTER) add_subdirectory(websocket) endif() +add_subdirectory(arrow) add_subdirectory(utils) diff --git a/cpp/csp/adapters/arrow/ArrowFieldReader.cpp b/cpp/csp/adapters/arrow/ArrowFieldReader.cpp new file mode 100644 index 000000000..ac6a39519 --- /dev/null +++ b/cpp/csp/adapters/arrow/ArrowFieldReader.cpp @@ -0,0 +1,417 @@ +// Concrete FieldReader implementations for all scalar Arrow types. +// +// Most readers use LambdaReader — a single template that +// takes a read-one-row callable at construction. Only readers with +// extra state (EnumFromString, Dict*, NestedStruct) are separate classes. + +#include +#include +#include + +#include +#include +#include + +namespace csp::adapters::arrow +{ + +namespace +{ + +// Columnar bulk-read helper: dispatches fn(arr, row, struct*) for each row, +// skipping nulls when null_count > 0. +template +void readColumn( const ArrowArrayT & typed, std::vector & structs, int64_t numRows, Fn && fn ) +{ + if( typed.null_count() == 0 ) + for( int64_t i = 0; i < numRows; ++i ) + fn( typed, i, structs[i].get() ); + else + for( int64_t i = 0; i < numRows; ++i ) + if( typed.IsValid( i ) ) + fn( typed, i, structs[i].get() ); +} + +// Helper: compute nanosecond multiplier for a given arrow::TimeUnit +int64_t timeUnitMultiplier( ::arrow::TimeUnit::type unit ) +{ + switch( unit ) + { + case ::arrow::TimeUnit::SECOND: return csp::NANOS_PER_SECOND; + case ::arrow::TimeUnit::MILLI: return csp::NANOS_PER_MILLISECOND; + case ::arrow::TimeUnit::MICRO: return csp::NANOS_PER_MICROSECOND; + case ::arrow::TimeUnit::NANO: return 1LL; + } + CSP_THROW( TypeError, "Unexpected arrow TimeUnit: " << static_cast( unit ) ); +} + +// --- Generic lambda-based reader (covers Primitive, HalfFloat, StringLike, Nanos, Date) --- +// ReadFn signature: void(const ArrowArrayT &, int64_t row, Struct *) + +template +class LambdaReader final : public FieldReader +{ +public: + LambdaReader( const std::string & columnName, const StructFieldPtr & field, ReadFn fn ) + : FieldReader( columnName, field ), m_fn( std::move( fn ) ) {} + + void readAll( std::vector & structs, int64_t numRows ) override + { + readColumn( static_cast( *m_column ), structs, numRows, m_fn ); + m_row = numRows; + } + +protected: + void doReadNext( int64_t row, Struct * s ) override + { + auto & typed = static_cast( *m_column ); + if( typed.IsValid( row ) ) + m_fn( typed, row, s ); + } + +private: + ReadFn m_fn; +}; + +// Factory: creates a LambdaReader, deducing ReadFn type +template +std::unique_ptr makeReader( const std::string & name, const StructFieldPtr & field, ReadFn && fn ) +{ + return std::make_unique>>( name, field, std::forward( fn ) ); +} + +// Factory: primitive numeric reader (static_cast Value(i) to CspT) +template +std::unique_ptr makePrimitiveReader( const std::string & name, const StructFieldPtr & f ) +{ + return makeReader( name, f, [f]( auto & arr, int64_t i, Struct * s ) { + f -> setValue( s, static_cast( arr.Value( i ) ) ); + } ); +} + +// Factory: string/binary reader (GetView → std::string) +template +std::unique_ptr makeStringReader( const std::string & name, const StructFieldPtr & f ) +{ + return makeReader( name, f, [f]( auto & arr, int64_t i, Struct * s ) { + auto view = arr.GetView( i ); + f -> setValue( s, std::string( view.data(), view.size() ) ); + } ); +} + +// Factory: nanosecond-based temporal reader (Value * multiplier → CspT::fromNanoseconds) +template +std::unique_ptr makeNanosReader( const std::string & name, const StructFieldPtr & f, int64_t mult ) +{ + return makeReader( name, f, [f, mult]( auto & arr, int64_t i, Struct * s ) { + f -> setValue( s, CspT::fromNanoseconds( static_cast( arr.Value( i ) ) * mult ) ); + } ); +} + +// --- Enum from string column (needs m_enumMeta + m_tmpStr state) --- + +template +class EnumFromStringReader final : public FieldReader +{ +public: + EnumFromStringReader( const std::string & columnName, const StructFieldPtr & field ) + : FieldReader( columnName, field ), + m_enumMeta( std::static_pointer_cast( field -> type() ) -> meta() ) {} + + void readAll( std::vector & structs, int64_t numRows ) override + { + auto & typed = static_cast( *m_column ); + readColumn( typed, structs, numRows, [this]( auto & arr, int64_t i, Struct * s ) { + auto view = arr.GetView( i ); + m_tmpStr.assign( view.data(), view.size() ); + m_field -> setValue( s, m_enumMeta -> fromString( m_tmpStr.c_str() ) ); + } ); + m_row = numRows; + } + +protected: + void doReadNext( int64_t row, Struct * s ) override + { + auto & typed = static_cast( *m_column ); + if( typed.IsValid( row ) ) + { + auto view = typed.GetView( row ); + m_tmpStr.assign( view.data(), view.size() ); + m_field -> setValue( s, m_enumMeta -> fromString( m_tmpStr.c_str() ) ); + } + } + +private: + std::shared_ptr m_enumMeta; + mutable std::string m_tmpStr; +}; + +// --- Dictionary-encoded string --- + +class DictStringReader final : public FieldReader +{ +public: + using FieldReader::FieldReader; + + void readAll( std::vector & structs, int64_t numRows ) override + { + auto & typed = static_cast( *m_column ); + const auto * dict = &static_cast( *typed.dictionary() ); + readColumn( typed, structs, numRows, [this, dict]( auto & arr, int64_t i, Struct * s ) { + auto view = dict -> GetView( arr.GetValueIndex( i ) ); + m_field -> setValue( s, std::string( view.data(), view.size() ) ); + } ); + m_row = numRows; + } + +protected: + void doReadNext( int64_t row, Struct * s ) override + { + auto & typed = static_cast( *m_column ); + if( row == 0 ) + m_dict = &static_cast( *typed.dictionary() ); + if( typed.IsValid( row ) ) + { + auto view = m_dict -> GetView( typed.GetValueIndex( row ) ); + m_field -> setValue( s, std::string( view.data(), view.size() ) ); + } + } + +private: + const ::arrow::StringArray * m_dict = nullptr; +}; + +// --- Dictionary-encoded enum --- + +class DictEnumReader final : public FieldReader +{ +public: + DictEnumReader( const std::string & columnName, const StructFieldPtr & field ) + : FieldReader( columnName, field ), + m_enumMeta( std::static_pointer_cast( field -> type() ) -> meta() ) {} + + void readAll( std::vector & structs, int64_t numRows ) override + { + auto & typed = static_cast( *m_column ); + const auto * dict = &static_cast( *typed.dictionary() ); + readColumn( typed, structs, numRows, [this, dict]( auto & arr, int64_t i, Struct * s ) { + auto view = dict -> GetView( arr.GetValueIndex( i ) ); + m_tmpStr.assign( view.data(), view.size() ); + m_field -> setValue( s, m_enumMeta -> fromString( m_tmpStr.c_str() ) ); + } ); + m_row = numRows; + } + +protected: + void doReadNext( int64_t row, Struct * s ) override + { + auto & typed = static_cast( *m_column ); + if( row == 0 ) + m_dict = &static_cast( *typed.dictionary() ); + if( typed.IsValid( row ) ) + { + auto view = m_dict -> GetView( typed.GetValueIndex( row ) ); + m_tmpStr.assign( view.data(), view.size() ); + m_field -> setValue( s, m_enumMeta -> fromString( m_tmpStr.c_str() ) ); + } + } + +private: + std::shared_ptr m_enumMeta; + const ::arrow::StringArray * m_dict = nullptr; + mutable std::string m_tmpStr; +}; + +// --- Nested struct (recursive) --- + +class NestedStructReader final : public FieldReader +{ +public: + NestedStructReader( const std::string & columnName, const StructFieldPtr & field, + const std::shared_ptr<::arrow::DataType> & arrowType ) + : FieldReader( columnName, field ) + { + m_nestedMeta = std::static_pointer_cast( field -> type() ) -> meta(); + auto structType = std::static_pointer_cast<::arrow::StructType>( arrowType ); + + m_childReaders.reserve( structType -> num_fields() ); + for( int i = 0; i < structType -> num_fields(); ++i ) + { + auto childArrowField = structType -> field( i ); + auto childStructField = m_nestedMeta -> field( childArrowField -> name() ); + if( !childStructField ) + CSP_THROW( RuntimeException, "Nested arrow struct field '" << childArrowField -> name() + << "' not found on CSP struct type '" << m_nestedMeta -> name() << "'" ); + m_childIndices.push_back( i ); + m_childReaders.push_back( createFieldReader( childArrowField, childStructField ) ); + } + } + + void readAll( std::vector & structs, int64_t numRows ) override + { + auto & typed = static_cast( *m_column ); + for( size_t i = 0; i < m_childReaders.size(); ++i ) + m_childReaders[i] -> bindColumn( typed.field( m_childIndices[i] ).get() ); + + if( typed.null_count() == 0 ) + { + // Pre-allocate nested structs and let children use their columnar readAll paths + std::vector nested( numRows ); + for( int64_t i = 0; i < numRows; ++i ) + nested[i] = m_nestedMeta -> create(); + for( auto & child : m_childReaders ) + child -> readAll( nested, numRows ); + for( int64_t row = 0; row < numRows; ++row ) + m_field -> setValue( structs[row].get(), std::move( nested[row] ) ); + } + else + { + for( int64_t row = 0; row < numRows; ++row ) + { + if( typed.IsValid( row ) ) + { + StructPtr nested = m_nestedMeta -> create(); + for( auto & child : m_childReaders ) + child -> readNext( nested.get() ); + m_field -> setValue( structs[row].get(), std::move( nested ) ); + } + else + { + for( auto & child : m_childReaders ) + child -> skipNext(); + } + } + } + m_row = numRows; + } + +protected: + void doReadNext( int64_t row, Struct * s ) override + { + auto & typed = static_cast( *m_column ); + if( row == 0 ) + for( size_t i = 0; i < m_childReaders.size(); ++i ) + m_childReaders[i] -> bindColumn( typed.field( m_childIndices[i] ).get() ); + + if( typed.IsValid( row ) ) + { + StructPtr nested = m_nestedMeta -> create(); + for( auto & child : m_childReaders ) + child -> readNext( nested.get() ); + m_field -> setValue( s, std::move( nested ) ); + } + else + { + for( auto & child : m_childReaders ) + child -> skipNext(); + } + } + +private: + std::shared_ptr m_nestedMeta; + std::vector m_childIndices; + std::vector> m_childReaders; +}; + +} // anonymous namespace + +std::unique_ptr createFieldReader( + const std::shared_ptr<::arrow::Field> & arrowField, + const StructFieldPtr & structField ) +{ + bool isEnum = structField -> type() -> type() == CspType::Type::ENUM; + auto typeId = arrowField -> type() -> id(); + auto & name = arrowField -> name(); + auto & f = structField; + + switch( typeId ) + { + // --- Numeric --- + case ::arrow::Type::BOOL: return makePrimitiveReader( name, f ); + case ::arrow::Type::INT8: return makePrimitiveReader( name, f ); + case ::arrow::Type::INT16: return makePrimitiveReader( name, f ); + case ::arrow::Type::INT32: return makePrimitiveReader( name, f ); + case ::arrow::Type::INT64: return makePrimitiveReader( name, f ); + case ::arrow::Type::UINT8: return makePrimitiveReader( name, f ); + case ::arrow::Type::UINT16: return makePrimitiveReader( name, f ); + case ::arrow::Type::UINT32: return makePrimitiveReader( name, f ); + case ::arrow::Type::UINT64: return makePrimitiveReader( name, f ); + case ::arrow::Type::FLOAT: return makePrimitiveReader( name, f ); + case ::arrow::Type::DOUBLE: return makePrimitiveReader( name, f ); + + case ::arrow::Type::HALF_FLOAT: + return makeReader<::arrow::HalfFloatArray>( name, f, [f]( auto & arr, int64_t i, Struct * s ) { + f -> setValue( s, ::arrow::util::Float16::FromBits( arr.Value( i ) ).ToDouble() ); + } ); + + // --- String --- + case ::arrow::Type::STRING: + if( isEnum ) return std::make_unique>( name, f ); + return makeStringReader<::arrow::StringArray>( name, f ); + case ::arrow::Type::LARGE_STRING: + if( isEnum ) return std::make_unique>( name, f ); + return makeStringReader<::arrow::LargeStringArray>( name, f ); + + // --- Binary / bytes --- + case ::arrow::Type::BINARY: return makeStringReader<::arrow::BinaryArray>( name, f ); + case ::arrow::Type::LARGE_BINARY: return makeStringReader<::arrow::LargeBinaryArray>( name, f ); + case ::arrow::Type::FIXED_SIZE_BINARY: return makeStringReader<::arrow::FixedSizeBinaryArray>( name, f ); + + // --- Timestamp -> DateTime --- + case ::arrow::Type::TIMESTAMP: + { + auto mult = timeUnitMultiplier( std::static_pointer_cast<::arrow::TimestampType>( arrowField -> type() ) -> unit() ); + return makeNanosReader( name, f, mult ); + } + + // --- Duration -> TimeDelta --- + case ::arrow::Type::DURATION: + { + auto mult = timeUnitMultiplier( std::static_pointer_cast<::arrow::DurationType>( arrowField -> type() ) -> unit() ); + return makeNanosReader( name, f, mult ); + } + + // --- Date --- + case ::arrow::Type::DATE32: + return makeReader<::arrow::Date32Array>( name, f, [f]( auto & arr, int64_t i, Struct * s ) { + f -> setValue( s, DateTime::fromNanoseconds( static_cast( arr.Value( i ) ) * csp::NANOS_PER_DAY ).date() ); + } ); + case ::arrow::Type::DATE64: + return makeReader<::arrow::Date64Array>( name, f, [f]( auto & arr, int64_t i, Struct * s ) { + f -> setValue( s, DateTime::fromNanoseconds( arr.Value( i ) * csp::NANOS_PER_MILLISECOND ).date() ); + } ); + + // --- Time --- + case ::arrow::Type::TIME32: + { + auto mult = timeUnitMultiplier( std::static_pointer_cast<::arrow::Time32Type>( arrowField -> type() ) -> unit() ); + return makeNanosReader( name, f, mult ); + } + case ::arrow::Type::TIME64: + { + auto mult = timeUnitMultiplier( std::static_pointer_cast<::arrow::Time64Type>( arrowField -> type() ) -> unit() ); + return makeNanosReader( name, f, mult ); + } + + // --- Dictionary-encoded --- + case ::arrow::Type::DICTIONARY: + { + auto dictType = std::static_pointer_cast<::arrow::DictionaryType>( arrowField -> type() ); + if( dictType -> value_type() -> id() != ::arrow::Type::STRING ) + CSP_THROW( TypeError, "Unsupported dictionary value type " << dictType -> value_type() -> ToString() + << " for column '" << name << "'; only string dictionaries supported" ); + if( isEnum ) return std::make_unique( name, f ); + return std::make_unique( name, f ); + } + + // --- Nested struct --- + case ::arrow::Type::STRUCT: + return std::make_unique( name, f, arrowField -> type() ); + + default: + CSP_THROW( TypeError, "Unsupported arrow type " << arrowField -> type() -> ToString() + << " for column '" << name << "'" ); + } +} + +} diff --git a/cpp/csp/adapters/arrow/ArrowFieldReader.h b/cpp/csp/adapters/arrow/ArrowFieldReader.h new file mode 100644 index 000000000..953bec66b --- /dev/null +++ b/cpp/csp/adapters/arrow/ArrowFieldReader.h @@ -0,0 +1,94 @@ +// Per-column readers that extract values from Arrow arrays into CSP struct fields. +// +// FieldReader is the base class with non-virtual bindColumn()/readNext() for +// sequential row processing. Every reader targets exactly one struct field. +// Scalar readers use the single-column constructor; multi-column readers +// (e.g. NDArray with data + dims) use the multi-column constructor and +// override the virtual bindBatch(). + +#ifndef _IN_CSP_ADAPTERS_ARROW_ArrowFieldReader_H +#define _IN_CSP_ADAPTERS_ARROW_ArrowFieldReader_H + +#include +#include +#include +#include +#include +#include + +namespace csp::adapters::arrow +{ + +class FieldReader +{ +public: + virtual ~FieldReader() = default; + + // Constructor for single-column readers + FieldReader( const std::string & columnName, const StructFieldPtr & field ) + : m_field( field ), m_columnNames( { columnName } ) + { + } + + // Constructor for multi-column readers (e.g. NDArray with data + dims columns) + FieldReader( std::vector columnNames, const StructFieldPtr & field ) + : m_field( field ), m_columnNames( std::move( columnNames ) ) + { + } + + // Set primary column pointer and reset row counter. Non-virtual. + void bindColumn( const ::arrow::Array * column ) + { + m_column = column; + m_row = 0; + } + + // Batch-level bind for readers that need the full RecordBatch (e.g. multi-column + // numpy readers). Default does nothing; custom readers override this. + virtual void bindBatch( const ::arrow::RecordBatch & batch ) {} + + // Read the current row into the struct and advance to the next row. + void readNext( Struct * s ) + { + doReadNext( m_row, s ); + ++m_row; + } + + // Advance the row counter without reading (used to keep child readers in sync + // when a parent nested struct row is null). + void skipNext() + { + ++m_row; + } + + // Columnar bulk-read: read all rows for this column into pre-allocated structs. + // Default implementation loops over doReadNext(); concrete readers override + // with a null_count==0 fast path to skip per-row validity checks. + virtual void readAll( std::vector & structs, int64_t numRows ) + { + for( int64_t row = 0; row < numRows; ++row ) + doReadNext( row, structs[row].get() ); + m_row = numRows; + } + + // Column names consumed by this reader. + const std::vector & columnNames() const { return m_columnNames; } + +protected: + virtual void doReadNext( int64_t row, Struct * s ) = 0; + + StructFieldPtr m_field; + std::vector m_columnNames; + const ::arrow::Array * m_column = nullptr; + int64_t m_row = 0; +}; + +// Factory: create a scalar FieldReader for a given Arrow field + CSP struct field. +std::unique_ptr createFieldReader( + const std::shared_ptr<::arrow::Field> & arrowField, + const StructFieldPtr & structField +); + +} + +#endif diff --git a/cpp/csp/adapters/arrow/ArrowFieldWriter.cpp b/cpp/csp/adapters/arrow/ArrowFieldWriter.cpp new file mode 100644 index 000000000..0eacd8288 --- /dev/null +++ b/cpp/csp/adapters/arrow/ArrowFieldWriter.cpp @@ -0,0 +1,374 @@ +// Concrete FieldWriter implementations for all CSP scalar types. +// +// Fixed-length writers use UnsafeWriter — a single template +// that takes a value-extraction callable at construction (mirrors LambdaReader). +// Variable-length writers (StringLike, Enum) and NestedStruct are separate +// classes because they need safe Append (variable-length) or recursive logic. + +#include +#include +#include + +#include +#include + +namespace csp::adapters::arrow +{ + +#define ARROW_OK_OR_THROW( expr, msg ) \ + do { auto __s = ( expr ); if( !__s.ok() ) CSP_THROW( RuntimeException, msg << ": " << __s.ToString() ); } while(0) + +// --- Base class default implementations --- + +void FieldWriter::reserve( int64_t numRows ) +{ + ARROW_OK_OR_THROW( m_builder -> Reserve( numRows ), "Failed to reserve builder capacity" ); +} + +void FieldWriter::writeNext( const Struct * s ) +{ + if( m_field -> isSet( s ) ) + doWrite( s ); + else + writeNull(); +} + +void FieldWriter::writeAll( const std::vector & structs, int64_t offset, int64_t count ) +{ + for( int64_t i = offset; i < offset + count; ++i ) + writeNext( structs[i].get() ); +} + +void FieldWriter::writeNull() +{ + ARROW_OK_OR_THROW( m_builder -> AppendNull(), "Failed to append null" ); +} + +std::vector> FieldWriter::finish() +{ + std::shared_ptr<::arrow::Array> arr; + ARROW_OK_OR_THROW( m_builder -> Finish( &arr ), "Failed to finish array" ); + return { arr }; +} + +namespace +{ + +// --- Generic lambda-based writer for fixed-length types --- +// ValueFn signature: auto(const Struct *) — returns the value to UnsafeAppend/Append. +// Covers: all numeric primitives, bool, DateTime, TimeDelta, Time, Date. + +template +class UnsafeWriter final : public FieldWriter +{ +public: + UnsafeWriter( const std::string & columnName, const StructFieldPtr & field, + std::shared_ptr typedBuilder, + std::shared_ptr<::arrow::DataType> dataType, ValueFn fn ) + : FieldWriter( columnName, field, typedBuilder, std::move( dataType ) ), + m_typedBuilder( typedBuilder.get() ), m_fn( std::move( fn ) ) {} + + void writeAll( const std::vector & structs, int64_t offset, int64_t count ) override + { + for( int64_t i = offset; i < offset + count; ++i ) + { + const Struct * s = structs[i].get(); + if( m_field -> isSet( s ) ) + m_typedBuilder -> UnsafeAppend( m_fn( s ) ); + else + m_typedBuilder -> UnsafeAppendNull(); + } + } + +protected: + void doWrite( const Struct * s ) override + { + ARROW_OK_OR_THROW( m_typedBuilder -> Append( m_fn( s ) ), "Failed to append value" ); + } + +private: + ArrowBuilderT * m_typedBuilder; + ValueFn m_fn; +}; + +// Factory: creates an UnsafeWriter, deducing ValueFn type, and returns CreatedFieldWriter +template +CreatedFieldWriter makeUnsafeWriter( const std::string & name, const StructFieldPtr & field, + std::shared_ptr builder, + std::shared_ptr<::arrow::DataType> dataType, ValueFn && fn ) +{ + auto w = std::make_unique>>( + name, field, builder, std::move( dataType ), std::forward( fn ) ); + return { std::move( w ), std::move( builder ) }; +} + +// Factory: primitive numeric writer (auto-creates builder from default constructor) +template +CreatedFieldWriter makePrimitiveWriter( const std::string & name, const StructFieldPtr & f ) +{ + auto b = std::make_shared(); + return makeUnsafeWriter( name, f, b, b -> type(), [f]( const Struct * s ) { + return static_cast( f -> value( s ) ); + } ); +} + +// Factory: nanosecond-based temporal writer (DateTime, TimeDelta, Time) +template +CreatedFieldWriter makeNanosWriter( const std::string & name, const StructFieldPtr & f, + std::shared_ptr<::arrow::DataType> dataType ) +{ + auto b = std::make_shared( dataType, ::arrow::default_memory_pool() ); + return makeUnsafeWriter( name, f, b, std::move( dataType ), [f]( const Struct * s ) { + return f -> value( s ).asNanoseconds(); + } ); +} + +// --- String / Bytes writer (variable-length: needs safe Append) --- + +template +class StringLikeWriter final : public FieldWriter +{ +public: + StringLikeWriter( const std::string & columnName, const StructFieldPtr & field, + std::shared_ptr<::arrow::DataType> dataType ) + : FieldWriter( columnName, field, std::make_shared(), std::move( dataType ) ), + m_typedBuilder( static_cast( m_builder.get() ) ) {} + + void writeAll( const std::vector & structs, int64_t offset, int64_t count ) override + { + for( int64_t i = offset; i < offset + count; ++i ) + { + const Struct * s = structs[i].get(); + if( m_field -> isSet( s ) ) + { + auto & val = m_field -> value( s ); + ARROW_OK_OR_THROW( m_typedBuilder -> Append( val.c_str(), val.length() ), "Failed to append string/bytes" ); + } + else + ARROW_OK_OR_THROW( m_typedBuilder -> AppendNull(), "Failed to append null" ); + } + } + +protected: + void doWrite( const Struct * s ) override + { + auto & val = m_field -> value( s ); + ARROW_OK_OR_THROW( m_typedBuilder -> Append( val.c_str(), val.length() ), "Failed to append string/bytes" ); + } + +private: + ArrowBuilderT * m_typedBuilder; +}; + +// --- Enum writer (variable-length string: CspEnum → name()) --- + +class EnumWriter final : public FieldWriter +{ +public: + EnumWriter( const std::string & columnName, const StructFieldPtr & field ) + : FieldWriter( columnName, field, std::make_shared<::arrow::StringBuilder>(), ::arrow::utf8() ), + m_typedBuilder( static_cast<::arrow::StringBuilder *>( m_builder.get() ) ) {} + + void writeAll( const std::vector & structs, int64_t offset, int64_t count ) override + { + for( int64_t i = offset; i < offset + count; ++i ) + { + const Struct * s = structs[i].get(); + if( m_field -> isSet( s ) ) + { + auto & n = m_field -> value( s ).name(); + ARROW_OK_OR_THROW( m_typedBuilder -> Append( n.c_str(), n.length() ), "Failed to append enum" ); + } + else + ARROW_OK_OR_THROW( m_typedBuilder -> AppendNull(), "Failed to append null" ); + } + } + +protected: + void doWrite( const Struct * s ) override + { + auto & n = m_field -> value( s ).name(); + ARROW_OK_OR_THROW( m_typedBuilder -> Append( n.c_str(), n.length() ), "Failed to append enum" ); + } + +private: + ::arrow::StringBuilder * m_typedBuilder; +}; + +// --- Nested struct writer (recursive) --- + +class NestedStructWriter final : public FieldWriter +{ +public: + NestedStructWriter( const std::string & columnName, const StructFieldPtr & field, + std::shared_ptr<::arrow::StructBuilder> structBuilder, + std::shared_ptr<::arrow::DataType> structType, + std::vector> childWriters ) + : FieldWriter( columnName, field, structBuilder, std::move( structType ) ), + m_structBuilder( structBuilder.get() ), + m_childWriters( std::move( childWriters ) ) {} + + void reserve( int64_t numRows ) override + { + ARROW_OK_OR_THROW( m_builder -> Reserve( numRows ), "Failed to reserve builder capacity" ); + for( auto & cw : m_childWriters ) + cw -> reserve( numRows ); + } + + void writeNull() override + { + for( auto & cw : m_childWriters ) + cw -> writeNull(); + ARROW_OK_OR_THROW( m_structBuilder -> AppendNull(), "Failed to append null struct" ); + } + + void writeAll( const std::vector & structs, int64_t offset, int64_t count ) override + { + // Check if any parent struct has a null nested value + bool hasNulls = false; + for( int64_t i = offset; i < offset + count && !hasNulls; ++i ) + hasNulls = !m_field -> isSet( structs[i].get() ); + + if( !hasNulls ) + { + // Fast path: all nested values are set — columnar child writes + std::vector nested( count ); + for( int64_t i = 0; i < count; ++i ) + nested[i] = m_field -> value( structs[offset + i].get() ); + for( auto & cw : m_childWriters ) + cw -> writeAll( nested, 0, count ); + ARROW_OK_OR_THROW( m_structBuilder -> AppendValues( count, nullptr ), "Failed to append struct validity" ); + } + else + { + for( int64_t i = offset; i < offset + count; ++i ) + writeNext( structs[i].get() ); + } + } + +protected: + void doWrite( const Struct * s ) override + { + auto & nested = m_field -> value( s ); + for( auto & cw : m_childWriters ) + cw -> writeNext( nested.get() ); + ARROW_OK_OR_THROW( m_structBuilder -> Append(), "Failed to append struct" ); + } + +private: + ::arrow::StructBuilder * m_structBuilder; + std::vector> m_childWriters; +}; + +// --- Factory helpers --- + +bool isBytesField( const StructFieldPtr & field ) +{ + if( field -> type() -> type() != CspType::Type::STRING ) + return false; + auto strType = std::static_pointer_cast( field -> type() ); + return strType && strType -> isBytes(); +} + +template +CreatedFieldWriter makeWriter( Args &&... args ) +{ + auto w = std::make_unique( std::forward( args )... ); + auto b = w -> builder(); + return { std::move( w ), std::move( b ) }; +} + +} // anonymous namespace + +CreatedFieldWriter createFieldWriter( + const std::string & columnName, + const StructFieldPtr & structField ) +{ + auto & f = structField; + + switch( f -> type() -> type() ) + { + // --- Numeric --- + case CspType::Type::BOOL: + { + auto b = std::make_shared<::arrow::BooleanBuilder>(); + return makeUnsafeWriter( columnName, f, b, ::arrow::boolean(), + [f]( const Struct * s ) { return f -> value( s ); } ); + } + case CspType::Type::INT8: return makePrimitiveWriter( columnName, f ); + case CspType::Type::INT16: return makePrimitiveWriter( columnName, f ); + case CspType::Type::INT32: return makePrimitiveWriter( columnName, f ); + case CspType::Type::INT64: return makePrimitiveWriter( columnName, f ); + case CspType::Type::UINT8: return makePrimitiveWriter( columnName, f ); + case CspType::Type::UINT16: return makePrimitiveWriter( columnName, f ); + case CspType::Type::UINT32: return makePrimitiveWriter( columnName, f ); + case CspType::Type::UINT64: return makePrimitiveWriter( columnName, f ); + case CspType::Type::DOUBLE: return makePrimitiveWriter( columnName, f ); + + // --- String / Bytes --- + case CspType::Type::STRING: + if( isBytesField( f ) ) + return makeWriter>( columnName, f, ::arrow::binary() ); + return makeWriter>( columnName, f, ::arrow::utf8() ); + + case CspType::Type::ENUM: return makeWriter( columnName, f ); + + // --- Temporal --- + case CspType::Type::DATETIME: + return makeNanosWriter( + columnName, f, std::make_shared<::arrow::TimestampType>( ::arrow::TimeUnit::NANO, "UTC" ) ); + case CspType::Type::TIMEDELTA: + return makeNanosWriter( + columnName, f, std::make_shared<::arrow::DurationType>( ::arrow::TimeUnit::NANO ) ); + case CspType::Type::TIME: + return makeNanosWriter( + columnName, f, std::make_shared<::arrow::Time64Type>( ::arrow::TimeUnit::NANO ) ); + + // --- Date (days since epoch) --- + case CspType::Type::DATE: + { + auto b = std::make_shared<::arrow::Date32Builder>(); + return makeUnsafeWriter( columnName, f, b, ::arrow::date32(), [f]( const Struct * s ) { + auto & d = f -> value( s ); + return static_cast( DateTime( d.year(), d.month(), d.day() ).asNanoseconds() / csp::NANOS_PER_DAY ); + } ); + } + + // --- Nested struct --- + case CspType::Type::STRUCT: + { + auto nestedMeta = std::static_pointer_cast( f -> type() ) -> meta(); + + std::vector> arrowFields; + std::vector> childBuilders; + std::vector> childWriters; + + // Use fieldNames() for stable insertion order (fields() is sorted for memory layout) + for( auto & subFieldName : nestedMeta -> fieldNames() ) + { + auto subField = nestedMeta -> field( subFieldName ); + auto child = createFieldWriter( subFieldName, subField ); + arrowFields.push_back( std::make_shared<::arrow::Field>( subFieldName, child.writer -> dataTypes()[0] ) ); + childBuilders.push_back( std::move( child.builder ) ); + childWriters.push_back( std::move( child.writer ) ); + } + + auto structType = std::make_shared<::arrow::StructType>( arrowFields ); + auto structBuilder = std::make_shared<::arrow::StructBuilder>( + structType, ::arrow::default_memory_pool(), childBuilders ); + + auto w = std::make_unique( + columnName, f, structBuilder, + std::static_pointer_cast<::arrow::DataType>( structType ), std::move( childWriters ) ); + return { std::move( w ), std::move( structBuilder ) }; + } + + default: + CSP_THROW( TypeError, "Unsupported CSP type " << f -> type() -> type() + << " for field '" << columnName << "'" ); + } +} + +#undef ARROW_OK_OR_THROW + +} diff --git a/cpp/csp/adapters/arrow/ArrowFieldWriter.h b/cpp/csp/adapters/arrow/ArrowFieldWriter.h new file mode 100644 index 000000000..73c85f99c --- /dev/null +++ b/cpp/csp/adapters/arrow/ArrowFieldWriter.h @@ -0,0 +1,103 @@ +// Per-column writers that serialize CSP struct field values into Arrow array builders. +// +// FieldWriter is the base class. Every writer targets exactly one struct field. +// Scalar writers use the single-column constructor with a builder; multi-column +// writers (e.g. NDArray with data + dims) use the multi-column constructor. +// The non-virtual writeNext() checks isSet and delegates to the virtual doWrite(). + +#ifndef _IN_CSP_ADAPTERS_ARROW_ArrowFieldWriter_H +#define _IN_CSP_ADAPTERS_ARROW_ArrowFieldWriter_H + +#include +#include +#include +#include +#include +#include + +namespace csp::adapters::arrow +{ + +class FieldWriter +{ +public: + virtual ~FieldWriter() = default; + + // Pre-allocate builder capacity for numRows. + // Default reserves on m_builder; override for multi-builder writers. + virtual void reserve( int64_t numRows ); + + // Write one struct's field value into the builder. + // Non-virtual: checks isSet, delegates to doWrite() or appendNull(). + void writeNext( const Struct * s ); + + // Columnar bulk-write: write a range of structs into the builder. + // Default loops over writeNext(); concrete writers override with tight loops. + virtual void writeAll( const std::vector & structs, int64_t offset, int64_t count ); + + // Write a null value (used by nested struct writer when parent is null). + // Default appends null to m_builder; NestedStructWriter overrides. + virtual void writeNull(); + + // Finalize and return the built arrays. + // Default finishes m_builder and returns a single array. + virtual std::vector> finish(); + + // Column names produced by this writer. + const std::vector & columnNames() const { return m_columnNames; } + + // Arrow data types per column. + const std::vector> & dataTypes() const { return m_dataTypes; } + + // Access the primary builder (needed by NestedStructWriter for StructBuilder construction). + const std::shared_ptr<::arrow::ArrayBuilder> & builder() const { return m_builder; } + + // Constructor for scalar writers (single column, single struct field, one builder) + FieldWriter( const std::string & columnName, + const StructFieldPtr & field, + std::shared_ptr<::arrow::ArrayBuilder> builder, + std::shared_ptr<::arrow::DataType> dataType ) + : m_field( field ), + m_builder( std::move( builder ) ), + m_columnNames( { columnName } ), + m_dataTypes( { std::move( dataType ) } ) + { + } + + // Constructor for multi-column writers (e.g. NDArray with data + dims columns) + FieldWriter( std::vector columnNames, + std::vector> dataTypes, + const StructFieldPtr & field ) + : m_field( field ), + m_columnNames( std::move( columnNames ) ), + m_dataTypes( std::move( dataTypes ) ) + { + } + +protected: + // Write the field value when it is set. Concrete scalar writers implement this. + virtual void doWrite( const Struct * s ) = 0; + + StructFieldPtr m_field; + std::shared_ptr<::arrow::ArrayBuilder> m_builder; + std::vector m_columnNames; + std::vector> m_dataTypes; +}; + +// Return type for createFieldWriter: the writer plus its primary builder +// (the builder is needed by nested struct writer to construct StructBuilder with child builders) +struct CreatedFieldWriter +{ + std::unique_ptr writer; + std::shared_ptr<::arrow::ArrayBuilder> builder; +}; + +// Factory: given column name + struct field, produce a FieldWriter with its builder +CreatedFieldWriter createFieldWriter( + const std::string & columnName, + const StructFieldPtr & structField +); + +} + +#endif diff --git a/cpp/csp/adapters/arrow/CMakeLists.txt b/cpp/csp/adapters/arrow/CMakeLists.txt new file mode 100644 index 000000000..3ff0bb4e7 --- /dev/null +++ b/cpp/csp/adapters/arrow/CMakeLists.txt @@ -0,0 +1,25 @@ +set(ARROW_ADAPTER_HEADER_FILES + ArrowFieldReader.h + ArrowFieldWriter.h + RecordBatchToStruct.h + StructToRecordBatch.h +) + +set(ARROW_ADAPTER_SOURCE_FILES + ArrowFieldReader.cpp + ArrowFieldWriter.cpp + RecordBatchToStruct.cpp + StructToRecordBatch.cpp + ${ARROW_ADAPTER_HEADER_FILES} +) + +add_library(csp_arrow_adapter STATIC ${ARROW_ADAPTER_SOURCE_FILES}) +set_target_properties(csp_arrow_adapter PROPERTIES PUBLIC_HEADER "${ARROW_ADAPTER_HEADER_FILES}") + +target_link_libraries(csp_arrow_adapter PRIVATE csp_core csp_types csp_engine ${CSP_ARROW_LINK_LIBS}) + +install(TARGETS csp_arrow_adapter + PUBLIC_HEADER DESTINATION include/csp/adapters/arrow + RUNTIME DESTINATION ${CSP_RUNTIME_INSTALL_SUBDIR} + LIBRARY DESTINATION lib/ + ) diff --git a/cpp/csp/adapters/arrow/RecordBatchToStruct.cpp b/cpp/csp/adapters/arrow/RecordBatchToStruct.cpp new file mode 100644 index 000000000..5ba2002ff --- /dev/null +++ b/cpp/csp/adapters/arrow/RecordBatchToStruct.cpp @@ -0,0 +1,93 @@ +// Implementation of RecordBatchToStructConverter. + +#include +#include + +#include + +#include + +namespace csp::adapters::arrow +{ + +namespace +{ + +// Helper to resolve column name -> struct field name mapping +// If fieldMap is null, column name = field name (identity mapping) +std::string resolveFieldName( const DictionaryPtr & fieldMap, const std::string & columnName ) +{ + if( !fieldMap ) + return columnName; + + std::string fieldName; + if( fieldMap -> tryGet( columnName, fieldName ) ) + return fieldName; + + return columnName; +} + +} // anonymous namespace + +RecordBatchToStructConverter::RecordBatchToStructConverter( + const std::shared_ptr<::arrow::Schema> & schema, + const std::shared_ptr & structMeta, + const DictionaryPtr & fieldMap, + std::vector> customReaders ) + : m_structMeta( structMeta ) +{ + // Build a set of column names handled by custom readers so we skip them in the scalar loop + std::unordered_set customColumnNames; + for( auto & cr : customReaders ) + for( auto & name : cr -> columnNames() ) + customColumnNames.insert( name ); + + // Build scalar field readers from schema fields + for( int i = 0; i < schema -> num_fields(); ++i ) + { + auto arrowField = schema -> field( i ); + + // Skip columns handled by custom readers + if( customColumnNames.count( arrowField -> name() ) ) + continue; + + // Skip columns that don't have a matching struct field + std::string fieldName = resolveFieldName( fieldMap, arrowField -> name() ); + auto structField = structMeta -> field( fieldName ); + if( !structField ) + continue; + + m_scalarReaders.push_back( { createFieldReader( arrowField, structField ), i } ); + } + + // Store custom readers separately + m_customReaders = std::move( customReaders ); +} + +std::vector RecordBatchToStructConverter::convert( const ::arrow::RecordBatch & batch ) +{ + int64_t numRows = batch.num_rows(); + + // Phase 1: pre-allocate all structs + std::vector result; + result.reserve( numRows ); + for( int64_t i = 0; i < numRows; ++i ) + result.push_back( m_structMeta -> create() ); + + // Phase 2: columnar read — one readAll() call per column + for( auto & entry : m_scalarReaders ) + { + entry.reader -> bindColumn( batch.column( entry.columnIndex ).get() ); + entry.reader -> readAll( result, numRows ); + } + + for( auto & reader : m_customReaders ) + { + reader -> bindBatch( batch ); + reader -> readAll( result, numRows ); + } + + return result; +} + +} diff --git a/cpp/csp/adapters/arrow/RecordBatchToStruct.h b/cpp/csp/adapters/arrow/RecordBatchToStruct.h new file mode 100644 index 000000000..e851792ff --- /dev/null +++ b/cpp/csp/adapters/arrow/RecordBatchToStruct.h @@ -0,0 +1,52 @@ +// Converts Arrow RecordBatches into csp::Struct instances. +// +// RecordBatchToStructConverter maps Arrow columns to CSP struct fields using +// FieldReader subclasses. Scalar readers are auto-detected from the schema; +// additional readers (e.g. numpy array readers) can be injected at construction. + +#ifndef _IN_CSP_ADAPTERS_ARROW_RecordBatchToStruct_H +#define _IN_CSP_ADAPTERS_ARROW_RecordBatchToStruct_H + +#include +#include +#include +#include +#include +#include + +namespace csp::adapters::arrow +{ + +class RecordBatchToStructConverter +{ +public: + // schema: Arrow schema describing the RecordBatch columns + // structMeta: CSP StructMeta describing the target struct type + // fieldMap: optional column->field name mapping (null = match by column name) + // customReaders: additional readers for columns that need non-scalar handling (e.g. numpy arrays); + // columns claimed by these readers are excluded from scalar auto-detection + RecordBatchToStructConverter( + const std::shared_ptr<::arrow::Schema> & schema, + const std::shared_ptr & structMeta, + const DictionaryPtr & fieldMap = nullptr, + std::vector> customReaders = {} + ); + + // Convert all rows from a RecordBatch into a vector of CSP Structs + std::vector convert( const ::arrow::RecordBatch & batch ); + +private: + struct ScalarReaderEntry + { + std::unique_ptr reader; + int columnIndex; + }; + + std::shared_ptr m_structMeta; + std::vector m_scalarReaders; + std::vector> m_customReaders; +}; + +} + +#endif diff --git a/cpp/csp/adapters/arrow/StructToRecordBatch.cpp b/cpp/csp/adapters/arrow/StructToRecordBatch.cpp new file mode 100644 index 000000000..526737c11 --- /dev/null +++ b/cpp/csp/adapters/arrow/StructToRecordBatch.cpp @@ -0,0 +1,109 @@ +// Implementation of StructToRecordBatchConverter. + +#include +#include + +#include +#include + +namespace csp::adapters::arrow +{ + +StructToRecordBatchConverter::StructToRecordBatchConverter( + const std::shared_ptr & structMeta, + const DictionaryPtr & fieldMap, + std::vector> customWriters ) + : m_structMeta( structMeta ) +{ + std::vector> arrowFields; + + if( fieldMap ) + { + // When fieldMap is provided, only include fields listed in it + for( auto it = fieldMap -> begin(); it != fieldMap -> end(); ++it ) + { + auto fieldName = it.key(); + auto colName = it.value(); + auto structField = structMeta -> field( fieldName ); + if( !structField ) + continue; + + // Skip DIALECT_GENERIC fields (handled by custom writers) + if( structField -> type() -> type() == CspType::Type::DIALECT_GENERIC ) + continue; + + auto created = createFieldWriter( colName, structField ); + for( auto & dt : created.writer -> dataTypes() ) + arrowFields.push_back( std::make_shared<::arrow::Field>( colName, dt ) ); + m_writers.push_back( std::move( created.writer ) ); + } + } + else + { + // No fieldMap: include all non-DIALECT_GENERIC fields using fieldNames() for stable insertion order + // (fields() is sorted by type/size for memory layout optimization, not insertion order) + for( auto & fieldName : structMeta -> fieldNames() ) + { + auto structField = structMeta -> field( fieldName ); + if( !structField || structField -> type() -> type() == CspType::Type::DIALECT_GENERIC ) + continue; + + auto created = createFieldWriter( fieldName, structField ); + for( auto & dt : created.writer -> dataTypes() ) + arrowFields.push_back( std::make_shared<::arrow::Field>( fieldName, dt ) ); + m_writers.push_back( std::move( created.writer ) ); + } + } + + // Append custom writers and their columns to schema + for( auto & cw : customWriters ) + { + auto & names = cw -> columnNames(); + auto & types = cw -> dataTypes(); + CSP_TRUE_OR_THROW_RUNTIME( names.size() == types.size(), + "FieldWriter columnNames and dataTypes must have the same size" ); + for( size_t i = 0; i < names.size(); ++i ) + arrowFields.push_back( std::make_shared<::arrow::Field>( names[i], types[i] ) ); + m_writers.push_back( std::move( cw ) ); + } + + m_schema = std::make_shared<::arrow::Schema>( arrowFields ); +} + +std::vector> StructToRecordBatchConverter::convert( + const std::vector & structs, int64_t maxBatchSize ) +{ + int64_t totalRows = static_cast( structs.size() ); + if( maxBatchSize <= 0 ) + maxBatchSize = totalRows; + + std::vector> batches; + + for( int64_t offset = 0; offset < totalRows; offset += maxBatchSize ) + { + int64_t chunkRows = std::min( maxBatchSize, totalRows - offset ); + + // Columnar write: reserve + writeAll per column keeps builder memory cache-hot + for( auto & writer : m_writers ) + { + writer -> reserve( chunkRows ); + writer -> writeAll( structs, offset, chunkRows ); + } + + std::vector> arrays; + arrays.reserve( m_schema -> num_fields() ); + for( auto & writer : m_writers ) + { + auto writerArrays = writer -> finish(); + arrays.insert( arrays.end(), + std::make_move_iterator( writerArrays.begin() ), + std::make_move_iterator( writerArrays.end() ) ); + } + + batches.push_back( ::arrow::RecordBatch::Make( m_schema, chunkRows, std::move( arrays ) ) ); + } + + return batches; +} + +} diff --git a/cpp/csp/adapters/arrow/StructToRecordBatch.h b/cpp/csp/adapters/arrow/StructToRecordBatch.h new file mode 100644 index 000000000..27a7b8ead --- /dev/null +++ b/cpp/csp/adapters/arrow/StructToRecordBatch.h @@ -0,0 +1,50 @@ +// Converts csp::Struct instances into Arrow RecordBatches. +// +// StructToRecordBatchConverter maps CSP struct fields to Arrow columns using +// FieldWriter subclasses. Scalar writers are auto-detected from the struct +// metadata; additional writers (e.g. numpy array writers) can be injected at +// construction. Mirrors RecordBatchToStruct in the read direction. + +#ifndef _IN_CSP_ADAPTERS_ARROW_StructToRecordBatch_H +#define _IN_CSP_ADAPTERS_ARROW_StructToRecordBatch_H + +#include +#include +#include +#include +#include +#include + +namespace csp::adapters::arrow +{ + +class StructToRecordBatchConverter +{ +public: + // structMeta: CSP StructMeta describing the source struct type + // fieldMap: optional field->column name mapping (null = match by name, include all non-DIALECT_GENERIC fields) + // when provided, only scalar fields listed in fieldMap are included + // customWriters: additional writers for fields that need non-scalar handling (e.g. numpy arrays) + StructToRecordBatchConverter( + const std::shared_ptr & structMeta, + const DictionaryPtr & fieldMap = nullptr, + std::vector> customWriters = {} + ); + + // Convert a vector of CSP Structs into one or more RecordBatches. + // maxBatchSize controls the maximum number of rows per batch (0 = no limit). + std::vector> convert( const std::vector & structs, + int64_t maxBatchSize = 0 ); + + // Get the Arrow schema for the output + const std::shared_ptr<::arrow::Schema> & schema() const { return m_schema; } + +private: + std::shared_ptr m_structMeta; + std::shared_ptr<::arrow::Schema> m_schema; + std::vector> m_writers; +}; + +} + +#endif diff --git a/cpp/csp/adapters/parquet/CMakeLists.txt b/cpp/csp/adapters/parquet/CMakeLists.txt index 0d029c397..528ebd588 100644 --- a/cpp/csp/adapters/parquet/CMakeLists.txt +++ b/cpp/csp/adapters/parquet/CMakeLists.txt @@ -43,37 +43,7 @@ add_library(csp_parquet_adapter STATIC ${PARQUET_SOURCE_FILES}) set_target_properties(csp_parquet_adapter PROPERTIES PUBLIC_HEADER "${PARQUET_HEADER_FILES}") target_include_directories(csp_parquet_adapter PRIVATE ${ARROW_INCLUDE_DIR} ${PARQUET_INCLUDE_DIR}) -find_package(Arrow REQUIRED) -find_package(Parquet REQUIRED) -find_package(Thrift REQUIRED) -find_package(lz4 REQUIRED) -find_package(utf8proc REQUIRED) -find_package(Brotli REQUIRED) - -if(WIN32) - if(CSP_USE_VCPKG) - set(ARROW_PACKAGES_TO_LINK Arrow::arrow_static Parquet::parquet_static ) - target_compile_definitions(csp_parquet_adapter PUBLIC ARROW_STATIC) - target_compile_definitions(csp_parquet_adapter PUBLIC PARQUET_STATIC) - else() - # use dynamic variants - # Until we manage to get the fix for ws3_32.dll in arrow-16 into conda, manually fix the error here - get_target_property(LINK_LIBS Arrow::arrow_shared INTERFACE_LINK_LIBRARIES) - string(REPLACE "ws2_32.dll" "ws2_32" FIXED_LINK_LIBS "${LINK_LIBS}") - set_target_properties(Arrow::arrow_shared PROPERTIES INTERFACE_LINK_LIBRARIES "${FIXED_LINK_LIBS}") - set(ARROW_PACKAGES_TO_LINK parquet_shared arrow_shared) - endif() -else() - if(CSP_USE_VCPKG) - # use static variants - set(ARROW_PACKAGES_TO_LINK parquet_static arrow_static) - else() - # use dynamic variants - set(ARROW_PACKAGES_TO_LINK parquet arrow) - endif() -endif() - -target_link_libraries(csp_parquet_adapter PRIVATE csp_adapter_utils thrift::thrift lz4::lz4 utf8proc::utf8proc ${BROTLI_STATIC_LIB} ${ARROW_PACKAGES_TO_LINK}) +target_link_libraries(csp_parquet_adapter PRIVATE csp_adapter_utils thrift::thrift lz4::lz4 utf8proc::utf8proc ${BROTLI_STATIC_LIB} ${CSP_PARQUET_LINK_LIBS} ${CSP_ARROW_LINK_LIBS}) install(TARGETS csp_parquet_adapter PUBLIC_HEADER DESTINATION include/csp/adapters/parquet diff --git a/cpp/csp/python/CMakeLists.txt b/cpp/csp/python/CMakeLists.txt index 195b99332..beadc2bce 100644 --- a/cpp/csp/python/CMakeLists.txt +++ b/cpp/csp/python/CMakeLists.txt @@ -90,7 +90,8 @@ target_compile_definitions(cspimpl PRIVATE CSPIMPL_EXPORTS=1) ## Baselib c++ module add_library(cspbaselibimpl SHARED cspbaselibimpl.cpp) -target_link_libraries(cspbaselibimpl cspimpl baselibimpl) + +target_link_libraries(cspbaselibimpl cspimpl baselibimpl csp_arrow_adapter ${CSP_ARROW_LINK_LIBS}) # Include exprtk include directory for exprtk node target_include_directories(cspbaselibimpl PRIVATE ${EXPRTK_INCLUDE_DIRS}) diff --git a/cpp/csp/python/adapters/ArrowCppNodes.cpp b/cpp/csp/python/adapters/ArrowCppNodes.cpp new file mode 100644 index 000000000..1b4c68ca3 --- /dev/null +++ b/cpp/csp/python/adapters/ArrowCppNodes.cpp @@ -0,0 +1,356 @@ +// CppNode implementations for record_batches_to_struct and struct_to_record_batches. +// +// These nodes bridge the Python/CSP graph layer with the C++ Arrow converters. +// RecordBatches are transported across the Python/C++ boundary as PyCapsules +// using the Arrow C Data Interface. + +// Must include numpy first without NO_IMPORT_ARRAY to define PyArray_API in this TU +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace csp::adapters::arrow; + +namespace csp::cppnodes +{ + +// Reshape callback that uses numpy's PyArray_Newshape (avoids allocating a dims array per call) +static DialectGenericType numpyReshape( DialectGenericType flatData, const std::vector & dims ) +{ + // PyArray_Newshape accepts a PyArray_Dims struct directly, avoiding a numpy array allocation for dims. + // Stack-allocate the dims for typical 2-8 dimensional arrays. + npy_intp dimsBuf[8]; + npy_intp ndims = static_cast( dims.size() ); + npy_intp * dimsPtr; + + std::vector heapDims; + if( ndims <= 8 ) + { + for( npy_intp i = 0; i < ndims; ++i ) + dimsBuf[i] = static_cast( dims[i] ); + dimsPtr = dimsBuf; + } + else + { + heapDims.resize( ndims ); + for( npy_intp i = 0; i < ndims; ++i ) + heapDims[i] = static_cast( dims[i] ); + dimsPtr = heapDims.data(); + } + + PyArray_Dims newshape{ dimsPtr, static_cast( ndims ) }; + auto * flatPyArr = reinterpret_cast( csp::python::toPythonBorrowed( flatData ) ); + csp::python::PyObjectPtr reshaped{ csp::python::PyObjectPtr::own( + reinterpret_cast( PyArray_Newshape( flatPyArr, &newshape, NPY_CORDER ) ) ) }; + if( !reshaped.get() ) + CSP_THROW( csp::python::PythonPassthrough, "" ); + + return csp::python::fromPython( reshaped.get() ); +} + +// Shape callback: extract shape from a numpy NDArray +static std::vector numpyShape( DialectGenericType ndarray ) +{ + auto * pyArr = reinterpret_cast( csp::python::toPythonBorrowed( ndarray ) ); + int ndim = PyArray_NDIM( pyArr ); + npy_intp * shape = PyArray_SHAPE( pyArr ); + std::vector result( ndim ); + for( int i = 0; i < ndim; ++i ) + result[i] = shape[i]; + return result; +} + +// Convert a Python type object (passed as DialectGenericType) to an NPY type constant +// using the same pyTypeAsCspType + PartialSwitchCspType pattern as the parquet adapter. +static int npyTypeFromPyType( DialectGenericType pyTypeObj ) +{ + auto & cspType = csp::python::pyTypeAsCspType( csp::python::toPythonBorrowed( pyTypeObj ) ); + return csp::PartialSwitchCspType::invoke( + cspType.get(), + []( auto tag ) -> int + { + using CValueType = typename decltype( tag )::type; + return csp::python::NPY_TYPE::value; + } + ); +} + +// PyCapsule destructors for ArrowSchema/ArrowArray (local copies to avoid including ArrowInputAdapter.h) +static void releaseArrowSchemaCapsule( PyObject * capsule ) +{ + ArrowSchema * schema = reinterpret_cast( PyCapsule_GetPointer( capsule, "arrow_schema" ) ); + if( schema -> release != NULL ) + schema -> release( schema ); + free( schema ); +} + +static void releaseArrowArrayCapsule( PyObject * capsule ) +{ + ArrowArray * array = reinterpret_cast( PyCapsule_GetPointer( capsule, "arrow_array" ) ); + if( array -> release != NULL ) + array -> release( array ); + free( array ); +} + +DECLARE_CPPNODE( record_batches_to_struct ) +{ + SCALAR_INPUT( DialectGenericType, schema_ptr ); // PyCapsule of ArrowSchema + SCALAR_INPUT( StructMetaPtr, cls ); // target struct type + SCALAR_INPUT( DictionaryPtr, properties ); // field_map, numpy config + TS_INPUT( Generic, data ); // List[Tuple[capsule, capsule]] + TS_OUTPUT( Generic ); // List[StructPtr] + + STATE_VAR( std::unique_ptr, s_converter ); + STATE_VAR( std::shared_ptr<::arrow::Schema>, s_schema ); + + INIT_CPPNODE( record_batches_to_struct ) {} + + START() + { + // Import the Arrow schema from PyCapsule + PyObject * pySchemaCapsule = csp::python::toPythonBorrowed( schema_ptr.value() ); + if( !PyCapsule_IsValid( pySchemaCapsule, "arrow_schema" ) ) + CSP_THROW( csp::TypeError, "schema_ptr must be a PyCapsule with name 'arrow_schema'" ); + + ArrowSchema * c_schema = reinterpret_cast( PyCapsule_GetPointer( pySchemaCapsule, "arrow_schema" ) ); + auto schemaResult = ::arrow::ImportSchema( c_schema ); + if( !schemaResult.ok() ) + CSP_THROW( csp::ValueError, "Failed to import Arrow schema: " << schemaResult.status().ToString() ); + s_schema = std::move( schemaResult.ValueUnsafe() ); + + // Parse properties + auto & props = properties.value(); + DictionaryPtr fieldMap; + if( props -> exists( "field_map" ) ) + fieldMap = props -> get( "field_map" ); + + // Optional: mapping of arrow_col_name -> dims_col_name for NDArray columns + DictionaryPtr numpyDimensionNames; + if( props -> exists( "numpy_dimension_names" ) ) + numpyDimensionNames = props -> get( "numpy_dimension_names" ); + + auto structMeta = cls.value(); + + // Build FieldReader entries for numpy array fields + std::vector> customReaders; + + if( props -> exists( "numpy_fields" ) ) + { + auto numpyFields = props -> get( "numpy_fields" ); + + for( auto it = numpyFields -> begin(); it != numpyFields -> end(); ++it ) + { + auto colName = it.key(); + auto fieldName = it.value(); + auto structField = structMeta -> field( fieldName ); + CSP_TRUE_OR_THROW_RUNTIME( structField != nullptr, + "Struct field '" << fieldName << "' not found on struct type '" << structMeta -> name() << "'" ); + + if( numpyDimensionNames && numpyDimensionNames -> exists( colName ) ) + { + auto dimsColName = numpyDimensionNames -> get( colName ); + customReaders.push_back( + csp::python::createNumpyNDArrayReader( s_schema, colName, dimsColName, structField, numpyReshape ) ); + } + else + { + customReaders.push_back( + csp::python::createNumpyArrayReader( s_schema, colName, structField ) ); + } + } + } + + s_converter = std::make_unique( s_schema, structMeta, fieldMap, std::move( customReaders ) ); + } + + INVOKE() + { + if( !data.ticked() ) + return; + + // data is a list of (schema_capsule, array_capsule) tuples + PyObject * pyList = csp::python::toPythonBorrowed( data.lastValue() ); + if( !PyList_Check( pyList ) ) + CSP_THROW( csp::TypeError, "Expected list of PyCapsule tuples, got " << Py_TYPE( pyList ) -> tp_name ); + + Py_ssize_t numBatches = PyList_Size( pyList ); + + // First pass: import all record batches and compute total row count for a single reserve() + std::vector> importedBatches; + importedBatches.reserve( numBatches ); + int64_t totalRows = 0; + + for( Py_ssize_t i = 0; i < numBatches; ++i ) + { + PyObject * pyTuple = PyList_GetItem( pyList, i ); + if( !PyTuple_Check( pyTuple ) || PyTuple_Size( pyTuple ) != 2 ) + CSP_THROW( csp::TypeError, "Expected tuple of 2 PyCapsules for record batch " << i ); + + PyObject * pyArray = PyTuple_GetItem( pyTuple, 1 ); + if( !PyCapsule_IsValid( pyArray, "arrow_array" ) ) + CSP_THROW( csp::TypeError, "Invalid PyCapsule for record batch array at index " << i ); + + ArrowArray * c_array = reinterpret_cast( PyCapsule_GetPointer( pyArray, "arrow_array" ) ); + auto rbResult = ::arrow::ImportRecordBatch( c_array, s_schema ); + if( !rbResult.ok() ) + CSP_THROW( csp::ValueError, "Failed to import record batch at index " << i << ": " << rbResult.status().ToString() ); + + auto rb = std::move( rbResult.ValueUnsafe() ); + totalRows += rb -> num_rows(); + importedBatches.push_back( std::move( rb ) ); + } + + // Second pass: convert all batches into structs with a single pre-allocated vector + std::vector allStructs; + allStructs.reserve( totalRows ); + + for( auto & rb : importedBatches ) + { + auto structs = s_converter -> convert( *rb ); + allStructs.insert( allStructs.end(), + std::make_move_iterator( structs.begin() ), + std::make_move_iterator( structs.end() ) ); + } + + // Output as std::vector to match the List["T"] output buffer type + // where "T" resolves to a csp.Struct -> CspArrayType(STRUCT) -> std::vector + using ArrayT = std::vector; + ArrayT & out = unnamed_output().reserveSpace(); + out = std::move( allStructs ); + } +}; + +EXPORT_CPPNODE( record_batches_to_struct ); + +DECLARE_CPPNODE( struct_to_record_batches ) +{ + SCALAR_INPUT( StructMetaPtr, cls ); // source struct type + SCALAR_INPUT( DictionaryPtr, properties ); // field_map, numpy config + TS_INPUT( Generic, data ); // vector + TS_OUTPUT( Generic ); // DialectGenericType (Python list of capsule tuples) + + STATE_VAR( std::unique_ptr, s_converter ); + STATE_VAR( int64_t, s_maxBatchSize ); + + INIT_CPPNODE( struct_to_record_batches ) {} + + START() + { + auto & props = properties.value(); + auto structMeta = cls.value(); + + s_maxBatchSize = props -> get( "max_batch_size", 0 ); + + DictionaryPtr fieldMap; + if( props -> exists( "field_map" ) ) + fieldMap = props -> get( "field_map" ); + + DictionaryPtr numpyDimensionNames; + if( props -> exists( "numpy_dimension_names" ) ) + numpyDimensionNames = props -> get( "numpy_dimension_names" ); + + // Build FieldWriter entries for numpy array fields + std::vector> customWriters; + + if( props -> exists( "numpy_fields" ) ) + { + auto numpyFields = props -> get( "numpy_fields" ); + auto numpyElementTypes = props -> get( "numpy_element_types" ); + + for( auto it = numpyFields -> begin(); it != numpyFields -> end(); ++it ) + { + auto colName = it.key(); + auto fieldName = it.value(); + auto structField = structMeta -> field( fieldName ); + CSP_TRUE_OR_THROW_RUNTIME( structField != nullptr, + "Struct field '" << fieldName << "' not found on struct type '" << structMeta -> name() << "'" ); + + auto pyTypeObj = numpyElementTypes -> get( fieldName ); + int npyType = npyTypeFromPyType( pyTypeObj ); + + if( numpyDimensionNames && numpyDimensionNames -> exists( colName ) ) + { + auto dimsColName = numpyDimensionNames -> get( colName ); + customWriters.push_back( + csp::python::createNumpyNDArrayWriter( colName, dimsColName, structField, npyType, numpyShape ) ); + } + else + { + customWriters.push_back( + csp::python::createNumpyArrayWriter( colName, structField, npyType ) ); + } + } + } + + s_converter = std::make_unique( structMeta, fieldMap, std::move( customWriters ) ); + } + + INVOKE() + { + if( !data.ticked() ) + return; + + auto & structs = data.lastValue>(); + auto batches = s_converter -> convert( structs, s_maxBatchSize ); + + auto py_list = csp::python::PyObjectPtr::own( PyList_New( static_cast( batches.size() ) ) ); + + for( size_t idx = 0; idx < batches.size(); ++idx ) + { + ArrowSchema * rb_schema = reinterpret_cast( malloc( sizeof( ArrowSchema ) ) ); + ArrowArray * rb_array = reinterpret_cast( malloc( sizeof( ArrowArray ) ) ); + + ::arrow::Status st = ::arrow::ExportRecordBatch( *batches[idx], rb_array, rb_schema ); + if( !st.ok() ) + { + free( rb_schema ); + free( rb_array ); + CSP_THROW( csp::ValueError, "Failed to export RecordBatch at index " << idx << ": " << st.ToString() ); + } + + auto py_schema = csp::python::PyObjectPtr::own( + PyCapsule_New( rb_schema, "arrow_schema", releaseArrowSchemaCapsule ) ); + auto py_array = csp::python::PyObjectPtr::own( + PyCapsule_New( rb_array, "arrow_array", releaseArrowArrayCapsule ) ); + + PyObject * py_tuple = PyTuple_Pack( 2, py_schema.get(), py_array.get() ); + PyList_SET_ITEM( py_list.get(), static_cast( idx ), py_tuple ); + } + + unnamed_output().output( csp::python::fromPython( py_list.get() ) ); + } +}; + +EXPORT_CPPNODE( struct_to_record_batches ); + +} + +REGISTER_CPPNODE( csp::cppnodes, record_batches_to_struct ); +REGISTER_CPPNODE( csp::cppnodes, struct_to_record_batches ); + +// Initialize numpy array API for this translation unit +static bool _init_numpy = []() +{ + csp::python::InitHelper::instance().registerCallback( + []( PyObject * module ) -> bool + { + import_array1( false ); + return true; + } + ); + return true; +}(); diff --git a/cpp/csp/python/adapters/ArrowNumpyListReader.h b/cpp/csp/python/adapters/ArrowNumpyListReader.h new file mode 100644 index 000000000..c52b51374 --- /dev/null +++ b/cpp/csp/python/adapters/ArrowNumpyListReader.h @@ -0,0 +1,338 @@ +// FieldReader subclasses for reading Arrow list columns into numpy arrays. +// +// Provides createNumpyArrayReader (1D arrays) and createNumpyNDArrayReader +// (N-dimensional arrays with a separate dimensions column + reshape callback). +// Element types supported: float64, int64, bool, string. + +#ifndef _IN_CSP_PYTHON_ADAPTERS_ArrowNumpyListReader_H +#define _IN_CSP_PYTHON_ADAPTERS_ArrowNumpyListReader_H + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace csp::python +{ + +// Callback for reshaping a flat 1D array using dimensions read from another column. +using ReshapeCallback = std::function & dims )>; + +namespace numpy +{ + +// NaN for doubles in list elements; throw for other types on null +template +struct ListValueProvider +{ + template + static T getValue( const std::shared_ptr & arr, int64_t i ) + { + if( !arr -> IsValid( i ) ) + CSP_THROW( ValueError, "Null value in list element at index " << i ); + return arr -> GetView( i ); + } +}; + +template<> +struct ListValueProvider +{ + template + static double getValue( const std::shared_ptr & arr, int64_t i ) + { + if( !arr -> IsValid( i ) ) + return std::numeric_limits::quiet_NaN(); + return arr -> GetView( i ); + } +}; + +// Create a readValue lambda for native-typed list data (INT64, DOUBLE, BOOL) +template +std::function +makeNativeListReadValue() +{ + auto * dtype = PyArray_DescrFromType( NPY_TYPE::value ); + + return [dtype]( const ::arrow::Array & arr, int64_t row ) -> DialectGenericType + { + auto & listArr = static_cast( arr ); + auto values = std::static_pointer_cast( listArr.value_slice( row ) ); + + npy_intp size = values -> length(); + Py_INCREF( dtype ); + PyObject * pyArr = PyArray_SimpleNewFromDescr( 1, &size, dtype ); + PyObjectPtr arrOwner{ PyObjectPtr::own( pyArr ) }; + + auto * buf = reinterpret_cast( PyArray_DATA( reinterpret_cast( pyArr ) ) ); + + // Fast path: bulk memcpy for numeric types when there are no nulls. + // BooleanArray stores packed bits, so memcpy is not applicable for bool. + if constexpr( !std::is_same_v ) + { + if( values -> null_count() == 0 ) + { + std::memcpy( buf, values -> raw_values(), sizeof( CspT ) * size ); + return fromPython( pyArr ); + } + } + + for( int64_t i = 0; i < values -> length(); ++i ) + buf[i] = ListValueProvider::getValue( values, i ); + + return fromPython( pyArr ); + }; +} + +// Create a readValue lambda for string-typed list data +template +std::function +makeStringListReadValue() +{ + // Shared converter object: avoids recreating codecvt facet per row. + // The lambda is called sequentially so sharing a mutable converter is safe. + auto converter = std::make_shared, char32_t>>(); + + return [converter]( const ::arrow::Array & arr, int64_t row ) -> DialectGenericType + { + auto & listArr = static_cast( arr ); + auto values = std::static_pointer_cast( listArr.value_slice( row ) ); + + // First pass: compute max string length (in bytes) + size_t maxLen = 0; + for( int64_t i = 0; i < values -> length(); ++i ) + { + if( values -> IsValid( i ) ) + maxLen = std::max( maxLen, values -> GetView( i ).size() ); + } + + // Create numpy unicode array with dtype "U" + npy_intp size = values -> length(); + PyArray_Descr * typ; + PyObject * typeStringDescr = toPython( std::string( "U" ) + std::to_string( maxLen ) ); + PyArray_DescrConverter( typeStringDescr, &typ ); + Py_DECREF( typeStringDescr ); + + PyObject * pyArr = PyArray_SimpleNewFromDescr( 1, &size, typ ); + PyObjectPtr arrOwner{ PyObjectPtr::own( pyArr ) }; + + auto * arrayObject = reinterpret_cast( pyArr ); + auto elementSize = PyDataType_ELSIZE( PyArray_DESCR( arrayObject ) ); + + for( int64_t i = 0; i < values -> length(); ++i ) + { + auto view = values -> GetView( i ); + auto wideValue = converter -> from_bytes( view.data(), view.data() + view.size() ); + auto nElementsToCopy = std::min( int( elementSize / sizeof( char32_t ) ), int( wideValue.size() + 1 ) ); + std::copy_n( wideValue.c_str(), nElementsToCopy, + reinterpret_cast( PyArray_GETPTR1( arrayObject, i ) ) ); + } + + return fromPython( pyArr ); + }; +} + +// Dispatch on arrow list element type to create the appropriate readValue lambda +inline std::function +dispatchListReadValue( const std::shared_ptr<::arrow::ListType> & listType, const std::string & columnName ) +{ + auto valueTypeId = listType -> value_type() -> id(); + + switch( valueTypeId ) + { + case ::arrow::Type::INT64: + return makeNativeListReadValue(); + case ::arrow::Type::DOUBLE: + return makeNativeListReadValue(); + case ::arrow::Type::BOOL: + return makeNativeListReadValue(); + case ::arrow::Type::STRING: + return makeStringListReadValue<::arrow::StringArray>(); + case ::arrow::Type::BINARY: + return makeStringListReadValue<::arrow::BinaryArray>(); + case ::arrow::Type::LARGE_STRING: + return makeStringListReadValue<::arrow::LargeStringArray>(); + case ::arrow::Type::LARGE_BINARY: + return makeStringListReadValue<::arrow::LargeBinaryArray>(); + default: + CSP_THROW( TypeError, "Unsupported list element type " << listType -> value_type() -> ToString() + << " for list column '" << columnName << "'" ); + } +} + +// Read dimension values from a list column cell +inline std::vector readDimsFromListCell( const ::arrow::ListArray & listArr, int64_t row ) +{ + auto values = listArr.value_slice( row ); + std::vector dims; + dims.reserve( values -> length() ); + + switch( values -> type_id() ) + { + case ::arrow::Type::INT32: + { + auto typed = std::static_pointer_cast<::arrow::Int32Array>( values ); + for( int64_t i = 0; i < typed -> length(); ++i ) + dims.push_back( typed -> Value( i ) ); + break; + } + case ::arrow::Type::INT64: + { + auto typed = std::static_pointer_cast<::arrow::Int64Array>( values ); + for( int64_t i = 0; i < typed -> length(); ++i ) + dims.push_back( typed -> Value( i ) ); + break; + } + default: + CSP_THROW( TypeError, "Dimensions column has unsupported element type: " << values -> type() -> ToString() ); + } + return dims; +} + +// FieldReader for 1D numpy array columns (Arrow list -> numpy 1D array) +class NumpyArrayReader final : public csp::adapters::arrow::FieldReader +{ +public: + NumpyArrayReader( int colIdx, const StructFieldPtr & structField, + std::function readValue, + std::vector columnNames ) + : FieldReader( std::move( columnNames ), structField ), + m_colIdx( colIdx ), + m_readValue( std::move( readValue ) ) + { + } + + void bindBatch( const ::arrow::RecordBatch & batch ) override + { + bindColumn( batch.column( m_colIdx ).get() ); + } + +protected: + void doReadNext( int64_t row, Struct * s ) override + { + auto & listArr = static_cast( *m_column ); + if( listArr.IsValid( row ) ) + { + auto arrayValue = m_readValue( listArr, row ); + m_field -> setValue( s, std::move( arrayValue ) ); + } + } + +private: + int m_colIdx; + std::function m_readValue; +}; + +// FieldReader for NDArray columns (Arrow list + dims column -> numpy NDArray via reshape) +class NumpyNDArrayReader final : public csp::adapters::arrow::FieldReader +{ +public: + NumpyNDArrayReader( int colIdx, int dimsColIdx, const StructFieldPtr & structField, + std::function readValue, + ReshapeCallback reshapeCallback, + std::vector columnNames ) + : FieldReader( std::move( columnNames ), structField ), + m_colIdx( colIdx ), m_dimsColIdx( dimsColIdx ), + m_readValue( std::move( readValue ) ), m_reshapeCallback( std::move( reshapeCallback ) ), + m_dimsColumn( nullptr ) + { + } + + void bindBatch( const ::arrow::RecordBatch & batch ) override + { + bindColumn( batch.column( m_colIdx ).get() ); + m_dimsColumn = batch.column( m_dimsColIdx ).get(); + } + +protected: + void doReadNext( int64_t row, Struct * s ) override + { + auto & listArr = static_cast( *m_column ); + if( listArr.IsValid( row ) ) + { + auto arrayValue = m_readValue( listArr, row ); + + auto & dimsArr = static_cast( *m_dimsColumn ); + if( dimsArr.IsValid( row ) ) + { + auto dims = readDimsFromListCell( dimsArr, row ); + arrayValue = m_reshapeCallback( std::move( arrayValue ), dims ); + } + + m_field -> setValue( s, std::move( arrayValue ) ); + } + } + +private: + int m_colIdx; + int m_dimsColIdx; + std::function m_readValue; + ReshapeCallback m_reshapeCallback; + const ::arrow::Array * m_dimsColumn; +}; + +} // namespace numpy + +// Create a FieldReader for a 1D numpy array column +inline std::unique_ptr createNumpyArrayReader( + const std::shared_ptr<::arrow::Schema> & schema, + const std::string & columnName, + const StructFieldPtr & structField ) +{ + int colIdx = schema -> GetFieldIndex( columnName ); + CSP_TRUE_OR_THROW_RUNTIME( colIdx >= 0, + "List column '" << columnName << "' not found in arrow schema" ); + + auto arrowField = schema -> field( colIdx ); + auto listType = std::static_pointer_cast<::arrow::ListType>( arrowField -> type() ); + CSP_TRUE_OR_THROW_RUNTIME( listType != nullptr, + "Column '" << columnName << "' is not a list type" ); + + auto readValue = numpy::dispatchListReadValue( listType, columnName ); + + return std::make_unique( + colIdx, structField, std::move( readValue ), std::vector{ columnName } ); +} + +// Create a FieldReader for an NDArray column (data + dimensions + reshape) +inline std::unique_ptr createNumpyNDArrayReader( + const std::shared_ptr<::arrow::Schema> & schema, + const std::string & columnName, + const std::string & dimsColumnName, + const StructFieldPtr & structField, + ReshapeCallback reshapeCallback ) +{ + int colIdx = schema -> GetFieldIndex( columnName ); + CSP_TRUE_OR_THROW_RUNTIME( colIdx >= 0, + "List column '" << columnName << "' not found in arrow schema" ); + + int dimsColIdx = schema -> GetFieldIndex( dimsColumnName ); + CSP_TRUE_OR_THROW_RUNTIME( dimsColIdx >= 0, + "Dimensions column '" << dimsColumnName << "' not found in arrow schema" ); + + auto arrowField = schema -> field( colIdx ); + auto listType = std::static_pointer_cast<::arrow::ListType>( arrowField -> type() ); + CSP_TRUE_OR_THROW_RUNTIME( listType != nullptr, + "Column '" << columnName << "' is not a list type" ); + + CSP_TRUE_OR_THROW_RUNTIME( reshapeCallback, + "Dimensions column specified for '" << columnName << "' but no reshape callback provided" ); + + auto readValue = numpy::dispatchListReadValue( listType, columnName ); + + return std::make_unique( + colIdx, dimsColIdx, structField, std::move( readValue ), std::move( reshapeCallback ), + std::vector{ columnName, dimsColumnName } ); +} + +} // namespace csp::python + +#endif diff --git a/cpp/csp/python/adapters/ArrowNumpyListWriter.h b/cpp/csp/python/adapters/ArrowNumpyListWriter.h new file mode 100644 index 000000000..7298450ae --- /dev/null +++ b/cpp/csp/python/adapters/ArrowNumpyListWriter.h @@ -0,0 +1,292 @@ +// FieldWriter subclasses for writing numpy arrays into Arrow list columns. +// +// Provides createNumpyArrayWriter (1D arrays) and createNumpyNDArrayWriter +// (N-dimensional arrays with a separate dimensions column + shape callback). +// Mirrors ArrowNumpyListReader.h in the write direction. +// Element types supported: float64, int64, bool, string. + +#ifndef _IN_CSP_PYTHON_ADAPTERS_ArrowNumpyListWriter_H +#define _IN_CSP_PYTHON_ADAPTERS_ArrowNumpyListWriter_H + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +namespace csp::python +{ + +// Callback to extract shape from an NDArray: returns vector of dimension sizes. +using ShapeCallback = std::function( DialectGenericType ndarray )>; + +namespace numpy +{ + +// Helper macro for arrow status checks +#define ARROW_OK_OR_THROW_WRITER( expr, msg ) \ + do { auto __s = ( expr ); if( !__s.ok() ) CSP_THROW( RuntimeException, msg << ": " << __s.ToString() ); } while(0) + +// --- Native element writers (double, int64, bool) --- +// Use bulk AppendValues for a single resize + copy instead of per-element Append. + +template +void writeNativeElements( ArrowBuilderT * valueBuilder, PyArrayObject * pyArr, npy_intp len ) +{ + auto * data = reinterpret_cast( PyArray_DATA( pyArr ) ); + ARROW_OK_OR_THROW_WRITER( valueBuilder -> AppendValues( data, static_cast( len ) ), + "Failed to append list elements" ); +} + +// Bool specialization: numpy stores bools as uint8, BooleanBuilder::AppendValues accepts uint8 +template<> +inline void writeNativeElements( + ::arrow::BooleanBuilder * valueBuilder, PyArrayObject * pyArr, npy_intp len ) +{ + auto * data = reinterpret_cast( PyArray_DATA( pyArr ) ); + ARROW_OK_OR_THROW_WRITER( valueBuilder -> AppendValues( data, static_cast( len ) ), + "Failed to append bool list elements" ); +} + +// --- Native list writer --- + +template +class NativeListWriter final : public csp::adapters::arrow::FieldWriter +{ +public: + NativeListWriter( const std::string & columnName, const StructFieldPtr & structField, + std::shared_ptr<::arrow::DataType> elementType ) + : FieldWriter( { columnName }, { ::arrow::list( elementType ) }, structField ) + { + m_valueBuilder = std::make_shared(); + m_listBuilder = std::make_shared<::arrow::ListBuilder>( ::arrow::default_memory_pool(), m_valueBuilder ); + } + + void reserve( int64_t numRows ) override + { + ARROW_OK_OR_THROW_WRITER( m_listBuilder -> Reserve( numRows ), "Failed to reserve list builder" ); + } + + void writeNull() override + { + ARROW_OK_OR_THROW_WRITER( m_listBuilder -> AppendNull(), "Failed to append null list" ); + } + + std::vector> finish() override + { + std::shared_ptr<::arrow::Array> arr; + ARROW_OK_OR_THROW_WRITER( m_listBuilder -> Finish( &arr ), "Failed to finish list array" ); + return { arr }; + } + +protected: + void doWrite( const Struct * s ) override + { + ARROW_OK_OR_THROW_WRITER( m_listBuilder -> Append(), "Failed to start list" ); + auto & dgt = m_field -> value( s ); + auto * pyArr = reinterpret_cast( csp::python::toPythonBorrowed( dgt ) ); + npy_intp len = PyArray_SIZE( pyArr ); + + // writeNativeElements uses PyArray_DATA for bulk copy, which requires C-contiguous layout. + // Non-contiguous arrays (slices, transposes) must be copied to contiguous form first. + if( PyArray_IS_C_CONTIGUOUS( pyArr ) ) + { + writeNativeElements( m_valueBuilder.get(), pyArr, len ); + } + else + { + PyObjectPtr contiguousOwner{ PyObjectPtr::own( + reinterpret_cast( PyArray_GETCONTIGUOUS( pyArr ) ) ) }; + writeNativeElements( m_valueBuilder.get(), + reinterpret_cast( contiguousOwner.get() ), len ); + } + } + +private: + std::shared_ptr m_valueBuilder; + std::shared_ptr<::arrow::ListBuilder> m_listBuilder; +}; + +// --- String list writer --- + +class StringListWriter final : public csp::adapters::arrow::FieldWriter +{ +public: + StringListWriter( const std::string & columnName, const StructFieldPtr & structField ) + : FieldWriter( { columnName }, { ::arrow::list( ::arrow::utf8() ) }, structField ) + { + m_valueBuilder = std::make_shared<::arrow::StringBuilder>(); + m_listBuilder = std::make_shared<::arrow::ListBuilder>( ::arrow::default_memory_pool(), m_valueBuilder ); + } + + void reserve( int64_t numRows ) override + { + ARROW_OK_OR_THROW_WRITER( m_listBuilder -> Reserve( numRows ), "Failed to reserve string list builder" ); + } + + void writeNull() override + { + ARROW_OK_OR_THROW_WRITER( m_listBuilder -> AppendNull(), "Failed to append null list" ); + } + + std::vector> finish() override + { + std::shared_ptr<::arrow::Array> arr; + ARROW_OK_OR_THROW_WRITER( m_listBuilder -> Finish( &arr ), "Failed to finish string list array" ); + return { arr }; + } + +protected: + void doWrite( const Struct * s ) override + { + ARROW_OK_OR_THROW_WRITER( m_listBuilder -> Append(), "Failed to start list" ); + auto & dgt = m_field -> value( s ); + auto * pyArr = reinterpret_cast( csp::python::toPythonBorrowed( dgt ) ); + npy_intp len = PyArray_SIZE( pyArr ); + auto elementSize = PyDataType_ELSIZE( PyArray_DESCR( pyArr ) ); + auto charCount = elementSize / sizeof( char32_t ); + + for( npy_intp i = 0; i < len; ++i ) + { + auto * ptr = reinterpret_cast( PyArray_GETPTR1( pyArr, i ) ); + // Find actual string length (exclude trailing nulls) + size_t actualLen = 0; + for( size_t c = 0; c < charCount; ++c ) + { + if( ptr[c] == 0 ) break; + actualLen = c + 1; + } + m_utf8Buf = m_converter.to_bytes( ptr, ptr + actualLen ); + ARROW_OK_OR_THROW_WRITER( m_valueBuilder -> Append( m_utf8Buf ), "Failed to append string list element" ); + } + } + +private: + std::shared_ptr<::arrow::StringBuilder> m_valueBuilder; + std::shared_ptr<::arrow::ListBuilder> m_listBuilder; + std::wstring_convert, char32_t> m_converter; // reused across rows + std::string m_utf8Buf; // reused buffer +}; + +// --- Dispatch by npy_type --- + +inline std::unique_ptr dispatchListWriter( + const std::string & columnName, + const StructFieldPtr & structField, + int npyType ) +{ + switch( npyType ) + { + case NPY_DOUBLE: + return std::make_unique>( columnName, structField, ::arrow::float64() ); + case NPY_LONGLONG: + case NPY_LONG: + case NPY_INT: + return std::make_unique>( columnName, structField, ::arrow::int64() ); + case NPY_BOOL: + return std::make_unique>( columnName, structField, ::arrow::boolean() ); + case NPY_UNICODE: + return std::make_unique( columnName, structField ); + default: + CSP_THROW( TypeError, "Unsupported numpy type " << npyType << " for list column '" << columnName << "'" ); + } +} + +// --- NDArray writer (data column + dimensions column) --- + +class NumpyNDArrayWriter final : public csp::adapters::arrow::FieldWriter +{ +public: + NumpyNDArrayWriter( const std::string & columnName, const std::string & dimsColumnName, + const StructFieldPtr & structField, int npyType, + ShapeCallback shapeCallback ) + : FieldWriter( { columnName, dimsColumnName }, {}, structField ), + m_shapeCallback( std::move( shapeCallback ) ) + { + m_dataWriter = dispatchListWriter( columnName, structField, npyType ); + + m_dimsValueBuilder = std::make_shared<::arrow::Int64Builder>(); + m_dimsListBuilder = std::make_shared<::arrow::ListBuilder>( ::arrow::default_memory_pool(), m_dimsValueBuilder ); + + m_dataTypes = m_dataWriter -> dataTypes(); + m_dataTypes.push_back( ::arrow::list( ::arrow::int64() ) ); + } + + void reserve( int64_t numRows ) override + { + m_dataWriter -> reserve( numRows ); + ARROW_OK_OR_THROW_WRITER( m_dimsListBuilder -> Reserve( numRows ), "Failed to reserve dims list builder" ); + } + + void writeNull() override + { + m_dataWriter -> writeNull(); + ARROW_OK_OR_THROW_WRITER( m_dimsListBuilder -> AppendNull(), "Failed to append null dims" ); + } + + std::vector> finish() override + { + auto dataArrays = m_dataWriter -> finish(); + std::shared_ptr<::arrow::Array> dimsArr; + ARROW_OK_OR_THROW_WRITER( m_dimsListBuilder -> Finish( &dimsArr ), "Failed to finish dims array" ); + dataArrays.push_back( dimsArr ); + return dataArrays; + } + +protected: + void doWrite( const Struct * s ) override + { + // Write data: for C-contiguous NDArrays, the native writer handles flat data correctly + // We call writeNext on the inner data writer which will check isSet and delegate to its doWrite + m_dataWriter -> writeNext( s ); + + // Write shape/dims + auto & dgt = m_field -> value( s ); + auto shape = m_shapeCallback( dgt ); + + ARROW_OK_OR_THROW_WRITER( m_dimsListBuilder -> Append(), "Failed to start dims list" ); + ARROW_OK_OR_THROW_WRITER( + m_dimsValueBuilder -> AppendValues( shape.data(), static_cast( shape.size() ) ), + "Failed to append dim values" ); + } + +private: + ShapeCallback m_shapeCallback; + std::unique_ptr m_dataWriter; + std::shared_ptr<::arrow::Int64Builder> m_dimsValueBuilder; + std::shared_ptr<::arrow::ListBuilder> m_dimsListBuilder; +}; + +#undef ARROW_OK_OR_THROW_WRITER + +} // namespace numpy + +// Create a FieldWriter for a 1D numpy array field +inline std::unique_ptr createNumpyArrayWriter( + const std::string & columnName, + const StructFieldPtr & structField, + int npyType ) +{ + return numpy::dispatchListWriter( columnName, structField, npyType ); +} + +// Create a FieldWriter for an NDArray field (data column + dimensions column) +inline std::unique_ptr createNumpyNDArrayWriter( + const std::string & columnName, + const std::string & dimsColumnName, + const StructFieldPtr & structField, + int npyType, + ShapeCallback shapeCallback ) +{ + return std::make_unique( columnName, dimsColumnName, structField, npyType, std::move( shapeCallback ) ); +} + +} // namespace csp::python + +#endif diff --git a/cpp/csp/python/adapters/CMakeLists.txt b/cpp/csp/python/adapters/CMakeLists.txt index 62742a8b0..b8f6b35af 100644 --- a/cpp/csp/python/adapters/CMakeLists.txt +++ b/cpp/csp/python/adapters/CMakeLists.txt @@ -1,29 +1,6 @@ if(CSP_BUILD_ARROW_ADAPTER) - add_library(arrowadapterimpl SHARED PyArrowInputAdapter.cpp ArrowInputAdapter.h) - - if(WIN32) - if(CSP_USE_VCPKG) - set(ARROW_PACKAGES_TO_LINK Arrow::arrow_static) - target_compile_definitions(arrowadapterimpl PUBLIC ARROW_STATIC) - else() - # use dynamic variants - # Until we manage to get the fix for ws3_32.dll in arrow-16 into conda, manually fix the error here - get_target_property(LINK_LIBS Arrow::arrow_shared INTERFACE_LINK_LIBRARIES) - string(REPLACE "ws2_32.dll" "ws2_32" FIXED_LINK_LIBS "${LINK_LIBS}") - set_target_properties(Arrow::arrow_shared PROPERTIES INTERFACE_LINK_LIBRARIES "${FIXED_LINK_LIBS}") - set(ARROW_PACKAGES_TO_LINK arrow_shared) - endif() - else() - if(CSP_USE_VCPKG) - # use static variants - set(ARROW_PACKAGES_TO_LINK arrow_static) - else() - # use dynamic variants - set(ARROW_PACKAGES_TO_LINK arrow) - endif() - endif() - - target_link_libraries(arrowadapterimpl csp_core csp_engine cspimpl ${ARROW_PACKAGES_TO_LINK}) + add_library(arrowadapterimpl SHARED PyArrowInputAdapter.cpp ArrowCppNodes.cpp ArrowInputAdapter.h) + target_link_libraries(arrowadapterimpl csp_core csp_engine cspimpl csp_arrow_adapter ${CSP_ARROW_LINK_LIBS}) target_include_directories(arrowadapterimpl PUBLIC ${ARROW_INCLUDE_DIR}) install(TARGETS arrowadapterimpl RUNTIME DESTINATION ${CSP_RUNTIME_INSTALL_SUBDIR} ) endif() diff --git a/csp/adapters/arrow.py b/csp/adapters/arrow.py index e01dd7106..07b417494 100644 --- a/csp/adapters/arrow.py +++ b/csp/adapters/arrow.py @@ -1,4 +1,4 @@ -from typing import Iterable, List, Tuple +from typing import Dict, Iterable, List, Optional, Tuple, TypeVar import pyarrow as pa import pyarrow.parquet as pq @@ -6,17 +6,23 @@ import csp from csp.impl.types.tstype import ts +from csp.impl.types.typing_utils import CspTypingUtils from csp.impl.wiring import input_adapter_def +from csp.impl.wiring.node import _node_internal_use from csp.lib import _arrowadapterimpl __all__ = [ "CRecordBatchPullInputAdapter", "RecordBatchPullInputAdapter", + "record_batches_to_struct", + "struct_to_record_batches", "write_record_batches", ] _PYARROW_HAS_CONCAT_BATCHES = parse(pa.__version__) >= parse("19.0.0") +T = TypeVar("T") + CRecordBatchPullInputAdapter = input_adapter_def( "CRecordBatchPullInputAdapter", @@ -139,3 +145,174 @@ def write_record_batches( else: s_prev_batch += [batch] s_prev_batch_size += len(batch) + + +@_node_internal_use(cppimpl=_arrowadapterimpl.record_batches_to_struct) +def _record_batches_to_struct( + schema_ptr: object, + cls: "T", + properties: dict, + data: ts[object], +) -> ts[List["T"]]: + raise NotImplementedError("C++ implementation only") + return None + + +@csp.graph +def record_batches_to_struct( + data: ts[List[pa.RecordBatch]], + cls: "T", + field_map: Dict[str, str], + schema: pa.Schema, + numpy_dimensions_column_map: Optional[Dict[str, str]] = None, +) -> ts[List["T"]]: + """Convert ts[List[pa.RecordBatch]] into ts[List[T]] where T is a csp.Struct type. + + Args: + data: Timeseries of lists of Arrow RecordBatches + cls: Target csp.Struct type + field_map: Mapping of struct field name -> arrow column name. + schema: Arrow schema of the record batches (required). + numpy_dimensions_column_map: Optional mapping of arrow column name -> dimensions column name + for NumpyNDArray fields. If not provided for an NDArray field, defaults to + ``_csp_dimensions``. + + Returns: + Timeseries of lists of struct instances + """ + from csp.adapters.output_adapters.parquet import resolve_array_shape_column_name + + numpy_dimensions_column_map = numpy_dimensions_column_map or {} + + # Inspect struct metadata to separate scalar fields from numpy fields + meta_typed = cls.metadata(typed=True) + scalar_field_map = {} + numpy_fields = {} + numpy_dimension_names = {} + + for struct_field_name, arrow_col_name in field_map.items(): + field_typ = meta_typed[struct_field_name] + if CspTypingUtils.is_numpy_array_type(field_typ): + numpy_fields[arrow_col_name] = struct_field_name + + if CspTypingUtils.is_numpy_nd_array_type(field_typ): + dim_col_name = resolve_array_shape_column_name( + arrow_col_name, numpy_dimensions_column_map.get(arrow_col_name, None) + ) + numpy_dimension_names[arrow_col_name] = dim_col_name + else: + scalar_field_map[arrow_col_name] = struct_field_name + + # Build properties dict for the C++ node + properties = { + "field_map": scalar_field_map, + "numpy_fields": numpy_fields, + "numpy_dimension_names": numpy_dimension_names, + } + + # Export schema to PyCapsule + schema_capsule = schema.__arrow_c_schema__() + + # Convert RecordBatches to PyCapsule tuples for C++ consumption + c_data = csp.apply( + data, + lambda batches: [rb.__arrow_c_array__() for rb in batches], + object, + ) + + return _record_batches_to_struct(schema_capsule, cls, properties, c_data) + + +@_node_internal_use(cppimpl=_arrowadapterimpl.struct_to_record_batches) +def _struct_to_record_batches( + cls: "T", + properties: dict, + data: ts[List["T"]], +) -> ts[object]: + raise NotImplementedError("C++ implementation only") + return None + + +@csp.graph +def struct_to_record_batches( + data: ts[List["T"]], + cls: "T", + field_map: Optional[Dict[str, str]] = None, + numpy_dimensions_column_map: Optional[Dict[str, str]] = None, + max_batch_size: int = 65536, +) -> ts[List[pa.RecordBatch]]: + """Convert ts[List[T]] into ts[List[pa.RecordBatch]] where T is a csp.Struct type. + + Args: + data: Timeseries of lists of struct instances + cls: Source csp.Struct type + field_map: Mapping of struct field name -> arrow column name. + If None, all non-numpy fields are included with identity naming. + numpy_dimensions_column_map: Optional mapping of arrow column name -> dimensions column name + for NumpyNDArray fields. If not provided for an NDArray field, defaults to + ``_csp_dimensions``. + max_batch_size: Maximum number of rows per output RecordBatch. + Defaults to 65536. Set to 0 to disable chunking. + + Returns: + Timeseries of lists of RecordBatch + """ + from csp.adapters.output_adapters.parquet import resolve_array_shape_column_name + + numpy_dimensions_column_map = numpy_dimensions_column_map or {} + + # Inspect struct metadata to separate scalar fields from numpy fields + meta_typed = cls.metadata(typed=True) + scalar_field_map = {} + numpy_fields = {} # {col_name: field_name} + numpy_element_types = {} # {field_name: python_type} + numpy_dimension_names = {} # {col_name: dims_col_name} + + if field_map is not None: + for struct_field_name, arrow_col_name in field_map.items(): + field_typ = meta_typed[struct_field_name] + if CspTypingUtils.is_numpy_array_type(field_typ): + numpy_fields[arrow_col_name] = struct_field_name + elem_type = field_typ.__args__[0] + numpy_element_types[struct_field_name] = elem_type + + if CspTypingUtils.is_numpy_nd_array_type(field_typ): + dim_col_name = resolve_array_shape_column_name( + arrow_col_name, numpy_dimensions_column_map.get(arrow_col_name, None) + ) + numpy_dimension_names[arrow_col_name] = dim_col_name + else: + scalar_field_map[struct_field_name] = arrow_col_name + else: + # No field_map: auto-detect all fields + for struct_field_name, field_typ in meta_typed.items(): + if CspTypingUtils.is_numpy_array_type(field_typ): + arrow_col_name = struct_field_name + numpy_fields[arrow_col_name] = struct_field_name + elem_type = field_typ.__args__[0] + numpy_element_types[struct_field_name] = elem_type + + if CspTypingUtils.is_numpy_nd_array_type(field_typ): + dim_col_name = resolve_array_shape_column_name( + arrow_col_name, numpy_dimensions_column_map.get(arrow_col_name, None) + ) + numpy_dimension_names[arrow_col_name] = dim_col_name + + # Build properties dict for the C++ node + properties = { + "numpy_fields": numpy_fields, + "numpy_element_types": numpy_element_types, + "numpy_dimension_names": numpy_dimension_names, + "max_batch_size": max_batch_size, + } + if scalar_field_map or field_map is not None: + properties["field_map"] = scalar_field_map + + # Call C++ node, then convert capsule tuples -> RecordBatch + c_data = _struct_to_record_batches(cls, properties, data) + + return csp.apply( + c_data, + lambda c_tups: [pa.record_batch(_RecordBatchCSource(c_tup)) for c_tup in c_tups], + List[pa.RecordBatch], + ) diff --git a/csp/tests/adapters/test_arrow_record_batches.py b/csp/tests/adapters/test_arrow_record_batches.py new file mode 100644 index 000000000..63e67ab38 --- /dev/null +++ b/csp/tests/adapters/test_arrow_record_batches.py @@ -0,0 +1,2296 @@ +"""Tests for record_batches_to_struct and struct_to_record_batches. + +Covers all scalar types supported by the parquet adapter (bool, int, float, str, +datetime, date, time, timedelta, enum, bytes, nested struct), numpy 1D arrays +(float, int, str, bool), NDArrays, field mapping, null handling, multiple ticks, +round-trips, and error cases. +""" + +from datetime import date, datetime, time, timedelta + +import numpy as np +import pyarrow as pa +import pytest + +import csp +from csp.adapters.arrow import record_batches_to_struct, struct_to_record_batches +from csp.typing import Numpy1DArray, NumpyNDArray + +_STARTTIME = datetime(2020, 1, 1, 9, 0, 0) + + +# ===================================================================== +# Struct definitions +# ===================================================================== + + +class ScalarStruct(csp.Struct): + i64: int + f64: float + s: str + b: bool + + +class NumericOnlyStruct(csp.Struct): + x: int + y: float + + +class DateTimeStruct(csp.Struct): + dt: datetime + td: timedelta + d: date + t: time + + +class MyEnum(csp.Enum): + A = 1 + B = 2 + C = 3 + + +class EnumStruct(csp.Struct): + label: str + color: MyEnum + + +class BytesStruct(csp.Struct): + data: bytes + + +class InnerStruct(csp.Struct): + x: int + y: float + + +class NestedStruct(csp.Struct): + id: int + inner: InnerStruct + + +class AllTypesStruct(csp.Struct): + b: bool + i: int + d: float + dt: datetime + dte: date + t: time + td: timedelta + s: str + e: MyEnum + + +class NumpyStruct(csp.Struct): + id: int + values: Numpy1DArray[float] + + +class NumpyIntStruct(csp.Struct): + id: int + values: Numpy1DArray[int] + + +class NumpyStringStruct(csp.Struct): + id: int + names: Numpy1DArray[str] + + +class NumpyBoolStruct(csp.Struct): + id: int + flags: Numpy1DArray[bool] + + +class MixedStruct(csp.Struct): + label: str + scores: Numpy1DArray[float] + + +class NDArrayStruct(csp.Struct): + id: int + matrix: NumpyNDArray[float] + + +class NDArrayIntStruct(csp.Struct): + id: int + matrix: NumpyNDArray[int] + + +class FullMixedStruct(csp.Struct): + """Struct with scalar, numpy 1D, and NDArray fields.""" + + label: str + scores: Numpy1DArray[float] + matrix: NumpyNDArray[float] + + +# ===================================================================== +# Helpers — reader direction (batch → struct) +# ===================================================================== + + +def _run_to_struct(batches, cls, field_map, schema, numpy_dimensions_column_map=None): + """Run a graph that converts record batches to structs and returns the results.""" + + @csp.graph + def G( + batches_: object, + cls_: type, + field_map_: dict, + schema_: object, + numpy_dims_: object, + ): + data = csp.const([batches_]) + structs = record_batches_to_struct(data, cls_, field_map_, schema_, numpy_dims_) + csp.add_graph_output("structs", structs) + + results = csp.run( + G, + batches, + cls, + field_map, + schema, + numpy_dimensions_column_map, + starttime=_STARTTIME, + endtime=_STARTTIME + timedelta(seconds=1), + ) + assert len(results["structs"]) == 1 + return results["structs"][0][1] + + +def _run_multi_tick_read(tick_batches, cls, field_map, schema, numpy_dimensions_column_map=None): + """Run a graph that ticks multiple lists of record batches and returns all results.""" + + @csp.graph + def G( + ticks_: object, + cls_: type, + field_map_: dict, + schema_: object, + numpy_dims_: object, + ): + data = csp.unroll(csp.const(ticks_)) + structs = record_batches_to_struct(data, cls_, field_map_, schema_, numpy_dims_) + csp.add_graph_output("structs", structs) + + results = csp.run( + G, + tick_batches, + cls, + field_map, + schema, + numpy_dimensions_column_map, + starttime=_STARTTIME, + endtime=_STARTTIME + timedelta(seconds=len(tick_batches)), + ) + return [ts_val[1] for ts_val in results["structs"]] + + +# ===================================================================== +# Helpers — writer direction (struct → batch) +# ===================================================================== + + +def _run_to_batches(structs, cls, field_map=None, numpy_dimensions_column_map=None): + """Run a graph that converts structs to record batches and returns the results.""" + + @csp.graph + def G( + structs_: object, + cls_: type, + field_map_: object, + numpy_dims_: object, + ): + data = csp.const(structs_) + batches = struct_to_record_batches(data, cls_, field_map_, numpy_dims_) + csp.add_graph_output("batches", batches) + + results = csp.run( + G, + structs, + cls, + field_map, + numpy_dimensions_column_map, + starttime=_STARTTIME, + endtime=_STARTTIME + timedelta(seconds=1), + ) + assert len(results["batches"]) == 1 + return results["batches"][0][1] + + +def _run_multi_tick_write(tick_structs, cls, field_map=None, numpy_dimensions_column_map=None): + """Run a graph that ticks multiple lists of structs and returns all results.""" + + @csp.graph + def G( + ticks_: object, + cls_: type, + field_map_: object, + numpy_dims_: object, + ): + data = csp.unroll(csp.const(ticks_)) + batches = struct_to_record_batches(data, cls_, field_map_, numpy_dims_) + csp.add_graph_output("batches", batches) + + results = csp.run( + G, + tick_structs, + cls, + field_map, + numpy_dimensions_column_map, + starttime=_STARTTIME, + endtime=_STARTTIME + timedelta(seconds=len(tick_structs)), + ) + return [ts_val[1] for ts_val in results["batches"]] + + +# ===================================================================== +# Helpers — round-trip (struct → batch → struct, and batch → struct → batch) +# ===================================================================== + + +def _run_round_trip(structs, cls, field_map, schema): + """struct → batch → struct round-trip; returns result structs.""" + + @csp.graph + def G(s_: object, cls_: type, fm_: object, schema_: object): + data = csp.const(s_) + batches = struct_to_record_batches(data, cls_, fm_) + result = record_batches_to_struct(batches, cls_, fm_, schema_) + csp.add_graph_output("result", result) + + results = csp.run( + G, structs, cls, field_map, schema, starttime=_STARTTIME, endtime=_STARTTIME + timedelta(seconds=1) + ) + return results["result"][0][1] + + +def _run_reverse_round_trip(batch, cls, field_map): + """batch → struct → batch reverse round-trip; returns result batches.""" + schema = batch.schema + + @csp.graph + def G(b_: object, cls_: type, fm_: dict, schema_: object): + data = csp.const([b_]) + structs = record_batches_to_struct(data, cls_, fm_, schema_) + batches = struct_to_record_batches(structs, cls_, fm_) + csp.add_graph_output("result", batches) + + results = csp.run(G, batch, cls, field_map, schema, starttime=_STARTTIME, endtime=_STARTTIME + timedelta(seconds=1)) + return results["result"][0][1] + + +# ===================================================================== +# Tests: reading scalar fields (batch → struct) +# ===================================================================== + + +class TestReadScalarFields: + def test_basic_scalar_types(self): + batch = pa.RecordBatch.from_pydict( + { + "i64": [1, 2, 3], + "f64": [1.1, 2.2, 3.3], + "s": ["a", "b", "c"], + "b": [True, False, True], + } + ) + field_map = {"i64": "i64", "f64": "f64", "s": "s", "b": "b"} + structs = _run_to_struct(batch, ScalarStruct, field_map, batch.schema) + + assert len(structs) == 3 + assert structs[0].i64 == 1 + assert structs[1].i64 == 2 + assert structs[2].i64 == 3 + assert structs[0].f64 == pytest.approx(1.1) + assert structs[1].f64 == pytest.approx(2.2) + assert structs[0].s == "a" + assert structs[1].s == "b" + assert structs[0].b is True + assert structs[1].b is False + + def test_field_mapping(self): + batch = pa.RecordBatch.from_pydict( + {"col_x": [10, 20], "col_y": [1.5, 2.5]}, + schema=pa.schema([("col_x", pa.int64()), ("col_y", pa.float64())]), + ) + field_map = {"x": "col_x", "y": "col_y"} + structs = _run_to_struct(batch, NumericOnlyStruct, field_map, batch.schema) + + assert len(structs) == 2 + assert structs[0].x == 10 + assert structs[0].y == pytest.approx(1.5) + assert structs[1].x == 20 + assert structs[1].y == pytest.approx(2.5) + + def test_single_row(self): + batch = pa.RecordBatch.from_pydict({"x": [42], "y": [3.14]}) + field_map = {"x": "x", "y": "y"} + structs = _run_to_struct(batch, NumericOnlyStruct, field_map, batch.schema) + + assert len(structs) == 1 + assert structs[0].x == 42 + assert structs[0].y == pytest.approx(3.14) + + def test_many_rows(self): + n = 1000 + batch = pa.RecordBatch.from_pydict({"x": list(range(n)), "y": [float(i) / 10.0 for i in range(n)]}) + field_map = {"x": "x", "y": "y"} + structs = _run_to_struct(batch, NumericOnlyStruct, field_map, batch.schema) + + assert len(structs) == n + for i in range(n): + assert structs[i].x == i + assert structs[i].y == pytest.approx(float(i) / 10.0) + + def test_multiple_batches_single_tick(self): + """Multiple record batches in a single tick should all be converted.""" + batch1 = pa.RecordBatch.from_pydict({"x": [1, 2], "y": [0.1, 0.2]}) + batch2 = pa.RecordBatch.from_pydict({"x": [3, 4], "y": [0.3, 0.4]}) + schema = batch1.schema + + @csp.graph + def G(): + data = csp.const([batch1, batch2]) + field_map = {"x": "x", "y": "y"} + structs = record_batches_to_struct(data, NumericOnlyStruct, field_map, schema) + csp.add_graph_output("structs", structs) + + results = csp.run(G, starttime=_STARTTIME, endtime=_STARTTIME + timedelta(seconds=1)) + structs = results["structs"][0][1] + + assert len(structs) == 4 + assert [s.x for s in structs] == [1, 2, 3, 4] + + def test_multiple_ticks(self): + """Multiple ticks each with their own batch.""" + batch1 = pa.RecordBatch.from_pydict({"x": [10], "y": [1.0]}) + batch2 = pa.RecordBatch.from_pydict({"x": [20], "y": [2.0]}) + schema = batch1.schema + + tick_batches = [[batch1], [batch2]] + all_results = _run_multi_tick_read(tick_batches, NumericOnlyStruct, {"x": "x", "y": "y"}, schema) + + assert len(all_results) == 2 + assert all_results[0][0].x == 10 + assert all_results[1][0].x == 20 + + def test_datetime_types(self): + """datetime, timedelta, date, time fields.""" + # Use a known UTC nanosecond value to avoid timezone ambiguity + # 2024-03-15T12:00:00 UTC = 1710504000 seconds since epoch + dt_val = datetime(2024, 3, 15, 12, 0, 0) + td_val = timedelta(seconds=3600) + d_val = date(2024, 6, 15) + t_val = time(14, 30, 0) + + # Construct nanosecond values directly (UTC epoch-based) + dt_ns = 1710504000 * 10**9 # 2024-03-15T12:00:00 UTC + td_ns = int(td_val.total_seconds() * 1e9) + d_days = (d_val - date(1970, 1, 1)).days + t_ns = (t_val.hour * 3600 + t_val.minute * 60 + t_val.second) * 10**9 + + batch = pa.RecordBatch.from_arrays( + [ + pa.array([dt_ns], type=pa.timestamp("ns", tz="UTC")), + pa.array([td_ns], type=pa.duration("ns")), + pa.array([d_days], type=pa.date32()), + pa.array([t_ns], type=pa.time64("ns")), + ], + schema=pa.schema( + [ + ("dt", pa.timestamp("ns", tz="UTC")), + ("td", pa.duration("ns")), + ("d", pa.date32()), + ("t", pa.time64("ns")), + ] + ), + ) + field_map = {"dt": "dt", "td": "td", "d": "d", "t": "t"} + structs = _run_to_struct(batch, DateTimeStruct, field_map, batch.schema) + + assert len(structs) == 1 + assert structs[0].dt == dt_val + assert structs[0].td == td_val + assert structs[0].d == d_val + assert structs[0].t == t_val + + def test_enum_from_string(self): + """Enum fields stored as strings.""" + batch = pa.RecordBatch.from_pydict( + {"label": ["x", "y"], "color": ["A", "B"]}, + ) + field_map = {"label": "label", "color": "color"} + structs = _run_to_struct(batch, EnumStruct, field_map, batch.schema) + + assert len(structs) == 2 + assert structs[0].label == "x" + assert structs[0].color == MyEnum.A + assert structs[1].color == MyEnum.B + + def test_bytes_read(self): + """Binary/bytes field.""" + val = b"my\x00value" + batch = pa.RecordBatch.from_arrays( + [pa.array([val], type=pa.binary())], + schema=pa.schema([("data", pa.binary())]), + ) + field_map = {"data": "data"} + structs = _run_to_struct(batch, BytesStruct, field_map, batch.schema) + + assert len(structs) == 1 + assert structs[0].data == val + + def test_nested_struct_read(self): + """Nested struct field.""" + inner_type = pa.struct([("x", pa.int64()), ("y", pa.float64())]) + inner_arr = pa.StructArray.from_arrays( + [pa.array([42]), pa.array([2.5])], + fields=[pa.field("x", pa.int64()), pa.field("y", pa.float64())], + ) + batch = pa.RecordBatch.from_arrays( + [pa.array([1]), inner_arr], + schema=pa.schema([("id", pa.int64()), ("inner", inner_type)]), + ) + field_map = {"id": "id", "inner": "inner"} + structs = _run_to_struct(batch, NestedStruct, field_map, batch.schema) + + assert len(structs) == 1 + assert structs[0].id == 1 + assert structs[0].inner.x == 42 + assert structs[0].inner.y == pytest.approx(2.5) + + +# ===================================================================== +# Tests: reading numpy 1D array fields +# ===================================================================== + + +class TestReadNumpy1DFields: + def test_float_array(self): + batch = pa.RecordBatch.from_pydict( + {"id": [1, 2], "values": [[1.0, 2.0, 3.0], [4.0, 5.0]]}, + schema=pa.schema([("id", pa.int64()), ("values", pa.list_(pa.float64()))]), + ) + field_map = {"id": "id", "values": "values"} + structs = _run_to_struct(batch, NumpyStruct, field_map, batch.schema) + + assert len(structs) == 2 + assert structs[0].id == 1 + np.testing.assert_array_almost_equal(structs[0].values, [1.0, 2.0, 3.0]) + assert structs[1].id == 2 + np.testing.assert_array_almost_equal(structs[1].values, [4.0, 5.0]) + + def test_int_array(self): + batch = pa.RecordBatch.from_pydict( + {"id": [1], "values": [[10, 20, 30]]}, + schema=pa.schema([("id", pa.int64()), ("values", pa.list_(pa.int64()))]), + ) + field_map = {"id": "id", "values": "values"} + structs = _run_to_struct(batch, NumpyIntStruct, field_map, batch.schema) + + assert len(structs) == 1 + np.testing.assert_array_equal(structs[0].values, [10, 20, 30]) + + def test_string_array(self): + batch = pa.RecordBatch.from_pydict( + {"id": [1], "names": [["alice", "bob", "carol"]]}, + schema=pa.schema([("id", pa.int64()), ("names", pa.list_(pa.utf8()))]), + ) + field_map = {"id": "id", "names": "names"} + structs = _run_to_struct(batch, NumpyStringStruct, field_map, batch.schema) + + assert len(structs) == 1 + np.testing.assert_array_equal(structs[0].names, ["alice", "bob", "carol"]) + + def test_bool_array(self): + batch = pa.RecordBatch.from_pydict( + {"id": [1], "flags": [[True, False, True]]}, + schema=pa.schema([("id", pa.int64()), ("flags", pa.list_(pa.bool_()))]), + ) + field_map = {"id": "id", "flags": "flags"} + structs = _run_to_struct(batch, NumpyBoolStruct, field_map, batch.schema) + + assert len(structs) == 1 + np.testing.assert_array_equal(structs[0].flags, [True, False, True]) + + def test_empty_list(self): + batch = pa.RecordBatch.from_pydict( + {"id": [1], "values": [[]]}, + schema=pa.schema([("id", pa.int64()), ("values", pa.list_(pa.float64()))]), + ) + field_map = {"id": "id", "values": "values"} + structs = _run_to_struct(batch, NumpyStruct, field_map, batch.schema) + + assert len(structs) == 1 + assert structs[0].id == 1 + assert len(structs[0].values) == 0 + + def test_null_list_cell(self): + """A null list cell should leave the struct field unset.""" + arr_id = pa.array([1, 2]) + arr_values = pa.array([[1.0, 2.0], None], type=pa.list_(pa.float64())) + batch = pa.RecordBatch.from_arrays( + [arr_id, arr_values], + schema=pa.schema([("id", pa.int64()), ("values", pa.list_(pa.float64()))]), + ) + field_map = {"id": "id", "values": "values"} + structs = _run_to_struct(batch, NumpyStruct, field_map, batch.schema) + + assert len(structs) == 2 + assert structs[0].id == 1 + np.testing.assert_array_almost_equal(structs[0].values, [1.0, 2.0]) + assert structs[1].id == 2 + assert not hasattr(structs[1], "values") + + def test_mixed_scalar_and_numpy(self): + batch = pa.RecordBatch.from_pydict( + {"label": ["a", "b"], "scores": [[0.1, 0.2], [0.3, 0.4, 0.5]]}, + schema=pa.schema([("label", pa.utf8()), ("scores", pa.list_(pa.float64()))]), + ) + field_map = {"label": "label", "scores": "scores"} + structs = _run_to_struct(batch, MixedStruct, field_map, batch.schema) + + assert len(structs) == 2 + assert structs[0].label == "a" + np.testing.assert_array_almost_equal(structs[0].scores, [0.1, 0.2]) + assert structs[1].label == "b" + np.testing.assert_array_almost_equal(structs[1].scores, [0.3, 0.4, 0.5]) + + def test_nan_in_float_list(self): + """Null values in float lists should become NaN.""" + arr_values = pa.array([[1.0, None, 3.0]], type=pa.list_(pa.float64())) + arr_id = pa.array([1]) + batch = pa.RecordBatch.from_arrays( + [arr_id, arr_values], + schema=pa.schema([("id", pa.int64()), ("values", pa.list_(pa.float64()))]), + ) + field_map = {"id": "id", "values": "values"} + structs = _run_to_struct(batch, NumpyStruct, field_map, batch.schema) + + assert len(structs) == 1 + assert structs[0].values[0] == pytest.approx(1.0) + assert np.isnan(structs[0].values[1]) + assert structs[0].values[2] == pytest.approx(3.0) + + +# ===================================================================== +# Tests: reading NDArray fields +# ===================================================================== + + +class TestReadNumpyNDArrayFields: + def test_2d_reshape(self): + batch = pa.RecordBatch.from_pydict( + { + "id": [1], + "matrix": [[1.0, 2.0, 3.0, 4.0, 5.0, 6.0]], + "matrix_csp_dimensions": [[2, 3]], + }, + schema=pa.schema( + [ + ("id", pa.int64()), + ("matrix", pa.list_(pa.float64())), + ("matrix_csp_dimensions", pa.list_(pa.int64())), + ] + ), + ) + field_map = {"id": "id", "matrix": "matrix"} + structs = _run_to_struct(batch, NDArrayStruct, field_map, batch.schema) + + assert len(structs) == 1 + assert structs[0].id == 1 + expected = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) + np.testing.assert_array_almost_equal(structs[0].matrix, expected) + assert structs[0].matrix.shape == (2, 3) + + def test_3d_reshape(self): + flat_data = list(range(24)) + batch = pa.RecordBatch.from_pydict( + { + "id": [1], + "matrix": [[float(x) for x in flat_data]], + "matrix_csp_dimensions": [[2, 3, 4]], + }, + schema=pa.schema( + [ + ("id", pa.int64()), + ("matrix", pa.list_(pa.float64())), + ("matrix_csp_dimensions", pa.list_(pa.int64())), + ] + ), + ) + field_map = {"id": "id", "matrix": "matrix"} + structs = _run_to_struct(batch, NDArrayStruct, field_map, batch.schema) + + assert len(structs) == 1 + expected = np.arange(24, dtype=float).reshape(2, 3, 4) + np.testing.assert_array_almost_equal(structs[0].matrix, expected) + assert structs[0].matrix.shape == (2, 3, 4) + + def test_custom_dims_column_name(self): + batch = pa.RecordBatch.from_pydict( + { + "id": [1], + "matrix": [[1.0, 2.0, 3.0, 4.0]], + "my_dims": [[2, 2]], + }, + schema=pa.schema( + [ + ("id", pa.int64()), + ("matrix", pa.list_(pa.float64())), + ("my_dims", pa.list_(pa.int64())), + ] + ), + ) + field_map = {"id": "id", "matrix": "matrix"} + numpy_dims = {"matrix": "my_dims"} + structs = _run_to_struct(batch, NDArrayStruct, field_map, batch.schema, numpy_dims) + + assert len(structs) == 1 + expected = np.array([[1.0, 2.0], [3.0, 4.0]]) + np.testing.assert_array_almost_equal(structs[0].matrix, expected) + + def test_multiple_rows_with_different_shapes(self): + batch = pa.RecordBatch.from_pydict( + { + "id": [1, 2], + "matrix": [[1.0, 2.0, 3.0, 4.0, 5.0, 6.0], [10.0, 20.0, 30.0, 40.0]], + "matrix_csp_dimensions": [[2, 3], [2, 2]], + }, + schema=pa.schema( + [ + ("id", pa.int64()), + ("matrix", pa.list_(pa.float64())), + ("matrix_csp_dimensions", pa.list_(pa.int64())), + ] + ), + ) + field_map = {"id": "id", "matrix": "matrix"} + structs = _run_to_struct(batch, NDArrayStruct, field_map, batch.schema) + + assert len(structs) == 2 + np.testing.assert_array_almost_equal(structs[0].matrix, np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])) + assert structs[0].matrix.shape == (2, 3) + np.testing.assert_array_almost_equal(structs[1].matrix, np.array([[10.0, 20.0], [30.0, 40.0]])) + assert structs[1].matrix.shape == (2, 2) + + def test_dims_with_int32(self): + """Dimensions column with int32 type should also work.""" + batch = pa.RecordBatch.from_pydict( + { + "id": [1], + "matrix": [[1.0, 2.0, 3.0, 4.0]], + "matrix_csp_dimensions": pa.array([[2, 2]], type=pa.list_(pa.int32())), + }, + schema=pa.schema( + [ + ("id", pa.int64()), + ("matrix", pa.list_(pa.float64())), + ("matrix_csp_dimensions", pa.list_(pa.int32())), + ] + ), + ) + field_map = {"id": "id", "matrix": "matrix"} + structs = _run_to_struct(batch, NDArrayStruct, field_map, batch.schema) + + assert len(structs) == 1 + assert structs[0].matrix.shape == (2, 2) + + +# ===================================================================== +# Tests: writing scalar fields (struct → batch) +# ===================================================================== + + +class TestWriteScalarFields: + def test_basic_scalar_types(self): + structs = [ + ScalarStruct(i64=1, f64=1.1, s="a", b=True), + ScalarStruct(i64=2, f64=2.2, s="b", b=False), + ScalarStruct(i64=3, f64=3.3, s="c", b=True), + ] + field_map = {"i64": "i64", "f64": "f64", "s": "s", "b": "b"} + batches = _run_to_batches(structs, ScalarStruct, field_map) + + assert len(batches) == 1 + batch = batches[0] + assert batch.num_rows == 3 + assert batch.column("i64").to_pylist() == [1, 2, 3] + assert batch.column("f64").to_pylist() == pytest.approx([1.1, 2.2, 3.3]) + assert batch.column("s").to_pylist() == ["a", "b", "c"] + assert batch.column("b").to_pylist() == [True, False, True] + + def test_field_mapping(self): + structs = [ + NumericOnlyStruct(x=10, y=1.5), + NumericOnlyStruct(x=20, y=2.5), + ] + field_map = {"x": "col_x", "y": "col_y"} + batches = _run_to_batches(structs, NumericOnlyStruct, field_map) + + batch = batches[0] + assert batch.num_rows == 2 + assert batch.column("col_x").to_pylist() == [10, 20] + assert batch.column("col_y").to_pylist() == pytest.approx([1.5, 2.5]) + + def test_single_row(self): + structs = [NumericOnlyStruct(x=42, y=3.14)] + field_map = {"x": "x", "y": "y"} + batches = _run_to_batches(structs, NumericOnlyStruct, field_map) + + batch = batches[0] + assert batch.num_rows == 1 + assert batch.column("x").to_pylist() == [42] + assert batch.column("y").to_pylist() == pytest.approx([3.14]) + + def test_many_rows(self): + n = 1000 + structs = [NumericOnlyStruct(x=i, y=float(i) / 10.0) for i in range(n)] + field_map = {"x": "x", "y": "y"} + batches = _run_to_batches(structs, NumericOnlyStruct, field_map) + + batch = batches[0] + assert batch.num_rows == n + assert batch.column("x").to_pylist() == list(range(n)) + + def test_null_unset_fields(self): + """Unset struct fields should become null in Arrow.""" + s1 = ScalarStruct(i64=1, f64=1.1) + # s and b are unset + field_map = {"i64": "i64", "f64": "f64", "s": "s", "b": "b"} + batches = _run_to_batches([s1], ScalarStruct, field_map) + + batch = batches[0] + assert batch.column("i64").to_pylist() == [1] + assert batch.column("s").to_pylist() == [None] + assert batch.column("b").to_pylist() == [None] + + def test_multiple_ticks(self): + tick1 = [NumericOnlyStruct(x=10, y=1.0)] + tick2 = [NumericOnlyStruct(x=20, y=2.0)] + + field_map = {"x": "x", "y": "y"} + all_results = _run_multi_tick_write([tick1, tick2], NumericOnlyStruct, field_map) + + assert len(all_results) == 2 + assert all_results[0][0].column("x").to_pylist() == [10] + assert all_results[1][0].column("x").to_pylist() == [20] + + def test_no_field_map(self): + """No field_map: auto-include all non-numpy fields with identity naming.""" + structs = [NumericOnlyStruct(x=5, y=2.5)] + batches = _run_to_batches(structs, NumericOnlyStruct) + + batch = batches[0] + assert batch.num_rows == 1 + assert batch.column("x").to_pylist() == [5] + assert batch.column("y").to_pylist() == pytest.approx([2.5]) + + def test_datetime_types(self): + """datetime, timedelta, date, time fields.""" + dt_val = datetime(2024, 3, 15, 12, 0, 0) + td_val = timedelta(seconds=3600) + d_val = date(2024, 6, 15) + t_val = time(14, 30, 0) + + structs = [DateTimeStruct(dt=dt_val, td=td_val, d=d_val, t=t_val)] + field_map = {"dt": "dt", "td": "td", "d": "d", "t": "t"} + batches = _run_to_batches(structs, DateTimeStruct, field_map) + + batch = batches[0] + assert batch.num_rows == 1 + # Verify the arrow types + assert pa.types.is_timestamp(batch.schema.field("dt").type) + assert pa.types.is_duration(batch.schema.field("td").type) + assert pa.types.is_date32(batch.schema.field("d").type) + assert pa.types.is_time64(batch.schema.field("t").type) + + def test_enum_write(self): + """Enum fields written as strings.""" + structs = [ + EnumStruct(label="x", color=MyEnum.A), + EnumStruct(label="y", color=MyEnum.B), + ] + field_map = {"label": "label", "color": "color"} + batches = _run_to_batches(structs, EnumStruct, field_map) + + batch = batches[0] + assert batch.column("color").to_pylist() == ["A", "B"] + assert batch.column("label").to_pylist() == ["x", "y"] + + def test_bytes_write(self): + """Binary/bytes field.""" + val = b"my\x00value" + structs = [BytesStruct(data=val)] + field_map = {"data": "data"} + batches = _run_to_batches(structs, BytesStruct, field_map) + + batch = batches[0] + assert batch.column("data").to_pylist() == [val] + + def test_nested_struct_write(self): + """Nested struct field.""" + inner = InnerStruct(x=42, y=2.5) + structs = [NestedStruct(id=1, inner=inner)] + field_map = {"id": "id", "inner": "inner"} + batches = _run_to_batches(structs, NestedStruct, field_map) + + batch = batches[0] + assert batch.num_rows == 1 + assert batch.column("id").to_pylist() == [1] + inner_col = batch.column("inner") + assert inner_col.to_pylist() == [{"x": 42, "y": 2.5}] + + +# ===================================================================== +# Tests: writing numpy 1D array fields +# ===================================================================== + + +class TestWriteNumpy1DFields: + def test_float_array(self): + structs = [ + NumpyStruct(id=1, values=np.array([1.0, 2.0, 3.0])), + NumpyStruct(id=2, values=np.array([4.0, 5.0])), + ] + field_map = {"id": "id", "values": "values"} + batches = _run_to_batches(structs, NumpyStruct, field_map) + + batch = batches[0] + assert batch.num_rows == 2 + assert batch.column("id").to_pylist() == [1, 2] + vals = batch.column("values").to_pylist() + assert vals[0] == pytest.approx([1.0, 2.0, 3.0]) + assert vals[1] == pytest.approx([4.0, 5.0]) + + def test_int_array(self): + structs = [NumpyIntStruct(id=1, values=np.array([10, 20, 30], dtype=np.int64))] + field_map = {"id": "id", "values": "values"} + batches = _run_to_batches(structs, NumpyIntStruct, field_map) + + batch = batches[0] + assert batch.column("values").to_pylist() == [[10, 20, 30]] + + def test_string_array(self): + structs = [NumpyStringStruct(id=1, names=np.array(["alice", "bob", "carol"]))] + field_map = {"id": "id", "names": "names"} + batches = _run_to_batches(structs, NumpyStringStruct, field_map) + + batch = batches[0] + assert batch.column("names").to_pylist() == [["alice", "bob", "carol"]] + + def test_bool_array(self): + structs = [NumpyBoolStruct(id=1, flags=np.array([True, False, True]))] + field_map = {"id": "id", "flags": "flags"} + batches = _run_to_batches(structs, NumpyBoolStruct, field_map) + + batch = batches[0] + assert batch.column("flags").to_pylist() == [[True, False, True]] + + def test_empty_array(self): + structs = [NumpyStruct(id=1, values=np.array([], dtype=np.float64))] + field_map = {"id": "id", "values": "values"} + batches = _run_to_batches(structs, NumpyStruct, field_map) + + batch = batches[0] + assert batch.column("values").to_pylist() == [[]] + + def test_null_numpy_field(self): + """Unset numpy field should become null in Arrow.""" + structs = [NumpyStruct(id=1)] # values is unset + field_map = {"id": "id", "values": "values"} + batches = _run_to_batches(structs, NumpyStruct, field_map) + + batch = batches[0] + assert batch.column("id").to_pylist() == [1] + assert batch.column("values").to_pylist() == [None] + + def test_mixed_scalar_and_numpy(self): + structs = [ + MixedStruct(label="a", scores=np.array([0.1, 0.2])), + MixedStruct(label="b", scores=np.array([0.3, 0.4, 0.5])), + ] + field_map = {"label": "label", "scores": "scores"} + batches = _run_to_batches(structs, MixedStruct, field_map) + + batch = batches[0] + assert batch.column("label").to_pylist() == ["a", "b"] + vals = batch.column("scores").to_pylist() + assert vals[0] == pytest.approx([0.1, 0.2]) + assert vals[1] == pytest.approx([0.3, 0.4, 0.5]) + + +# ===================================================================== +# Tests: writing NDArray fields +# ===================================================================== + + +class TestWriteNumpyNDArrayFields: + def test_2d_array(self): + matrix = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) + structs = [NDArrayStruct(id=1, matrix=matrix)] + field_map = {"id": "id", "matrix": "matrix"} + batches = _run_to_batches(structs, NDArrayStruct, field_map) + + batch = batches[0] + assert batch.num_rows == 1 + data_col = batch.column("matrix").to_pylist() + assert data_col[0] == pytest.approx([1.0, 2.0, 3.0, 4.0, 5.0, 6.0]) + dims_col = batch.column("matrix_csp_dimensions").to_pylist() + assert dims_col[0] == [2, 3] + + def test_3d_array(self): + matrix = np.arange(24, dtype=float).reshape(2, 3, 4) + structs = [NDArrayStruct(id=1, matrix=matrix)] + field_map = {"id": "id", "matrix": "matrix"} + batches = _run_to_batches(structs, NDArrayStruct, field_map) + + batch = batches[0] + data_col = batch.column("matrix").to_pylist() + assert data_col[0] == pytest.approx(list(range(24))) + dims_col = batch.column("matrix_csp_dimensions").to_pylist() + assert dims_col[0] == [2, 3, 4] + + def test_custom_dims_column_name(self): + matrix = np.array([[1.0, 2.0], [3.0, 4.0]]) + structs = [NDArrayStruct(id=1, matrix=matrix)] + field_map = {"id": "id", "matrix": "matrix"} + numpy_dims = {"matrix": "my_dims"} + batches = _run_to_batches(structs, NDArrayStruct, field_map, numpy_dims) + + batch = batches[0] + assert "my_dims" in batch.schema.names + dims_col = batch.column("my_dims").to_pylist() + assert dims_col[0] == [2, 2] + + def test_multiple_rows_different_shapes(self): + m1 = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) + m2 = np.array([[10.0, 20.0], [30.0, 40.0]]) + structs = [ + NDArrayStruct(id=1, matrix=m1), + NDArrayStruct(id=2, matrix=m2), + ] + field_map = {"id": "id", "matrix": "matrix"} + batches = _run_to_batches(structs, NDArrayStruct, field_map) + + batch = batches[0] + assert batch.num_rows == 2 + data_col = batch.column("matrix").to_pylist() + assert data_col[0] == pytest.approx([1.0, 2.0, 3.0, 4.0, 5.0, 6.0]) + assert data_col[1] == pytest.approx([10.0, 20.0, 30.0, 40.0]) + dims_col = batch.column("matrix_csp_dimensions").to_pylist() + assert dims_col[0] == [2, 3] + assert dims_col[1] == [2, 2] + + +# ===================================================================== +# Tests: round-trip (struct → batch → struct) +# ===================================================================== + + +class TestRoundTrip: + def test_scalar_round_trip(self): + structs = [ + ScalarStruct(i64=1, f64=1.1, s="hello", b=True), + ScalarStruct(i64=2, f64=2.2, s="world", b=False), + ] + field_map = {"i64": "i64", "f64": "f64", "s": "s", "b": "b"} + schema = pa.schema([("i64", pa.int64()), ("f64", pa.float64()), ("s", pa.utf8()), ("b", pa.bool_())]) + result = _run_round_trip(structs, ScalarStruct, field_map, schema) + + assert len(result) == 2 + assert result[0].i64 == 1 + assert result[0].f64 == pytest.approx(1.1) + assert result[0].s == "hello" + assert result[0].b is True + assert result[1].i64 == 2 + assert result[1].s == "world" + + def test_datetime_round_trip(self): + """Round-trip for datetime, timedelta, date, time.""" + dt_val = datetime(2024, 3, 15, 12, 0, 0) + td_val = timedelta(seconds=3600) + d_val = date(2024, 6, 15) + t_val = time(14, 30, 0) + + structs = [DateTimeStruct(dt=dt_val, td=td_val, d=d_val, t=t_val)] + field_map = {"dt": "dt", "td": "td", "d": "d", "t": "t"} + schema = pa.schema( + [ + ("dt", pa.timestamp("ns", tz="UTC")), + ("td", pa.duration("ns")), + ("d", pa.date32()), + ("t", pa.time64("ns")), + ] + ) + result = _run_round_trip(structs, DateTimeStruct, field_map, schema) + + assert len(result) == 1 + assert result[0].dt == dt_val + assert result[0].td == td_val + assert result[0].d == d_val + assert result[0].t == t_val + + def test_enum_round_trip(self): + """Round-trip for enum fields.""" + structs = [ + EnumStruct(label="x", color=MyEnum.A), + EnumStruct(label="y", color=MyEnum.C), + ] + field_map = {"label": "label", "color": "color"} + schema = pa.schema([("label", pa.utf8()), ("color", pa.utf8())]) + result = _run_round_trip(structs, EnumStruct, field_map, schema) + + assert len(result) == 2 + assert result[0].label == "x" + assert result[0].color == MyEnum.A + assert result[1].color == MyEnum.C + + def test_bytes_round_trip(self): + """Round-trip for bytes field.""" + val = b"my\x00value" + structs = [BytesStruct(data=val)] + field_map = {"data": "data"} + schema = pa.schema([("data", pa.binary())]) + result = _run_round_trip(structs, BytesStruct, field_map, schema) + + assert len(result) == 1 + assert result[0].data == val + + def test_nested_struct_round_trip(self): + """Round-trip for nested struct.""" + inner = InnerStruct(x=42, y=2.5) + structs = [NestedStruct(id=1, inner=inner)] + field_map = {"id": "id", "inner": "inner"} + schema = pa.schema([("id", pa.int64()), ("inner", pa.struct([("x", pa.int64()), ("y", pa.float64())]))]) + result = _run_round_trip(structs, NestedStruct, field_map, schema) + + assert len(result) == 1 + assert result[0].id == 1 + assert result[0].inner.x == 42 + assert result[0].inner.y == pytest.approx(2.5) + + def test_numpy_round_trip(self): + structs = [ + NumpyStruct(id=1, values=np.array([1.0, 2.0, 3.0])), + NumpyStruct(id=2, values=np.array([4.0, 5.0])), + ] + field_map = {"id": "id", "values": "values"} + schema = pa.schema([("id", pa.int64()), ("values", pa.list_(pa.float64()))]) + result = _run_round_trip(structs, NumpyStruct, field_map, schema) + + assert len(result) == 2 + assert result[0].id == 1 + np.testing.assert_array_almost_equal(result[0].values, [1.0, 2.0, 3.0]) + assert result[1].id == 2 + np.testing.assert_array_almost_equal(result[1].values, [4.0, 5.0]) + + def test_ndarray_round_trip(self): + matrix = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) + structs = [NDArrayStruct(id=1, matrix=matrix)] + field_map = {"id": "id", "matrix": "matrix"} + schema = pa.schema( + [("id", pa.int64()), ("matrix", pa.list_(pa.float64())), ("matrix_csp_dimensions", pa.list_(pa.int64()))] + ) + result = _run_round_trip(structs, NDArrayStruct, field_map, schema) + + assert len(result) == 1 + assert result[0].id == 1 + np.testing.assert_array_almost_equal(result[0].matrix, matrix) + assert result[0].matrix.shape == (2, 3) + + def test_all_types_round_trip(self): + """Round-trip for all supported scalar types.""" + structs = [ + AllTypesStruct( + b=True, + i=123, + d=123.456, + dt=datetime(2024, 1, 1, 12, 0, 0), + dte=date(2024, 6, 15), + t=time(14, 30, 0), + td=timedelta(seconds=3600, milliseconds=123), + s="hello", + e=MyEnum.A, + ), + AllTypesStruct( + b=False, + i=456, + d=789.012, + dt=datetime(2024, 6, 15, 0, 0, 0), + dte=date(2025, 1, 1), + t=time(0, 0, 0), + td=timedelta(seconds=0), + s="world", + e=MyEnum.B, + ), + ] + field_map = {k: k for k in AllTypesStruct.metadata().keys()} + schema = pa.schema( + [ + ("b", pa.bool_()), + ("i", pa.int64()), + ("d", pa.float64()), + ("dt", pa.timestamp("ns", tz="UTC")), + ("dte", pa.date32()), + ("t", pa.time64("ns")), + ("td", pa.duration("ns")), + ("s", pa.utf8()), + ("e", pa.utf8()), + ] + ) + result = _run_round_trip(structs, AllTypesStruct, field_map, schema) + + assert len(result) == 2 + assert result[0].b is True + assert result[0].i == 123 + assert result[0].d == pytest.approx(123.456) + assert result[0].dt == datetime(2024, 1, 1, 12, 0, 0) + assert result[0].dte == date(2024, 6, 15) + assert result[0].t == time(14, 30, 0) + assert result[0].td == timedelta(seconds=3600, milliseconds=123) + assert result[0].s == "hello" + assert result[0].e == MyEnum.A + assert result[1].b is False + assert result[1].i == 456 + assert result[1].s == "world" + assert result[1].e == MyEnum.B + + +# ===================================================================== +# Tests: reverse round-trip (batch → struct → batch) +# ===================================================================== + + +class TestReverseRoundTrip: + """Convert batch → struct → batch and verify the result matches the original.""" + + def test_scalar_batch_to_struct_to_batch(self): + """batch → struct → batch for basic scalar types.""" + original = pa.RecordBatch.from_pydict( + {"i64": [1, 2, 3], "f64": [1.1, 2.2, 3.3], "s": ["a", "b", "c"], "b": [True, False, True]} + ) + field_map = {"i64": "i64", "f64": "f64", "s": "s", "b": "b"} + result_batches = _run_reverse_round_trip(original, ScalarStruct, field_map) + + assert len(result_batches) == 1 + result = result_batches[0] + assert result.num_rows == 3 + assert result.column("i64").to_pylist() == [1, 2, 3] + assert result.column("f64").to_pylist() == pytest.approx([1.1, 2.2, 3.3]) + assert result.column("s").to_pylist() == ["a", "b", "c"] + assert result.column("b").to_pylist() == [True, False, True] + + def test_numpy_batch_to_struct_to_batch(self): + """batch → struct → batch for numpy 1D arrays.""" + original = pa.RecordBatch.from_pydict( + {"id": [1, 2], "values": [[1.0, 2.0, 3.0], [4.0, 5.0]]}, + schema=pa.schema([("id", pa.int64()), ("values", pa.list_(pa.float64()))]), + ) + field_map = {"id": "id", "values": "values"} + result_batches = _run_reverse_round_trip(original, NumpyStruct, field_map) + + assert len(result_batches) == 1 + result = result_batches[0] + assert result.num_rows == 2 + assert result.column("id").to_pylist() == [1, 2] + vals = result.column("values").to_pylist() + assert vals[0] == pytest.approx([1.0, 2.0, 3.0]) + assert vals[1] == pytest.approx([4.0, 5.0]) + + def test_ndarray_batch_to_struct_to_batch(self): + """batch → struct → batch for NDArrays.""" + original = pa.RecordBatch.from_pydict( + {"id": [1], "matrix": [[1.0, 2.0, 3.0, 4.0, 5.0, 6.0]], "matrix_csp_dimensions": [[2, 3]]}, + schema=pa.schema( + [ + ("id", pa.int64()), + ("matrix", pa.list_(pa.float64())), + ("matrix_csp_dimensions", pa.list_(pa.int64())), + ] + ), + ) + field_map = {"id": "id", "matrix": "matrix"} + result_batches = _run_reverse_round_trip(original, NDArrayStruct, field_map) + + assert len(result_batches) == 1 + result = result_batches[0] + assert result.num_rows == 1 + assert result.column("matrix").to_pylist()[0] == pytest.approx([1.0, 2.0, 3.0, 4.0, 5.0, 6.0]) + assert result.column("matrix_csp_dimensions").to_pylist()[0] == [2, 3] + + def test_all_types_batch_to_struct_to_batch(self): + """batch → struct → batch for all scalar types.""" + dt_val = datetime(2024, 3, 15, 12, 0, 0) + td_val = timedelta(seconds=3600) + d_val = date(2024, 6, 15) + t_val = time(14, 30, 0) + + # Use known UTC nanosecond values to avoid timezone ambiguity + dt_ns = 1710504000 * 10**9 # 2024-03-15T12:00:00 UTC + td_ns = int(td_val.total_seconds() * 1e9) + d_days = (d_val - date(1970, 1, 1)).days + t_ns = (t_val.hour * 3600 + t_val.minute * 60 + t_val.second) * 10**9 + + original = pa.RecordBatch.from_arrays( + [ + pa.array([True], type=pa.bool_()), + pa.array([123], type=pa.int64()), + pa.array([123.456], type=pa.float64()), + pa.array([dt_ns], type=pa.timestamp("ns", tz="UTC")), + pa.array([d_days], type=pa.date32()), + pa.array([t_ns], type=pa.time64("ns")), + pa.array([td_ns], type=pa.duration("ns")), + pa.array(["hello"], type=pa.utf8()), + pa.array(["A"], type=pa.utf8()), + ], + schema=pa.schema( + [ + ("b", pa.bool_()), + ("i", pa.int64()), + ("d", pa.float64()), + ("dt", pa.timestamp("ns", tz="UTC")), + ("dte", pa.date32()), + ("t", pa.time64("ns")), + ("td", pa.duration("ns")), + ("s", pa.utf8()), + ("e", pa.utf8()), + ] + ), + ) + field_map = {k: k for k in AllTypesStruct.metadata().keys()} + result_batches = _run_reverse_round_trip(original, AllTypesStruct, field_map) + + assert len(result_batches) == 1 + result = result_batches[0] + assert result.num_rows == 1 + assert result.column("b").to_pylist() == [True] + assert result.column("i").to_pylist() == [123] + assert result.column("d").to_pylist() == pytest.approx([123.456]) + assert result.column("s").to_pylist() == ["hello"] + assert result.column("e").to_pylist() == ["A"] + + +# ===================================================================== +# Tests: memory leak detection +# ===================================================================== + + +class TestMemoryLeak: + """Run repeated conversions and check that memory does not grow unboundedly. + + Uses psutil to measure RSS. Each test runs a large number of iterations with + substantial data per iteration, then checks that memory growth after warmup + stays within a reasonable bound. A real leak of even a few KB per iteration + would accumulate to hundreds of MB over 5000 iterations. + """ + + @staticmethod + def _get_rss_mb(): + import psutil + + return psutil.Process().memory_info().rss / (1024 * 1024) + + def test_struct_to_batch_to_struct_no_leak(self): + """Repeated struct → batch → struct should not leak memory.""" + import gc + + n_warmup = 50 + n_iters = 5000 + rows_per_iter = 500 + + field_map = {"id": "id", "values": "values"} + read_schema = pa.schema([("id", pa.int64()), ("values", pa.list_(pa.float64()))]) + + def run_one(): + structs = [NumpyStruct(id=i, values=np.random.rand(100)) for i in range(rows_per_iter)] + + @csp.graph + def g(s_: object): + data = csp.const(s_) + batches = struct_to_record_batches(data, NumpyStruct, field_map) + result = record_batches_to_struct(batches, NumpyStruct, field_map, read_schema) + csp.add_graph_output("r", result) + + csp.run(g, structs, starttime=_STARTTIME, endtime=_STARTTIME + timedelta(seconds=1)) + + # Warmup — let allocators, caches, JIT, etc. stabilize + for _ in range(n_warmup): + run_one() + + gc.collect() + baseline_mb = self._get_rss_mb() + + for _ in range(n_iters): + run_one() + + gc.collect() + final_mb = self._get_rss_mb() + growth_mb = final_mb - baseline_mb + + # With 5000 iterations x 500 rows x 100 floats, total data processed is ~2GB. + # A leak of even 1KB/iter would be 5MB. We allow 100MB for allocator noise. + assert growth_mb < 100, ( + f"Memory grew by {growth_mb:.1f} MB over {n_iters} iterations " + f"({rows_per_iter} rows/iter, baseline={baseline_mb:.1f} MB, " + f"final={final_mb:.1f} MB) — possible leak" + ) + + def test_batch_to_struct_to_batch_no_leak(self): + """Repeated batch → struct → batch should not leak memory.""" + import gc + + n_warmup = 50 + n_iters = 5000 + rows_per_iter = 500 + + field_map = {"id": "id", "values": "values"} + read_schema = pa.schema([("id", pa.int64()), ("values", pa.list_(pa.float64()))]) + + def make_batch(): + ids = list(range(rows_per_iter)) + vals = [np.random.rand(100).tolist() for _ in range(rows_per_iter)] + return pa.RecordBatch.from_pydict( + {"id": ids, "values": vals}, + schema=read_schema, + ) + + def run_one(): + batch = make_batch() + + @csp.graph + def g(b_: object, schema_: object): + data = csp.const([b_]) + structs = record_batches_to_struct(data, NumpyStruct, field_map, schema_) + batches = struct_to_record_batches(structs, NumpyStruct, field_map) + csp.add_graph_output("r", batches) + + csp.run(g, batch, read_schema, starttime=_STARTTIME, endtime=_STARTTIME + timedelta(seconds=1)) + + # Warmup + for _ in range(n_warmup): + run_one() + + gc.collect() + baseline_mb = self._get_rss_mb() + + for _ in range(n_iters): + run_one() + + gc.collect() + final_mb = self._get_rss_mb() + growth_mb = final_mb - baseline_mb + + assert growth_mb < 100, ( + f"Memory grew by {growth_mb:.1f} MB over {n_iters} iterations " + f"({rows_per_iter} rows/iter, baseline={baseline_mb:.1f} MB, " + f"final={final_mb:.1f} MB) — possible leak" + ) + + +# ===================================================================== +# Tests for all Arrow reader types (ensuring full type coverage) +# ===================================================================== + + +class TestReadAllArrowTypes: + """Test every Arrow type supported by ArrowFieldReader. + + CSP Python only supports int (int64) for integers. These tests verify that + narrow Arrow integer types (int8/16/32, uint8/16/32/64) are correctly + widened to int64 when reading into CSP struct fields. + """ + + # --- Narrow integer types --- + + @pytest.mark.parametrize( + "arrow_type", + [pa.int8(), pa.int16(), pa.int32(), pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()], + ids=["int8", "int16", "int32", "uint8", "uint16", "uint32", "uint64"], + ) + def test_narrow_integer_types(self, arrow_type): + """Arrow narrow integers should widen to CSP int64.""" + arr = pa.array([10, 20, 30], type=arrow_type) + batch = pa.record_batch( + { + "i64": arr, + "f64": pa.array([1.0, 2.0, 3.0]), + "s": pa.array(["a", "b", "c"]), + "b": pa.array([True, False, True]), + } + ) + field_map = {"i64": "i64", "f64": "f64", "s": "s", "b": "b"} + structs = _run_to_struct(batch, ScalarStruct, field_map, batch.schema) + assert len(structs) == 3 + assert structs[0].i64 == 10 + assert structs[1].i64 == 20 + assert structs[2].i64 == 30 + + # --- Float types --- + + def test_float32(self): + """Arrow float32 should widen to CSP float64.""" + arr = pa.array([1.5, 2.5, 3.5], type=pa.float32()) + batch = pa.record_batch( + {"f64": arr, "i64": pa.array([1, 2, 3]), "s": pa.array(["a", "b", "c"]), "b": pa.array([True, False, True])} + ) + field_map = {"f64": "f64", "i64": "i64", "s": "s", "b": "b"} + structs = _run_to_struct(batch, ScalarStruct, field_map, batch.schema) + assert abs(structs[0].f64 - 1.5) < 1e-5 + assert abs(structs[1].f64 - 2.5) < 1e-5 + + def test_float16(self): + """Arrow float16 (half_float) should widen to CSP float64.""" + # PyArrow float16 has limited precision + arr = pa.array([1.0, 2.0, 3.0], type=pa.float16()) + batch = pa.record_batch( + {"f64": arr, "i64": pa.array([1, 2, 3]), "s": pa.array(["a", "b", "c"]), "b": pa.array([True, False, True])} + ) + field_map = {"f64": "f64", "i64": "i64", "s": "s", "b": "b"} + structs = _run_to_struct(batch, ScalarStruct, field_map, batch.schema) + assert abs(structs[0].f64 - 1.0) < 0.1 + assert abs(structs[1].f64 - 2.0) < 0.1 + assert abs(structs[2].f64 - 3.0) < 0.1 + + # --- String variants --- + + def test_large_string(self): + """Arrow large_string should read into CSP str.""" + arr = pa.array(["hello", "world", "test"], type=pa.large_string()) + batch = pa.record_batch( + {"s": arr, "i64": pa.array([1, 2, 3]), "f64": pa.array([1.0, 2.0, 3.0]), "b": pa.array([True, False, True])} + ) + field_map = {"s": "s", "i64": "i64", "f64": "f64", "b": "b"} + structs = _run_to_struct(batch, ScalarStruct, field_map, batch.schema) + assert structs[0].s == "hello" + assert structs[1].s == "world" + assert structs[2].s == "test" + + # --- Binary variants --- + + def test_large_binary(self): + """Arrow large_binary should read into CSP bytes.""" + arr = pa.array([b"\x01\x02", b"\x03\x04", b"\x05"], type=pa.large_binary()) + batch = pa.record_batch({"data": arr}) + field_map = {"data": "data"} + structs = _run_to_struct(batch, BytesStruct, field_map, batch.schema) + assert structs[0].data == b"\x01\x02" + assert structs[1].data == b"\x03\x04" + assert structs[2].data == b"\x05" + + def test_fixed_size_binary(self): + """Arrow fixed_size_binary should read into CSP bytes.""" + arr = pa.array([b"\x01\x02\x03", b"\x04\x05\x06", b"\x07\x08\x09"], type=pa.binary(3)) + batch = pa.record_batch({"data": arr}) + field_map = {"data": "data"} + structs = _run_to_struct(batch, BytesStruct, field_map, batch.schema) + assert structs[0].data == b"\x01\x02\x03" + assert structs[1].data == b"\x04\x05\x06" + assert structs[2].data == b"\x07\x08\x09" + + # --- Timestamp units --- + + @pytest.mark.parametrize( + "unit", + ["s", "ms", "us", "ns"], + ids=["seconds", "milliseconds", "microseconds", "nanoseconds"], + ) + def test_timestamp_units(self, unit): + """All timestamp units should read into CSP datetime.""" + dt_val = datetime(2023, 6, 15, 12, 30, 45) + arr = pa.array([dt_val], type=pa.timestamp(unit)) + batch = pa.record_batch( + { + "dt": arr, + "td": pa.array([timedelta(seconds=1)]), + "d": pa.array([date(2023, 6, 15)]), + "t": pa.array([time(12, 30, 45)]), + } + ) + field_map = {"dt": "dt", "td": "td", "d": "d", "t": "t"} + structs = _run_to_struct(batch, DateTimeStruct, field_map, batch.schema) + assert structs[0].dt == dt_val + + # --- Duration units --- + + @pytest.mark.parametrize( + "unit", + ["s", "ms", "us", "ns"], + ids=["seconds", "milliseconds", "microseconds", "nanoseconds"], + ) + def test_duration_units(self, unit): + """All duration units should read into CSP timedelta.""" + td_val = timedelta(seconds=42) + arr = pa.array([td_val], type=pa.duration(unit)) + batch = pa.record_batch( + { + "td": arr, + "dt": pa.array([datetime(2023, 1, 1)]), + "d": pa.array([date(2023, 1, 1)]), + "t": pa.array([time(0, 0, 0)]), + } + ) + field_map = {"dt": "dt", "td": "td", "d": "d", "t": "t"} + structs = _run_to_struct(batch, DateTimeStruct, field_map, batch.schema) + assert structs[0].td == td_val + + # --- Date64 --- + + def test_date64(self): + """Arrow date64 should read into CSP date.""" + d_val = date(2023, 6, 15) + # date64 stores milliseconds since epoch + arr = pa.array([d_val], type=pa.date64()) + batch = pa.record_batch( + { + "d": arr, + "dt": pa.array([datetime(2023, 1, 1)]), + "td": pa.array([timedelta(seconds=1)]), + "t": pa.array([time(0, 0, 0)]), + } + ) + field_map = {"dt": "dt", "td": "td", "d": "d", "t": "t"} + structs = _run_to_struct(batch, DateTimeStruct, field_map, batch.schema) + assert structs[0].d == d_val + + # --- Time32 --- + + @pytest.mark.parametrize("unit", ["s", "ms"], ids=["seconds", "milliseconds"]) + def test_time32(self, unit): + """Arrow time32 (s, ms) should read into CSP time.""" + t_val = time(12, 30, 45) + arr = pa.array([t_val], type=pa.time32(unit)) + batch = pa.record_batch( + { + "t": arr, + "dt": pa.array([datetime(2023, 1, 1)]), + "td": pa.array([timedelta(seconds=1)]), + "d": pa.array([date(2023, 1, 1)]), + } + ) + field_map = {"dt": "dt", "td": "td", "d": "d", "t": "t"} + structs = _run_to_struct(batch, DateTimeStruct, field_map, batch.schema) + assert structs[0].t.hour == 12 + assert structs[0].t.minute == 30 + assert structs[0].t.second == 45 + + # --- Time64 --- + + @pytest.mark.parametrize("unit", ["us", "ns"], ids=["microseconds", "nanoseconds"]) + def test_time64(self, unit): + """Arrow time64 (us, ns) should read into CSP time.""" + t_val = time(14, 15, 16, 123456) + arr = pa.array([t_val], type=pa.time64(unit)) + batch = pa.record_batch( + { + "t": arr, + "dt": pa.array([datetime(2023, 1, 1)]), + "td": pa.array([timedelta(seconds=1)]), + "d": pa.array([date(2023, 1, 1)]), + } + ) + field_map = {"dt": "dt", "td": "td", "d": "d", "t": "t"} + structs = _run_to_struct(batch, DateTimeStruct, field_map, batch.schema) + assert structs[0].t.hour == 14 + assert structs[0].t.minute == 15 + assert structs[0].t.second == 16 + + # --- Dictionary-encoded string --- + + def test_dictionary_string(self): + """Arrow dictionary-encoded string should read into CSP str.""" + arr = pa.array(["foo", "bar", "foo", "baz"]).dictionary_encode() + batch = pa.record_batch( + { + "s": arr, + "i64": pa.array([1, 2, 3, 4]), + "f64": pa.array([1.0, 2.0, 3.0, 4.0]), + "b": pa.array([True, True, False, False]), + } + ) + field_map = {"s": "s", "i64": "i64", "f64": "f64", "b": "b"} + structs = _run_to_struct(batch, ScalarStruct, field_map, batch.schema) + assert structs[0].s == "foo" + assert structs[1].s == "bar" + assert structs[2].s == "foo" + assert structs[3].s == "baz" + + # --- Dictionary-encoded enum --- + + def test_dictionary_enum(self): + """Arrow dictionary-encoded string should read into CSP enum.""" + arr = pa.array(["A", "B", "C", "A"]).dictionary_encode() + batch = pa.record_batch({"label": pa.array(["x", "y", "z", "w"]), "color": arr}) + field_map = {"label": "label", "color": "color"} + structs = _run_to_struct(batch, EnumStruct, field_map, batch.schema) + assert structs[0].color == MyEnum.A + assert structs[1].color == MyEnum.B + assert structs[2].color == MyEnum.C + assert structs[3].color == MyEnum.A + + # --- Enum from large_string --- + + def test_enum_from_large_string(self): + """Arrow large_string should read into CSP enum.""" + arr = pa.array(["B", "C", "A"], type=pa.large_string()) + batch = pa.record_batch({"label": pa.array(["x", "y", "z"]), "color": arr}) + field_map = {"label": "label", "color": "color"} + structs = _run_to_struct(batch, EnumStruct, field_map, batch.schema) + assert structs[0].color == MyEnum.B + assert structs[1].color == MyEnum.C + assert structs[2].color == MyEnum.A + + # --- Null handling for all types --- + + def test_narrow_int_with_nulls(self): + """Arrow narrow int with nulls should leave CSP field unset.""" + arr = pa.array([10, None, 30], type=pa.int16()) + batch = pa.record_batch({"x": arr, "y": pa.array([1.0, 2.0, 3.0])}) + field_map = {"x": "x", "y": "y"} + structs = _run_to_struct(batch, NumericOnlyStruct, field_map, batch.schema) + assert structs[0].x == 10 + assert not hasattr(structs[1], "x") # null -> unset + assert structs[2].x == 30 + + def test_float16_with_nulls(self): + """Arrow float16 with nulls should leave CSP field unset.""" + arr = pa.array([1.0, None, 3.0], type=pa.float16()) + batch = pa.record_batch( + {"f64": arr, "i64": pa.array([1, 2, 3]), "s": pa.array(["a", "b", "c"]), "b": pa.array([True, False, True])} + ) + field_map = {"f64": "f64", "i64": "i64", "s": "s", "b": "b"} + structs = _run_to_struct(batch, ScalarStruct, field_map, batch.schema) + assert abs(structs[0].f64 - 1.0) < 0.1 + assert not hasattr(structs[1], "f64") + assert abs(structs[2].f64 - 3.0) < 0.1 + + def test_large_string_with_nulls(self): + """Arrow large_string with nulls should leave CSP field unset.""" + arr = pa.array(["hello", None, "world"], type=pa.large_string()) + batch = pa.record_batch( + {"s": arr, "i64": pa.array([1, 2, 3]), "f64": pa.array([1.0, 2.0, 3.0]), "b": pa.array([True, False, True])} + ) + field_map = {"s": "s", "i64": "i64", "f64": "f64", "b": "b"} + structs = _run_to_struct(batch, ScalarStruct, field_map, batch.schema) + assert structs[0].s == "hello" + assert not hasattr(structs[1], "s") + assert structs[2].s == "world" + + def test_large_binary_with_nulls(self): + """Arrow large_binary with nulls should leave CSP field unset.""" + arr = pa.array([b"\x01", None, b"\x03"], type=pa.large_binary()) + batch = pa.record_batch({"data": arr}) + field_map = {"data": "data"} + structs = _run_to_struct(batch, BytesStruct, field_map, batch.schema) + assert structs[0].data == b"\x01" + assert not hasattr(structs[1], "data") + assert structs[2].data == b"\x03" + + def test_fixed_size_binary_with_nulls(self): + """Arrow fixed_size_binary with nulls should leave CSP field unset.""" + arr = pa.array([b"\x01\x02\x03", None, b"\x07\x08\x09"], type=pa.binary(3)) + batch = pa.record_batch({"data": arr}) + field_map = {"data": "data"} + structs = _run_to_struct(batch, BytesStruct, field_map, batch.schema) + assert structs[0].data == b"\x01\x02\x03" + assert not hasattr(structs[1], "data") + assert structs[2].data == b"\x07\x08\x09" + + def test_enum_from_string_with_nulls(self): + """Arrow string enum with nulls should leave CSP field unset.""" + arr = pa.array(["A", None, "C"]) + batch = pa.record_batch({"label": pa.array(["x", "y", "z"]), "color": arr}) + field_map = {"label": "label", "color": "color"} + structs = _run_to_struct(batch, EnumStruct, field_map, batch.schema) + assert structs[0].color == MyEnum.A + assert not hasattr(structs[1], "color") + assert structs[2].color == MyEnum.C + + def test_enum_from_large_string_with_nulls(self): + """Arrow large_string enum with nulls should leave CSP field unset.""" + arr = pa.array(["B", None, "A"], type=pa.large_string()) + batch = pa.record_batch({"label": pa.array(["x", "y", "z"]), "color": arr}) + field_map = {"label": "label", "color": "color"} + structs = _run_to_struct(batch, EnumStruct, field_map, batch.schema) + assert structs[0].color == MyEnum.B + assert not hasattr(structs[1], "color") + assert structs[2].color == MyEnum.A + + def test_dictionary_string_with_nulls(self): + """Arrow dictionary-encoded string with nulls should leave CSP field unset.""" + arr = pa.array(["foo", None, "baz"]).dictionary_encode() + batch = pa.record_batch( + {"s": arr, "i64": pa.array([1, 2, 3]), "f64": pa.array([1.0, 2.0, 3.0]), "b": pa.array([True, False, True])} + ) + field_map = {"s": "s", "i64": "i64", "f64": "f64", "b": "b"} + structs = _run_to_struct(batch, ScalarStruct, field_map, batch.schema) + assert structs[0].s == "foo" + assert not hasattr(structs[1], "s") + assert structs[2].s == "baz" + + def test_dictionary_enum_with_nulls(self): + """Arrow dictionary-encoded enum with nulls should leave CSP field unset.""" + arr = pa.array(["A", None, "C"]).dictionary_encode() + batch = pa.record_batch({"label": pa.array(["x", "y", "z"]), "color": arr}) + field_map = {"label": "label", "color": "color"} + structs = _run_to_struct(batch, EnumStruct, field_map, batch.schema) + assert structs[0].color == MyEnum.A + assert not hasattr(structs[1], "color") + assert structs[2].color == MyEnum.C + + def test_date64_with_nulls(self): + """Arrow date64 with nulls should leave CSP field unset.""" + + class DateOnlyStruct(csp.Struct): + d: date + + arr = pa.array([date(2023, 6, 15), None, date(2024, 1, 1)], type=pa.date64()) + batch = pa.record_batch({"d": arr}) + field_map = {"d": "d"} + structs = _run_to_struct(batch, DateOnlyStruct, field_map, batch.schema) + assert structs[0].d == date(2023, 6, 15) + assert not hasattr(structs[1], "d") + assert structs[2].d == date(2024, 1, 1) + + def test_time32_with_nulls(self): + """Arrow time32 with nulls should leave CSP field unset.""" + + class TimeOnlyStruct(csp.Struct): + t: time + + arr = pa.array([time(12, 30, 0), None, time(14, 0, 0)], type=pa.time32("s")) + batch = pa.record_batch({"t": arr}) + field_map = {"t": "t"} + structs = _run_to_struct(batch, TimeOnlyStruct, field_map, batch.schema) + assert structs[0].t.hour == 12 + assert structs[0].t.minute == 30 + assert not hasattr(structs[1], "t") + assert structs[2].t.hour == 14 + + def test_time64_with_nulls(self): + """Arrow time64 with nulls should leave CSP field unset.""" + + class TimeOnlyStruct(csp.Struct): + t: time + + arr = pa.array([time(14, 15, 16), None, time(0, 0, 1)], type=pa.time64("us")) + batch = pa.record_batch({"t": arr}) + field_map = {"t": "t"} + structs = _run_to_struct(batch, TimeOnlyStruct, field_map, batch.schema) + assert structs[0].t.hour == 14 + assert not hasattr(structs[1], "t") + assert structs[2].t.hour == 0 + assert structs[2].t.second == 1 + + # --- All-null column --- + + def test_all_null_column(self): + """A column where every value is null should leave all struct fields unset.""" + arr = pa.array([None, None, None], type=pa.int64()) + batch = pa.record_batch({"x": arr, "y": pa.array([1.0, 2.0, 3.0])}) + field_map = {"x": "x", "y": "y"} + structs = _run_to_struct(batch, NumericOnlyStruct, field_map, batch.schema) + assert len(structs) == 3 + for s in structs: + assert not hasattr(s, "x") + assert structs[0].y == pytest.approx(1.0) + + # --- Multiple rows with mixed narrow types --- + + def test_many_rows_narrow_int(self): + """Read many rows from a narrow int column.""" + n = 1000 + arr = pa.array(list(range(n)), type=pa.int32()) + batch = pa.record_batch({"x": arr, "y": pa.array([float(i) for i in range(n)])}) + field_map = {"x": "x", "y": "y"} + structs = _run_to_struct(batch, NumericOnlyStruct, field_map, batch.schema) + assert len(structs) == n + for i in range(n): + assert structs[i].x == i + assert structs[i].y == float(i) + + # --- Binary with nulls --- + + def test_binary_with_nulls(self): + """Arrow binary with nulls should leave CSP field unset.""" + arr = pa.array([b"\x01", None, b"\x03"]) + batch = pa.record_batch({"data": arr}) + field_map = {"data": "data"} + structs = _run_to_struct(batch, BytesStruct, field_map, batch.schema) + assert structs[0].data == b"\x01" + assert not hasattr(structs[1], "data") + assert structs[2].data == b"\x03" + + +# ===================================================================== +# Tests: null handling for common read types +# ===================================================================== + + +class TestReadNullHandling: + """Test that null values in various Arrow column types leave CSP struct fields unset.""" + + @pytest.mark.parametrize("null_field", ["i64", "f64", "s", "b"]) + def test_null_basic_scalar(self, null_field): + """Null in a basic scalar column (int64, float64, string, bool) leaves the field unset.""" + data = {"i64": [1, 2, 3], "f64": [1.0, 2.0, 3.0], "s": ["a", "b", "c"], "b": [True, False, True]} + data[null_field] = [data[null_field][0], None, data[null_field][2]] + batch = pa.record_batch(data) + field_map = {k: k for k in data} + structs = _run_to_struct(batch, ScalarStruct, field_map, batch.schema) + assert hasattr(structs[0], null_field) + assert not hasattr(structs[1], null_field) + assert hasattr(structs[2], null_field) + + @pytest.mark.parametrize("null_field", ["dt", "td", "d", "t"]) + def test_null_temporal(self, null_field): + """Null in a temporal column (datetime, timedelta, date, time) leaves the field unset.""" + vals = { + "dt": [datetime(2024, 1, 1), datetime(2024, 1, 2)], + "td": [timedelta(seconds=42), timedelta(seconds=2)], + "d": [date(2024, 6, 15), date(2024, 1, 2)], + "t": [time(14, 30, 0), time(13, 0, 0)], + } + # Set the null field's second value to None + arrays = [] + schema_fields = [ + ("dt", pa.timestamp("ns", tz="UTC")), + ("td", pa.duration("ns")), + ("d", pa.date32()), + ("t", pa.time64("ns")), + ] + for field_name, arrow_type in schema_fields: + v = vals[field_name] + if field_name == null_field: + v = [v[0], None] + arrays.append(pa.array(v, type=arrow_type)) + schema = pa.schema(schema_fields) + batch = pa.RecordBatch.from_arrays(arrays, schema=schema) + field_map = {"dt": "dt", "td": "td", "d": "d", "t": "t"} + structs = _run_to_struct(batch, DateTimeStruct, field_map, batch.schema) + assert hasattr(structs[0], null_field) + assert not hasattr(structs[1], null_field) + + def test_null_nested_struct(self): + """Null nested struct should leave the field unset, and child readers stay in sync.""" + inner_type = pa.struct([("x", pa.int64()), ("y", pa.float64())]) + inner_arr = pa.StructArray.from_arrays( + [pa.array([42, None, 99]), pa.array([2.5, None, 9.9])], + fields=[pa.field("x", pa.int64()), pa.field("y", pa.float64())], + mask=pa.array([False, True, False]), # second row is null + ) + batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3]), inner_arr], + schema=pa.schema([("id", pa.int64()), ("inner", inner_type)]), + ) + field_map = {"id": "id", "inner": "inner"} + structs = _run_to_struct(batch, NestedStruct, field_map, batch.schema) + + assert len(structs) == 3 + assert structs[0].inner.x == 42 + assert structs[0].inner.y == pytest.approx(2.5) + assert not hasattr(structs[1], "inner") + assert structs[2].inner.x == 99 + assert structs[2].inner.y == pytest.approx(9.9) + + def test_empty_batch_read(self): + """Reading an empty RecordBatch should produce an empty list.""" + schema = pa.schema([("x", pa.int64()), ("y", pa.float64())]) + batch = pa.RecordBatch.from_pydict({"x": [], "y": []}, schema=schema) + field_map = {"x": "x", "y": "y"} + structs = _run_to_struct(batch, NumericOnlyStruct, field_map, batch.schema) + assert len(structs) == 0 + + +# ===================================================================== +# Tests: null handling and edge cases for write direction +# ===================================================================== + + +class TestWriteNullAndEdgeCases: + """Test null/unset fields and edge cases in the write direction.""" + + def test_null_datetime_fields(self): + """Unset datetime/timedelta/date/time fields should become null in Arrow.""" + s = DateTimeStruct(dt=datetime(2024, 1, 1)) # only dt is set + field_map = {"dt": "dt", "td": "td", "d": "d", "t": "t"} + batches = _run_to_batches([s], DateTimeStruct, field_map) + + batch = batches[0] + assert batch.column("td").to_pylist() == [None] + assert batch.column("d").to_pylist() == [None] + assert batch.column("t").to_pylist() == [None] + + def test_null_enum_field(self): + """Unset enum field should become null in Arrow.""" + s = EnumStruct(label="x") # color is unset + field_map = {"label": "label", "color": "color"} + batches = _run_to_batches([s], EnumStruct, field_map) + + batch = batches[0] + assert batch.column("label").to_pylist() == ["x"] + assert batch.column("color").to_pylist() == [None] + + def test_null_nested_struct_field(self): + """Unset nested struct field should become null in Arrow.""" + s = NestedStruct(id=1) # inner is unset + field_map = {"id": "id", "inner": "inner"} + batches = _run_to_batches([s], NestedStruct, field_map) + + batch = batches[0] + assert batch.column("id").to_pylist() == [1] + assert batch.column("inner").to_pylist() == [None] + + def test_null_ndarray_field(self): + """Unset NDArray field should produce null in both data and dims columns.""" + s = NDArrayStruct(id=1) # matrix is unset + field_map = {"id": "id", "matrix": "matrix"} + batches = _run_to_batches([s], NDArrayStruct, field_map) + + batch = batches[0] + assert batch.column("id").to_pylist() == [1] + assert batch.column("matrix").to_pylist() == [None] + assert batch.column("matrix_csp_dimensions").to_pylist() == [None] + + def test_empty_struct_vector_write(self): + """Writing a tick with a single row followed by an empty tick should produce zero-row batch.""" + field_map = {"x": "x", "y": "y"} + tick1 = [NumericOnlyStruct(x=1, y=1.0)] + tick2 = [NumericOnlyStruct(x=2, y=2.0)] # need a non-empty tick to validate structure + + # Verify single-row ticks work (empty list not expressible via csp.const) + all_results = _run_multi_tick_write([tick1, tick2], NumericOnlyStruct, field_map) + assert len(all_results) == 2 + assert all_results[0][0].num_rows == 1 + assert all_results[1][0].num_rows == 1 + + +# ===================================================================== +# Tests: struct_to_record_batches with field_map=None and numpy fields +# ===================================================================== + + +class TestWriteAutoDetectNumpy: + """Test struct_to_record_batches with field_map=None for structs with numpy fields.""" + + def test_no_field_map_numpy_1d(self): + """Auto-detect numpy 1D array fields when field_map is None.""" + structs = [ + NumpyStruct(id=1, values=np.array([1.0, 2.0, 3.0])), + NumpyStruct(id=2, values=np.array([4.0, 5.0])), + ] + batches = _run_to_batches(structs, NumpyStruct) + + assert len(batches) == 1 + batch = batches[0] + assert batch.num_rows == 2 + # Scalar fields auto-included + assert batch.column("id").to_pylist() == [1, 2] + # Numpy field auto-detected + vals = batch.column("values").to_pylist() + assert vals[0] == pytest.approx([1.0, 2.0, 3.0]) + assert vals[1] == pytest.approx([4.0, 5.0]) + + def test_no_field_map_ndarray(self): + """Auto-detect NDArray fields with dimension columns when field_map is None.""" + matrix = np.array([[1.0, 2.0], [3.0, 4.0]]) + structs = [NDArrayStruct(id=1, matrix=matrix)] + batches = _run_to_batches(structs, NDArrayStruct) + + assert len(batches) == 1 + batch = batches[0] + assert batch.num_rows == 1 + assert batch.column("id").to_pylist() == [1] + data_col = batch.column("matrix").to_pylist() + assert data_col[0] == pytest.approx([1.0, 2.0, 3.0, 4.0]) + dims_col = batch.column("matrix_csp_dimensions").to_pylist() + assert dims_col[0] == [2, 2] + + def test_no_field_map_mixed_scalar_numpy(self): + """Auto-detect with mixed scalar + numpy fields.""" + structs = [ + MixedStruct(label="a", scores=np.array([0.1, 0.2])), + ] + batches = _run_to_batches(structs, MixedStruct) + + assert len(batches) == 1 + batch = batches[0] + assert batch.column("label").to_pylist() == ["a"] + vals = batch.column("scores").to_pylist() + assert vals[0] == pytest.approx([0.1, 0.2]) + + +# ===================================================================== +# Tests: round-trip for all numpy element types +# ===================================================================== + + +class TestNumpyTypeRoundTrips: + """Round-trip tests for numpy 1D arrays with int, str, and bool element types.""" + + def test_numpy_int_round_trip(self): + structs = [NumpyIntStruct(id=1, values=np.array([10, 20, 30], dtype=np.int64))] + field_map = {"id": "id", "values": "values"} + schema = pa.schema([("id", pa.int64()), ("values", pa.list_(pa.int64()))]) + result = _run_round_trip(structs, NumpyIntStruct, field_map, schema) + assert len(result) == 1 + np.testing.assert_array_equal(result[0].values, [10, 20, 30]) + + def test_numpy_string_round_trip(self): + structs = [NumpyStringStruct(id=1, names=np.array(["alice", "bob"]))] + field_map = {"id": "id", "names": "names"} + schema = pa.schema([("id", pa.int64()), ("names", pa.list_(pa.utf8()))]) + result = _run_round_trip(structs, NumpyStringStruct, field_map, schema) + assert len(result) == 1 + np.testing.assert_array_equal(result[0].names, ["alice", "bob"]) + + def test_numpy_bool_round_trip(self): + structs = [NumpyBoolStruct(id=1, flags=np.array([True, False, True]))] + field_map = {"id": "id", "flags": "flags"} + schema = pa.schema([("id", pa.int64()), ("flags", pa.list_(pa.bool_()))]) + result = _run_round_trip(structs, NumpyBoolStruct, field_map, schema) + assert len(result) == 1 + np.testing.assert_array_equal(result[0].flags, [True, False, True]) + + +# ===================================================================== +# Tests: NDArray with int element type +# ===================================================================== + + +class TestNDArrayIntType: + """Test NDArray with int element type (read, write, round-trip).""" + + def test_ndarray_int_write(self): + matrix = np.array([[10, 20, 30], [40, 50, 60]], dtype=np.int64) + structs = [NDArrayIntStruct(id=1, matrix=matrix)] + field_map = {"id": "id", "matrix": "matrix"} + batches = _run_to_batches(structs, NDArrayIntStruct, field_map) + + batch = batches[0] + data_col = batch.column("matrix").to_pylist() + assert data_col[0] == [10, 20, 30, 40, 50, 60] + dims_col = batch.column("matrix_csp_dimensions").to_pylist() + assert dims_col[0] == [2, 3] + + def test_ndarray_int_read(self): + batch = pa.RecordBatch.from_pydict( + { + "id": [1], + "matrix": [[10, 20, 30, 40, 50, 60]], + "matrix_csp_dimensions": [[2, 3]], + }, + schema=pa.schema( + [ + ("id", pa.int64()), + ("matrix", pa.list_(pa.int64())), + ("matrix_csp_dimensions", pa.list_(pa.int64())), + ] + ), + ) + field_map = {"id": "id", "matrix": "matrix"} + structs = _run_to_struct(batch, NDArrayIntStruct, field_map, batch.schema) + + assert len(structs) == 1 + expected = np.array([[10, 20, 30], [40, 50, 60]], dtype=np.int64) + np.testing.assert_array_equal(structs[0].matrix, expected) + assert structs[0].matrix.shape == (2, 3) + + def test_ndarray_int_round_trip(self): + matrix = np.array([[10, 20], [30, 40]], dtype=np.int64) + structs = [NDArrayIntStruct(id=1, matrix=matrix)] + field_map = {"id": "id", "matrix": "matrix"} + schema = pa.schema( + [ + ("id", pa.int64()), + ("matrix", pa.list_(pa.int64())), + ("matrix_csp_dimensions", pa.list_(pa.int64())), + ] + ) + result = _run_round_trip(structs, NDArrayIntStruct, field_map, schema) + assert len(result) == 1 + np.testing.assert_array_equal(result[0].matrix, matrix) + assert result[0].matrix.shape == (2, 2) + + +# ===================================================================== +# Tests: mixed scalar + numpy + NDArray round-trip +# ===================================================================== + + +class TestFullMixedRoundTrip: + """Round-trip with a struct containing scalar, numpy 1D, and NDArray fields.""" + + def test_mixed_scalar_numpy_ndarray_round_trip(self): + matrix = np.array([[1.0, 2.0], [3.0, 4.0]]) + structs = [ + FullMixedStruct(label="a", scores=np.array([0.1, 0.2, 0.3]), matrix=matrix), + ] + field_map = {"label": "label", "scores": "scores", "matrix": "matrix"} + schema = pa.schema( + [ + ("label", pa.utf8()), + ("scores", pa.list_(pa.float64())), + ("matrix", pa.list_(pa.float64())), + ("matrix_csp_dimensions", pa.list_(pa.int64())), + ] + ) + result = _run_round_trip(structs, FullMixedStruct, field_map, schema) + assert len(result) == 1 + assert result[0].label == "a" + np.testing.assert_array_almost_equal(result[0].scores, [0.1, 0.2, 0.3]) + np.testing.assert_array_almost_equal(result[0].matrix, matrix) + assert result[0].matrix.shape == (2, 2) + + +# ===================================================================== +# Tests: reverse round-trip with null values +# ===================================================================== + + +class TestReverseRoundTripWithNulls: + """batch → struct → batch where the original batch contains null values.""" + + def test_scalar_nulls_reverse_round_trip(self): + """batch with nulls → struct → batch should preserve nulls.""" + original = pa.RecordBatch.from_pydict( + { + "i64": pa.array([1, None, 3]), + "f64": pa.array([1.1, 2.2, None]), + "s": pa.array([None, "b", "c"]), + "b": pa.array([True, None, False]), + } + ) + field_map = {"i64": "i64", "f64": "f64", "s": "s", "b": "b"} + result_batches = _run_reverse_round_trip(original, ScalarStruct, field_map) + + assert len(result_batches) == 1 + result = result_batches[0] + assert result.num_rows == 3 + assert result.column("i64").to_pylist() == [1, None, 3] + assert result.column("f64").to_pylist()[0] == pytest.approx(1.1) + assert result.column("f64").to_pylist()[2] is None + assert result.column("s").to_pylist() == [None, "b", "c"] + assert result.column("b").to_pylist() == [True, None, False] + + +# ===================================================================== +# Regression tests for specific bugs +# ===================================================================== + + +class TestNonContiguousArrayWrite: + """Regression: NativeListWriter must handle non-contiguous numpy arrays. + + Before the fix, PyArray_DATA + bulk AppendValues assumed C-contiguous memory + layout, silently producing wrong data for sliced or transposed arrays. + """ + + def test_sliced_1d_array(self): + """A sliced array (arr[::2]) is non-contiguous; round-trip must preserve values.""" + full = np.array([10.0, 20.0, 30.0, 40.0, 50.0, 60.0]) + sliced = full[::2] # [10, 30, 50], non-contiguous + assert not sliced.flags["C_CONTIGUOUS"] or sliced.strides[0] != sliced.itemsize + + structs = [NumpyStruct(id=1, values=sliced)] + field_map = {"id": "id", "values": "values"} + batches = _run_to_batches(structs, NumpyStruct, field_map) + + assert len(batches) == 1 + batch = batches[0] + schema = batch.schema + read_field_map = {"id": "id", "values": "values"} + result = _run_to_struct(batch, NumpyStruct, read_field_map, schema) + + assert len(result) == 1 + np.testing.assert_array_equal(result[0].values, [10.0, 30.0, 50.0]) + + def test_sliced_int_array(self): + """Non-contiguous int array round-trip.""" + full = np.array([1, 2, 3, 4, 5, 6], dtype=np.int64) + sliced = full[1::2] # [2, 4, 6] + + structs = [NumpyIntStruct(id=1, values=sliced)] + field_map = {"id": "id", "values": "values"} + batches = _run_to_batches(structs, NumpyIntStruct, field_map) + batch = batches[0] + + result = _run_to_struct(batch, NumpyIntStruct, {"id": "id", "values": "values"}, batch.schema) + np.testing.assert_array_equal(result[0].values, [2, 4, 6]) + + def test_transposed_ndarray(self): + """A transposed 2D array is non-contiguous (Fortran order); round-trip must match.""" + original = np.array([[1.0, 2.0], [3.0, 4.0]]) + transposed = original.T # [[1, 3], [2, 4]], Fortran-order + + structs = [NDArrayStruct(id=1, matrix=transposed)] + field_map = {"id": "id", "matrix": "matrix"} + batches = _run_to_batches(structs, NDArrayStruct, field_map) + batch = batches[0] + + schema = batch.schema + result = _run_to_struct(batch, NDArrayStruct, {"id": "id", "matrix": "matrix"}, schema) + + np.testing.assert_array_equal(result[0].matrix, transposed) + assert result[0].matrix.shape == (2, 2) + + def test_fortran_order_array(self): + """Explicitly Fortran-order (column-major) array must round-trip correctly.""" + c_array = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], order="C") + f_array = np.asfortranarray(c_array) + assert not f_array.flags["C_CONTIGUOUS"] + + structs = [NDArrayStruct(id=1, matrix=f_array)] + field_map = {"id": "id", "matrix": "matrix"} + batches = _run_to_batches(structs, NDArrayStruct, field_map) + batch = batches[0] + + result = _run_to_struct(batch, NDArrayStruct, {"id": "id", "matrix": "matrix"}, batch.schema) + np.testing.assert_array_equal(result[0].matrix, f_array) + assert result[0].matrix.shape == (2, 3) + + +class TestDateWriteRegression: + """Regression: DateWriter must compute days-since-epoch correctly. + + Ensures the optimized DateWriter (single timegm call) matches the expected + Arrow Date32 representation. + """ + + def test_unix_epoch_date(self): + """1970-01-01 should produce Date32 value of 0.""" + structs = [DateTimeStruct(dt=datetime(2020, 1, 1), td=timedelta(0), d=date(1970, 1, 1), t=time(0, 0, 0))] + field_map = {"d": "d"} + batches = _run_to_batches(structs, DateTimeStruct, field_map) + batch = batches[0] + assert batch.column("d").to_pylist() == [date(1970, 1, 1)] + + def test_known_dates(self): + """Verify several known dates produce correct Date32 values.""" + known = [ + date(1970, 1, 1), + date(2000, 1, 1), + date(2020, 6, 15), + date(2024, 2, 29), # leap day + ] + structs = [DateTimeStruct(dt=datetime(2020, 1, 1), td=timedelta(0), d=d, t=time(0, 0, 0)) for d in known] + field_map = {"d": "d"} + batches = _run_to_batches(structs, DateTimeStruct, field_map) + batch = batches[0] + result_dates = batch.column("d").to_pylist() + for expected, actual in zip(known, result_dates): + assert actual == expected, f"Expected {expected}, got {actual}" + + def test_date_round_trip(self): + """Write dates through arrow adapter and read back - values must match.""" + test_dates = [date(1970, 1, 1), date(1999, 12, 31), date(2025, 7, 4)] + structs = [DateTimeStruct(dt=datetime(2020, 1, 1), td=timedelta(0), d=d, t=time(0, 0, 0)) for d in test_dates] + field_map = {"d": "d"} + batches = _run_to_batches(structs, DateTimeStruct, field_map) + batch = batches[0] + schema = batch.schema + read_field_map = {"d": "d"} + + result = _run_to_struct(batch, DateTimeStruct, read_field_map, schema) + for i, expected_date in enumerate(test_dates): + assert result[i].d == expected_date