diff --git a/core/api/core.api b/core/api/core.api index 790ce8fa29..ffeec4d531 100644 --- a/core/api/core.api +++ b/core/api/core.api @@ -663,6 +663,7 @@ public final class org/jetbrains/kotlinx/dataframe/api/ChunkedKt { } public final class org/jetbrains/kotlinx/dataframe/api/CodeString { + public static final field Companion Lorg/jetbrains/kotlinx/dataframe/api/CodeString$Companion; public static final synthetic fun box-impl (Ljava/lang/String;)Lorg/jetbrains/kotlinx/dataframe/api/CodeString; public static fun constructor-impl (Ljava/lang/String;)Ljava/lang/String; public fun equals (Ljava/lang/Object;)Z @@ -676,6 +677,9 @@ public final class org/jetbrains/kotlinx/dataframe/api/CodeString { public final synthetic fun unbox-impl ()Ljava/lang/String; } +public final class org/jetbrains/kotlinx/dataframe/api/CodeString$Companion { +} + public abstract interface class org/jetbrains/kotlinx/dataframe/api/ColColumnsSelectionDsl { public fun col (Ljava/lang/String;)Lorg/jetbrains/kotlinx/dataframe/columns/ColumnAccessor; public fun col (Ljava/lang/String;I)Lorg/jetbrains/kotlinx/dataframe/columns/SingleColumn; @@ -2471,6 +2475,7 @@ public abstract interface class org/jetbrains/kotlinx/dataframe/api/GlobalParser public abstract fun getNulls ()Ljava/util/Set; public abstract fun getParseExperimentalInstant ()Z public abstract fun getParseExperimentalUuid ()Z + public abstract fun getParseToDataFrameReadSource ()Z public abstract fun getSkipTypes ()Ljava/util/Set; public abstract fun getUseFastDoubleParser ()Z public abstract fun resetToDefault ()V @@ -2478,6 +2483,7 @@ public abstract interface class org/jetbrains/kotlinx/dataframe/api/GlobalParser public abstract fun setLocale (Ljava/util/Locale;)V public abstract fun setParseExperimentalInstant (Z)V public abstract fun setParseExperimentalUuid (Z)V + public abstract fun setParseToDataFrameReadSource (Z)V public abstract fun setUseFastDoubleParser (Z)V } @@ -3618,18 +3624,19 @@ public final class 
org/jetbrains/kotlinx/dataframe/api/ParserOptions { public synthetic fun (Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;ILkotlin/jvm/internal/DefaultConstructorMarker;)V public synthetic fun (Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;Ljava/util/Set;Ljava/lang/Boolean;)V public synthetic fun (Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;Ljava/util/Set;Ljava/lang/Boolean;ILkotlin/jvm/internal/DefaultConstructorMarker;)V - public fun (Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;Ljava/util/Set;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;)V - public synthetic fun (Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;Ljava/util/Set;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;ILkotlin/jvm/internal/DefaultConstructorMarker;)V - public fun (Ljava/util/Locale;Lorg/jetbrains/kotlinx/dataframe/api/DateTimeParserOptions;Ljava/util/Set;Ljava/util/Set;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;)V - public synthetic fun (Ljava/util/Locale;Lorg/jetbrains/kotlinx/dataframe/api/DateTimeParserOptions;Ljava/util/Set;Ljava/util/Set;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;ILkotlin/jvm/internal/DefaultConstructorMarker;)V - public final fun copy (Ljava/util/Locale;Lorg/jetbrains/kotlinx/dataframe/api/DateTimeParserOptions;Ljava/lang/Iterable;Ljava/lang/Iterable;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;)Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions; - public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;Ljava/util/Locale;Lorg/jetbrains/kotlinx/dataframe/api/DateTimeParserOptions;Ljava/lang/Iterable;Ljava/lang/Iterable;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions; + public fun 
(Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;Ljava/util/Set;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;)V + public synthetic fun (Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;Ljava/util/Set;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;ILkotlin/jvm/internal/DefaultConstructorMarker;)V + public fun (Ljava/util/Locale;Lorg/jetbrains/kotlinx/dataframe/api/DateTimeParserOptions;Ljava/util/Set;Ljava/util/Set;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;)V + public synthetic fun (Ljava/util/Locale;Lorg/jetbrains/kotlinx/dataframe/api/DateTimeParserOptions;Ljava/util/Set;Ljava/util/Set;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;ILkotlin/jvm/internal/DefaultConstructorMarker;)V + public final fun copy (Ljava/util/Locale;Lorg/jetbrains/kotlinx/dataframe/api/DateTimeParserOptions;Ljava/lang/Iterable;Ljava/lang/Iterable;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;)Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions; + public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;Ljava/util/Locale;Lorg/jetbrains/kotlinx/dataframe/api/DateTimeParserOptions;Ljava/lang/Iterable;Ljava/lang/Iterable;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions; public fun equals (Ljava/lang/Object;)Z public final fun getDateTime ()Lorg/jetbrains/kotlinx/dataframe/api/DateTimeParserOptions; public final fun getLocale ()Ljava/util/Locale; public final fun getNullStrings ()Ljava/util/Set; public final fun getParseExperimentalInstant ()Ljava/lang/Boolean; public final fun getParseExperimentalUuid ()Ljava/lang/Boolean; + public final fun getParseToDataFrameReadSource ()Ljava/lang/Boolean; public final fun getSkipTypes 
()Ljava/util/Set; public final fun getUseFastDoubleParser ()Ljava/lang/Boolean; public fun hashCode ()I @@ -6028,6 +6035,37 @@ public final class org/jetbrains/kotlinx/dataframe/io/DataFrameHtmlData$Companio public static synthetic fun tableDefinitions$default (Lorg/jetbrains/kotlinx/dataframe/io/DataFrameHtmlData$Companion;ZZILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/DataFrameHtmlData; } +public abstract interface class org/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions { +} + +public abstract interface class org/jetbrains/kotlinx/dataframe/io/DataFrameReadSource { + public abstract fun acceptsSource (Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Z + public abstract fun getSupportedTypes ()Ljava/util/Set; + public abstract fun getTestOrder ()I + public abstract fun readDataFrameOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; + public static synthetic fun readDataFrameOrNull$default (Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadSource;Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; + public fun readDataFrameSchemaOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema; + public static synthetic fun readDataFrameSchemaOrNull$default (Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadSource;Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema; + public fun readDataSchemaCodeOrNull-myXLQ2E 
(Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Ljava/lang/String; + public static synthetic fun readDataSchemaCodeOrNull-myXLQ2E$default (Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadSource;Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;ILjava/lang/Object;)Ljava/lang/String; +} + +public final class org/jetbrains/kotlinx/dataframe/io/DataSourceInfo { + public fun (Lkotlin/reflect/KType;Ljava/lang/String;Ljava/lang/String;)V + public synthetic fun (Lkotlin/reflect/KType;Ljava/lang/String;Ljava/lang/String;ILkotlin/jvm/internal/DefaultConstructorMarker;)V + public final fun component1 ()Lkotlin/reflect/KType; + public final fun component2 ()Ljava/lang/String; + public final fun component3 ()Ljava/lang/String; + public final fun copy (Lkotlin/reflect/KType;Ljava/lang/String;Ljava/lang/String;)Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo; + public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lkotlin/reflect/KType;Ljava/lang/String;Ljava/lang/String;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo; + public fun equals (Ljava/lang/Object;)Z + public final fun getExtension ()Ljava/lang/String; + public final fun getKType ()Lkotlin/reflect/KType; + public final fun getMimeType ()Ljava/lang/String; + public fun hashCode ()I + public fun toString ()Ljava/lang/String; +} + public final class org/jetbrains/kotlinx/dataframe/io/DisplayConfiguration { public static final field Companion Lorg/jetbrains/kotlinx/dataframe/io/DisplayConfiguration$Companion; public synthetic fun (Ljava/lang/Integer;Ljava/lang/Integer;ILkotlin/jvm/functions/Function3;Ljava/lang/String;ZZZZZILkotlin/jvm/internal/DefaultConstructorMarker;)V @@ -6078,6 +6116,18 @@ public final class 
org/jetbrains/kotlinx/dataframe/io/DisplayConfiguration$Compa public final fun getDEFAULT ()Lorg/jetbrains/kotlinx/dataframe/io/DisplayConfiguration; } +public final class org/jetbrains/kotlinx/dataframe/io/Guess2Kt { + public static final fun getNewSupportedFormats ()Ljava/util/List; + public static final fun readSource (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/lang/Object;Lkotlin/reflect/KType;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;Ljava/util/List;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; + public static final fun readSource (Lorg/jetbrains/kotlinx/dataframe/DataRow$Companion;Ljava/lang/Object;Lkotlin/reflect/KType;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;Ljava/util/List;)Lorg/jetbrains/kotlinx/dataframe/DataRow; + public static final fun readSource (Lorg/jetbrains/kotlinx/dataframe/api/CodeString$Companion;Ljava/lang/Object;Lkotlin/reflect/KType;Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;Ljava/util/List;)Ljava/lang/String; + public static final fun readSource (Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema$Companion;Ljava/lang/Object;Lkotlin/reflect/KType;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;Ljava/util/List;)Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema; + public static synthetic fun readSource$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/lang/Object;Lkotlin/reflect/KType;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;Ljava/util/List;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; + public static synthetic fun readSource$default (Lorg/jetbrains/kotlinx/dataframe/DataRow$Companion;Ljava/lang/Object;Lkotlin/reflect/KType;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;Ljava/util/List;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataRow; + public static synthetic fun readSource$default 
(Lorg/jetbrains/kotlinx/dataframe/api/CodeString$Companion;Ljava/lang/Object;Lkotlin/reflect/KType;Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;Ljava/util/List;ILjava/lang/Object;)Ljava/lang/String; + public static synthetic fun readSource$default (Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema$Companion;Ljava/lang/Object;Lkotlin/reflect/KType;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;Ljava/util/List;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema; +} + public final class org/jetbrains/kotlinx/dataframe/io/GuessKt { public static final fun read (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/io/File;Ljava/util/List;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; public static final fun read (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/lang/String;Ljava/util/List;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; diff --git a/core/build.gradle.kts b/core/build.gradle.kts index 60e39459a5..07bdd2f173 100644 --- a/core/build.gradle.kts +++ b/core/build.gradle.kts @@ -59,6 +59,7 @@ dependencies { implementation(libs.commonsIo) implementation(libs.fastDoubleParser) + implementation(libs.tika) api(libs.kotlin.datetimeJvm) implementation(libs.kotlinpoet) @@ -76,7 +77,6 @@ dependencies { testImplementation(libs.kotlin.scriptingJvm) testImplementation(libs.jsoup) testImplementation(libs.sl4jsimple) - testImplementation(projects.dataframeJson) testImplementation(libs.serialization.core) testImplementation(libs.serialization.json) @@ -85,6 +85,13 @@ dependencies { // for samples.api testImplementation(projects.dataframeCsv) + testImplementation(projects.dataframeJson) + testImplementation(projects.dataframeArrow) + testImplementation(projects.dataframeExcel) +// testImplementation(projects.dataframeGeo) + testImplementation(projects.dataframeJdbc) + testImplementation(libs.h2db) + testImplementation(projects.dataframeOpenapiGenerator) } // Configure testJava16 dependencies to 
extend from test @@ -308,6 +315,8 @@ tasks.withType { tasks.test { maxHeapSize = "1g" + // Arrow's off-heap allocator needs deep reflection into java.nio. + jvmArgs("--add-opens", "java.base/java.nio=ALL-UNNAMED") } // Test task for Java 16+ language-specific tests diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/ParserOptions.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/ParserOptions.kt index b77de07bab..abad2f2631 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/ParserOptions.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/ParserOptions.kt @@ -291,6 +291,8 @@ public interface GlobalParserOptions { * @see [addJavaDateTimePattern] */ public var dateTimeLibrary: ParseDateTimeLibrary? + + public var parseToDataFrameReadSource: Boolean } /** @include [GlobalParserOptions] */ @@ -410,6 +412,7 @@ public class ParserOptions( public val useFastDoubleParser: Boolean? = null, public val parseExperimentalUuid: Boolean? = null, public val parseExperimentalInstant: Boolean? = null, + public val parseToDataFrameReadSource: Boolean? = null, ) { public fun copy( locale: Locale? = this.locale, @@ -419,6 +422,7 @@ public class ParserOptions( useFastDoubleParser: Boolean? = this.useFastDoubleParser, parseExperimentalUuid: Boolean? = this.parseExperimentalUuid, parseExperimentalInstant: Boolean? = this.parseExperimentalInstant, + parseToDataFrameReadSource: Boolean? 
= this.parseToDataFrameReadSource, ): ParserOptions = ParserOptions( locale = locale, @@ -428,6 +432,7 @@ public class ParserOptions( useFastDoubleParser = useFastDoubleParser, parseExperimentalUuid = parseExperimentalUuid, parseExperimentalInstant = parseExperimentalInstant, + parseToDataFrameReadSource = parseToDataFrameReadSource, ) override fun equals(other: Any?): Boolean { @@ -439,6 +444,7 @@ public class ParserOptions( if (useFastDoubleParser != other.useFastDoubleParser) return false if (parseExperimentalUuid != other.parseExperimentalUuid) return false if (parseExperimentalInstant != other.parseExperimentalInstant) return false + if (parseToDataFrameReadSource != other.parseToDataFrameReadSource) return false if (locale != other.locale) return false if (dateTime != other.dateTime) return false if (nullStrings != other.nullStrings) return false @@ -451,6 +457,7 @@ public class ParserOptions( var result = useFastDoubleParser?.hashCode() ?: 0 result = 31 * result + (parseExperimentalUuid?.hashCode() ?: 0) result = 31 * result + (parseExperimentalInstant?.hashCode() ?: 0) + result = 31 * result + (parseToDataFrameReadSource?.hashCode() ?: 0) result = 31 * result + (locale?.hashCode() ?: 0) result = 31 * result + (dateTime?.hashCode() ?: 0) result = 31 * result + (nullStrings?.hashCode() ?: 0) @@ -459,7 +466,7 @@ public class ParserOptions( } override fun toString(): String = - "ParserOptions(locale=$locale, dateTimeParserOptions=$dateTime, nullStrings=$nullStrings, skipTypes=$skipTypes, useFastDoubleParser=$useFastDoubleParser, parseExperimentalUuid=$parseExperimentalUuid, parseExperimentalInstant=$parseExperimentalInstant)" + "ParserOptions(locale=$locale, dateTimeParserOptions=$dateTime, nullStrings=$nullStrings, skipTypes=$skipTypes, useFastDoubleParser=$useFastDoubleParser, parseExperimentalUuid=$parseExperimentalUuid, parseExperimentalInstant=$parseExperimentalInstant, 
parseToDataFrameReadSource=$parseToDataFrameReadSource)" // region deprecated constructors @@ -518,6 +525,7 @@ public class ParserOptions( useFastDoubleParser: Boolean? = null, parseExperimentalUuid: Boolean? = null, parseExperimentalInstant: Boolean? = null, + parseToDataFrameReadSource: Boolean? = null, ) : this( locale = locale, dateTime = 0.run { @@ -535,6 +543,7 @@ public class ParserOptions( useFastDoubleParser = useFastDoubleParser, parseExperimentalUuid = parseExperimentalUuid, parseExperimentalInstant = parseExperimentalInstant, + parseToDataFrameReadSource = parseToDataFrameReadSource, ) // endregion } diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/generateCode.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/generateCode.kt index 5137cf200c..4ad31062af 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/generateCode.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/generateCode.kt @@ -297,6 +297,8 @@ public val NameNormalizer.Companion.default: NameNormalizer get() = NameNormaliz @RequiredByIntellijPlugin public value class CodeString(public val value: String) { override fun toString(): String = value + + public companion object } @PublishedApi diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/Utils.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/Utils.kt index 070da0b884..467c6c7635 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/Utils.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/Utils.kt @@ -511,3 +511,19 @@ internal val KCallable<*>.columnName: String is KProperty<*> -> columnName else -> findAnnotation()?.name ?: getterName } + +/** + * Similar to [Result.map], but allows a new [Result] to be returned if [this] was successful. + * This result will be unpacked (so you won't get `Result>`). 
+ */ +@PublishedApi +internal inline fun Result.flatMap(function: (T) -> Result): Result = + fold( + onSuccess = { function(it) }, + onFailure = { Result.failure(it) }, + ) + +internal fun Result>.flatten(): Result = this.flatMap { it } + +internal fun T?.toResult(exception: Throwable = NullPointerException()) = + this?.let { Result.success(it) } ?: Result.failure(exception) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convert.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convert.kt index fbaafd0370..29a4f452f8 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convert.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convert.kt @@ -22,6 +22,7 @@ import kotlinx.datetime.toStdlibInstant import org.jetbrains.kotlinx.dataframe.AnyCol import org.jetbrains.kotlinx.dataframe.DataColumn import org.jetbrains.kotlinx.dataframe.DataFrame +import org.jetbrains.kotlinx.dataframe.DataRow import org.jetbrains.kotlinx.dataframe.RowColumnExpression import org.jetbrains.kotlinx.dataframe.RowValueExpression import org.jetbrains.kotlinx.dataframe.api.Convert @@ -30,7 +31,6 @@ import org.jetbrains.kotlinx.dataframe.api.Infer import org.jetbrains.kotlinx.dataframe.api.ParserOptions import org.jetbrains.kotlinx.dataframe.api.asColumn import org.jetbrains.kotlinx.dataframe.api.isValueColumn -import org.jetbrains.kotlinx.dataframe.api.mapIndexed import org.jetbrains.kotlinx.dataframe.api.name import org.jetbrains.kotlinx.dataframe.columns.values import org.jetbrains.kotlinx.dataframe.dataTypes.IFRAME @@ -41,13 +41,14 @@ import org.jetbrains.kotlinx.dataframe.exceptions.TypeConversionException import org.jetbrains.kotlinx.dataframe.exceptions.TypeConverterNotFoundException import org.jetbrains.kotlinx.dataframe.impl.columns.newColumn import org.jetbrains.kotlinx.dataframe.impl.createStarProjectedType -import org.jetbrains.kotlinx.dataframe.impl.isSubtypeWithNullabilityOf +import 
org.jetbrains.kotlinx.dataframe.io.dataFrameReadSourceByType +import org.jetbrains.kotlinx.dataframe.io.readSource import org.jetbrains.kotlinx.dataframe.path +import org.jetbrains.kotlinx.dataframe.schema.DataFrameSchema import org.jetbrains.kotlinx.dataframe.type import java.math.BigDecimal import java.math.BigInteger import java.net.URL -import java.util.Locale import kotlin.math.roundToInt import kotlin.math.roundToLong import kotlin.reflect.KType @@ -197,16 +198,53 @@ private enum class DummyEnum @Suppress("UNCHECKED_CAST") internal fun createConverter(from: KType, to: KType, options: ParserOptions? = null): TypeConverter? { - if (from.arguments.isNotEmpty() || to.arguments.isNotEmpty()) return null if (from.isMarkedNullable) { val res = createConverter(from.withNullability(false), to, options) ?: return null return { res(it) } } val fromClass = from.jvmErasure val toClass = to.jvmErasure + + // early exit when we encounter types with generics (except DataFrame and DataRow), which we don't support + if (from.arguments.isNotEmpty() || + (to.arguments.isNotEmpty() && toClass !in setOf(DataFrame::class, DataRow::class)) + ) { + return null + } + + val fromTypeInDfReadSources = + dataFrameReadSourceByType.keys.any { from.isSubtypeOf(it) } || from == typeOf() + return when { fromClass == toClass -> TypeConverterIdentity + fromTypeInDfReadSources && toClass == DataFrame::class -> + convert { source -> + DataFrame.readSource( + source = source, + type = from, + readOptions = null, + ) + } + + fromTypeInDfReadSources && toClass == DataRow::class -> + convert { source -> + DataRow.readSource( + source = source, + type = from, + readOptions = null, + ) + } + + fromTypeInDfReadSources && toClass == DataFrameSchema::class -> + convert { source -> + DataFrameSchema.readSource( + source = source, + type = from, + readOptions = null, + ) + } + // kotlin.time.Duration is a value class, // so it must be handled before the generic toClass.isValue / fromClass.isValue branches. 
toClass == Duration::class -> when (fromClass) { @@ -231,7 +269,7 @@ internal fun createConverter(from: KType, to: KType, options: ParserOptions? = n val underlyingType = constructor.parameters.single().type val converter = getConverter(from, underlyingType) ?: throw TypeConverterNotFoundException(from, underlyingType, null) - return convert { + convert { val converted = converter(it) if (converted == null && !underlyingType.isMarkedNullable) { throw TypeConversionException(it, from, underlyingType, null) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt index 2d654e6024..ee87240081 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt @@ -39,6 +39,7 @@ import org.jetbrains.kotlinx.dataframe.api.isFrameColumn import org.jetbrains.kotlinx.dataframe.api.isSubtypeOf import org.jetbrains.kotlinx.dataframe.api.map import org.jetbrains.kotlinx.dataframe.api.parser +import org.jetbrains.kotlinx.dataframe.api.singleOrNull import org.jetbrains.kotlinx.dataframe.columns.TypeSuggestion import org.jetbrains.kotlinx.dataframe.columns.size import org.jetbrains.kotlinx.dataframe.exceptions.TypeConversionException @@ -47,10 +48,15 @@ import org.jetbrains.kotlinx.dataframe.impl.LazyMap import org.jetbrains.kotlinx.dataframe.impl.api.Parsers.resetToDefault import org.jetbrains.kotlinx.dataframe.impl.catchSilent import org.jetbrains.kotlinx.dataframe.impl.createStarProjectedType +import org.jetbrains.kotlinx.dataframe.impl.flatMap import org.jetbrains.kotlinx.dataframe.impl.io.FastDoubleParser import org.jetbrains.kotlinx.dataframe.impl.javaDurationCanParse import org.jetbrains.kotlinx.dataframe.impl.lazyMapOf +import org.jetbrains.kotlinx.dataframe.impl.toResult +import org.jetbrains.kotlinx.dataframe.io.DataFrameReadSource import org.jetbrains.kotlinx.dataframe.io.isUrl 
+import org.jetbrains.kotlinx.dataframe.io.dataframeReadSources +import org.jetbrains.kotlinx.dataframe.io.readSourceImpl import org.jetbrains.kotlinx.dataframe.values import java.math.BigDecimal import java.math.BigInteger @@ -334,6 +340,8 @@ internal object Parsers : GlobalParserOptions { override var dateTimeLibrary: ParseDateTimeLibrary? = null + override var parseToDataFrameReadSource by Delegates.notNull() + override fun resetToDefault() { customGlobalJavaFormatters.values.forEach { it.clear() } nullStrings.clear() @@ -343,6 +351,8 @@ internal object Parsers : GlobalParserOptions { useFastDoubleParser = true parseExperimentalUuid = false parseExperimentalInstant = true + // disabled by default, because it can be very heavy + parseToDataFrameReadSource = false _locale = null dateTimeLibrary = null nullStrings.addAll(listOf("null", "NULL", "NA", "N/A")) @@ -941,40 +951,77 @@ internal object Parsers : GlobalParserOptions { stringParser { it.toBigIntegerOrNull() }, // BigDecimal stringParser { it.toBigDecimalOrNull() }, - // JSON array as DataFrame<*> - stringParser(catch = true) { - val trimmed = it.trim() - if (trimmed.startsWith("[") && trimmed.endsWith("]")) { - if (readJsonStrAnyFrame == null) { - logger.warn { - "parse() encountered a string that looks like a JSON array, but the dataframe-json dependency was not detected. Skipping for now." 
- } - null - } else { - readJsonStrAnyFrame!!(trimmed) + // Char + stringParser { it.singleOrNull() }, + stringParserWithOptions { options, isConverter -> + if (options?.parseToDataFrameReadSource ?: this.parseToDataFrameReadSource) { + parseBy { + readSourceImpl( + source = it, + sourceType = typeOf(), + readOptions = null, + formats = dataframeReadSources, + resultKind = "DataRow", + doStringToUrlConversion = isConverter, + read = { source, sourceInfo, options -> + readDataFrame(source, sourceInfo, options) + .flatMap { it.singleOrNull().toResult() } + }, + ).getOrNull() } } else { - null + SKIP_PARSER } }, - // JSON object as DataRow<*> - stringParser(catch = true) { - val trimmed = it.trim() - if (trimmed.startsWith("{") && trimmed.endsWith("}")) { - if (readJsonStrAnyRow == null) { - logger.warn { - "parse() encountered a string that looks like a JSON object, but the dataframe-json dependency was not detected. Skipping for now." - } - null - } else { - readJsonStrAnyRow!!(trimmed) + stringParserWithOptions { options, isConverter -> + if (options?.parseToDataFrameReadSource ?: this.parseToDataFrameReadSource) { + parseBy { + readSourceImpl( + source = it, + sourceType = typeOf(), + readOptions = null, + formats = dataframeReadSources, + resultKind = "DataFrame", + doStringToUrlConversion = isConverter, + read = DataFrameReadSource::readDataFrame, + ).getOrNull() } } else { - null + SKIP_PARSER } }, - // Char - stringParser { it.singleOrNull() }, +// // JSON array as DataFrame<*> +// stringParser(catch = true) { +// val trimmed = it.trim() +// if (trimmed.startsWith("[") && trimmed.endsWith("]")) { +// if (readJsonStrAnyFrame == null) { +// logger.warn { +// "parse() encountered a string that looks like a JSON array, but the dataframe-json dependency was not detected. Skipping for now." 
+// } +// null +// } else { +// readJsonStrAnyFrame!!(trimmed) +// } +// } else { +// null +// } +// }, +// // JSON object as DataRow<*> +// stringParser(catch = true) { +// val trimmed = it.trim() +// if (trimmed.startsWith("{") && trimmed.endsWith("}")) { +// if (readJsonStrAnyRow == null) { +// logger.warn { +// "parse() encountered a string that looks like a JSON object, but the dataframe-json dependency was not detected. Skipping for now." +// } +// null +// } else { +// readJsonStrAnyRow!!(trimmed) +// } +// } else { +// null +// } +// }, // No parser found, return as String // must be last in the list of parsers to return original unparsed string stringParser { it }, diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/guess2.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/guess2.kt new file mode 100644 index 0000000000..161a98fab6 --- /dev/null +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/guess2.kt @@ -0,0 +1,604 @@ +package org.jetbrains.kotlinx.dataframe.io + +import org.apache.tika.detect.DefaultDetector +import org.apache.tika.io.TikaInputStream +import org.apache.tika.metadata.Metadata +import org.apache.tika.metadata.TikaCoreProperties +import org.apache.tika.mime.MediaType +import org.jetbrains.kotlinx.dataframe.AnyFrame +import org.jetbrains.kotlinx.dataframe.AnyRow +import org.jetbrains.kotlinx.dataframe.DataFrame +import org.jetbrains.kotlinx.dataframe.DataRow +import org.jetbrains.kotlinx.dataframe.api.CodeString +import org.jetbrains.kotlinx.dataframe.api.generateInterfaces +import org.jetbrains.kotlinx.dataframe.api.schema +import org.jetbrains.kotlinx.dataframe.api.single +import org.jetbrains.kotlinx.dataframe.schema.DataFrameSchema +import java.io.ByteArrayInputStream +import java.io.File +import java.io.FileNotFoundException +import java.io.IOException +import java.io.InputStream +import java.net.URI +import java.net.URL +import java.nio.file.Path +import java.util.ServiceLoader +import 
kotlin.io.path.Path +import kotlin.io.path.exists +import kotlin.io.path.extension +import kotlin.io.path.isRegularFile +import kotlin.io.path.name +import kotlin.reflect.KType +import kotlin.reflect.full.withNullability +import kotlin.reflect.typeOf + +public sealed interface DataFrameIO { + // `DataFrame.Companion.read/write` methods use this to sort the list of all supported formats in ascending order (-1, 2, 10). + // The sorted list is used to test whether any format can read/write the given input. + public val testOrder: Int +} + +public interface DataFrameReadOptions { + public companion object; +} + +public interface DataFrameReadSource : DataFrameIO { + /** + * The set of source [KType]s this format knows how to read. The framework indexes the available formats by + * these types, and [acceptsSource] implementations should consult this set as well, + * so that adding a new supported type only requires updating this set. + * + * Note: a `String` *reference* (path/URL) is normalized to a [URL] by `readSourceImpl` before any format + * is invoked, so only include `String` here when raw text content is a legitimate input (e.g., JSON/CSV + * text). For binary formats, leave `String` out. + */ + public val supportedReadingTypes: Set + + public fun readDataFrame( + source: Any, + sourceInfo: DataSourceInfo, + options: DataFrameReadOptions? = null, + ): Result> + + /** + * Read just the [DataFrameSchema] for [source]. + * + * The default implementation reads the full DataFrame and calls [DataFrame.schema]. Override when the + * source format can introspect types without materializing rows (e.g., JDBC metadata queries, Parquet/Arrow + * file footers, OpenAPI specs). + */ + public fun readDataFrameSchema( + source: Any, + sourceInfo: DataSourceInfo, + options: DataFrameReadOptions? 
= null, + ): Result = readDataFrame(source, sourceInfo, options).map { it.schema() } + + public fun readDataSchemaCode( + source: Any, + sourceInfo: DataSourceInfo, + name: String, + options: DataFrameReadOptions? = null, + ): Result = readDataFrameSchema(source, sourceInfo, options).map { it.generateInterfaces(name) } + + public fun acceptsSource(sourceInfo: DataSourceInfo, options: DataFrameReadOptions?): Boolean +} + +internal typealias DataFrameReadSourceFunction = + DataFrameReadSource.( + source: Any, + sourceInfo: DataSourceInfo, + options: DataFrameReadOptions?, + ) -> Result + +public interface DataFrameWriteOptions { + public companion object; +} + +public interface DataFrameWriteTarget : DataFrameIO { + public val supportedWritingTypes: Set + + public fun acceptsTarget(sourceInfo: DataSourceInfo, options: DataFrameWriteOptions?): Boolean + + public fun writeDataFrame( + dataFrame: DataFrame<*>, + target: Any, + targetInfo: DataSourceInfo, + options: DataFrameWriteOptions? = null, + ): Result + + public fun writeDataRow( + dataRow: DataRow<*>, + target: Any, + targetInfo: DataSourceInfo, + options: DataFrameWriteOptions? = null, + ): Result +} + +internal typealias DataFrameWriteTargetFunction = + DataFrameWriteTarget.( + dataFrameLike: T, + target: Any, + targetInfo: DataSourceInfo, + options: DataFrameWriteOptions?, + ) -> Result + +/** + * Description of a source passed to [DataFrameReadSource]. Carries the static [kType] of the value and + * optional [extension]/[mimeType] hints, both of which may be `null` when the source is in-memory content + * with no reasonable file-extension/MIME interpretation (e.g., a raw [String], [InputStream], [java.sql.Connection], + * etc.). + */ +public data class DataSourceInfo( + public val kType: KType, + public val extension: String? = null, + public val mimeType: String? 
= null, +) + +@PublishedApi +internal val dataFrameIO: List by lazy { + ( + ServiceLoader.load(DataFrameIO::class.java).toList() + + ServiceLoader.load(DataFrameReadSource::class.java).toList() + + ServiceLoader.load(DataFrameWriteTarget::class.java).toList() + ).distinct() + .sortedBy { it.testOrder } +} + +/** + * NOTE: Needs to have fully qualified name in + * resources/META-INF/services/org.jetbrains.kotlinx.dataframe.io.DataFrameReadSource + * to be detected here. + */ +@PublishedApi +internal val dataframeReadSources: List by lazy { + dataFrameIO.filterIsInstance() +} + +@PublishedApi +internal val dataframeWriteTargets: List by lazy { + dataFrameIO.filterIsInstance() +} + +internal val dataFrameReadSourceByType: Map> by lazy { + buildMap> { + dataframeReadSources.forEach { format -> + format.supportedReadingTypes.forEach { type -> + getOrPut(type) { mutableListOf() }.let { + if (format !in it) it += format + } + } + } + values.forEach { + it.sortBy { it.testOrder } + } + } +} + +internal val dataframeWriteTargetByType: Map> by lazy { + buildMap> { + dataframeWriteTargets.forEach { format -> + format.supportedWritingTypes.forEach { type -> + getOrPut(type) { mutableListOf() }.let { + if (format !in it) it += format + } + } + } + values.forEach { + it.sortBy { it.testOrder } + } + } +} + +/** + * Shared dispatch loop for each [readSource]: handles String→URL + * normalization, InputStream buffering, sorted iteration, and error aggregation. The per-format read + * operation is supplied as [read]; [resultKind] is used only in the "unknown source" error message. + * + * @param [read] [DataFrameReadSource.readDataFrame] or [DataFrameReadSource.readDataFrameSchema] + * Potentially, this could also return another type, like a GeoDataFrame. 
+ *
+ * @return the first successful per-format result; a failure carrying the [FileNotFoundException] as soon as
+ *   one format reports it; otherwise an [IllegalArgumentException] failure that lists every attempted format
+ *   and has the individual errors attached as suppressed exceptions.
+ */
+internal fun <T> readSourceImpl(
+    source: Any,
+    sourceType: KType,
+    readOptions: DataFrameReadOptions?,
+    formats: List<DataFrameReadSource>,
+    resultKind: String,
+    doStringToUrlConversion: Boolean,
+    read: DataFrameReadSourceFunction<T>,
+): Result<T> {
+    // A String that resolves to a URL or an existing file is re-dispatched as a URL;
+    // any other String falls through and is handled as in-memory content.
+    if (doStringToUrlConversion && sourceType == typeOf<String>()) {
+        val url = asUrlOrNull(source as String)
+        if (url != null) {
+            return readSourceImpl(
+                source = url,
+                sourceType = typeOf<URL>(),
+                readOptions = readOptions,
+                formats = formats,
+                resultKind = resultKind,
+                doStringToUrlConversion = true,
+                read = read,
+            )
+        }
+    }
+
+    // Some sources can only be read once, like InputStreams, so we need to buffer them
+    var bufferedBytes: ByteArray? = null
+
+    // Hands out a fresh, independently readable view of the source on every call.
+    fun getSource(): Any =
+        when (source) {
+            is InputStream -> {
+                val bytes = bufferedBytes ?: source.readBytes().also { bufferedBytes = it }
+                ByteArrayInputStream(bytes)
+            }
+
+            else -> source
+        }
+
+    val sourceInfo = DataSourceInfo(
+        kType = sourceType,
+        extension = getSource().extensionOrNull(),
+        mimeType = getSource().mimeTypeOrNull(),
+    )
+
+    // Failure per attempted format, keyed by the format's class name.
+    val tries = mutableMapOf<String, Throwable>()
+    for (format in formats.sortedBy { it.testOrder }) {
+        if (!format.acceptsSource(sourceInfo, readOptions)) continue
+
+        val attempt = format.read(getSource(), sourceInfo, readOptions)
+        if (attempt.isSuccess) return attempt
+
+        val error = attempt.exceptionOrNull() ?: continue
+        // fail early. File not found means the reference is broken.
+        if (error is FileNotFoundException) return Result.failure(exception = error)
+        // simpleName is null for anonymous/local classes, so fall back to the class's toString()
+        tries[format::class.simpleName ?: format::class.toString()] = error
+    }
+
+    val failure = IllegalArgumentException("Unknown $resultKind source $source, $sourceInfo; Tried $tries")
+    // Keep the individual stack traces reachable instead of only their toString() in the message.
+    tries.values.forEach { failure.addSuppressed(it) }
+    return Result.failure(exception = failure)
+}
+
+/**
+ * Shared dispatch for the `write` entry points: picks the single format in [formats] that accepts the
+ * target and delegates to [write].
+ *
+ * Unlike reading, writing does not try formats in turn: it fails with an [IllegalStateException] when no
+ * format, or more than one format, accepts the target. [sourceKind] is used only in those error messages.
+ */
+internal fun <T> writeTargetImpl(
+    source: T,
+    target: Any,
+    targetType: KType,
+    writeOptions: DataFrameWriteOptions?,
+    formats: List<DataFrameWriteTarget>,
+    sourceKind: String,
+    doStringToPathConversion: Boolean,
+    write: DataFrameWriteTargetFunction<T>,
+): Result<Unit> {
+    if (doStringToPathConversion && targetType == typeOf<String>()) {
+        // A String that is not a syntactically valid path is simply not converted.
+        val path = try {
+            Path(target as String)
+        } catch (_: java.nio.file.InvalidPathException) {
+            null
+        }
+        // NOTE(review): only an already existing regular file is re-dispatched as a Path, so a String naming
+        // a not-yet-existing output file stays a String target. Confirm that this is intended.
+        if (path != null && path.exists() && path.isRegularFile()) {
+            return writeTargetImpl(
+                source = source,
+                target = path,
+                targetType = typeOf<Path>(),
+                writeOptions = writeOptions,
+                formats = formats,
+                sourceKind = sourceKind,
+                doStringToPathConversion = true,
+                write = write,
+            )
+        }
+    }
+
+    val targetInfo = DataSourceInfo(
+        kType = targetType,
+        extension = target.extensionOrNull(),
+        mimeType = target.mimeTypeOrNull(),
+    )
+
+    val candidates = formats
+        .sortedBy { it.testOrder }
+        .filter { it.acceptsTarget(targetInfo, writeOptions) }
+
+    if (candidates.isEmpty()) {
+        return Result.failure(
+            IllegalStateException(
+                "Failed to find a suitable format for writing $sourceKind to target: $target, $targetInfo",
+            ),
+        )
+    }
+    if (candidates.size > 1) {
+        return Result.failure(
+            IllegalStateException(
+                "Multiple formats found for writing $sourceKind to target: $target, $targetInfo; ${
+                    candidates.map {
+                        it::class.simpleName
+                    }
+                } . Please specify a `DataFrameWriteOptions` explicitly.",
+            ),
+        )
+    }
+    val format = candidates.single()
+    return format.write(source, target, targetInfo, writeOptions)
+}
+
+/**
+ * Unified entry point for the [DataFrameReadSource] framework: passes [source] through every registered
+ * format until one reads it.
+ *
+ * For a [String] that points to an existing file or a recognized URL (`http://`, `https://`, `ftp://`),
+ * the source is normalized to a [URL] so the file-extension hint can be used to disambiguate formats.
Any
+ * other [String] is treated as in-memory content (raw JSON/CSV/etc.).
+ *
+ * Named [readSource] rather than `read` to avoid shadowing the legacy `DataFrame.read(File/URL/Path/String, header)`
+ * entries in `guess.kt` that use the older [SupportedDataFrameFormat] system. Once the legacy entries are
+ * retired, this can be renamed to `read`.
+ */
+public fun DataFrame.Companion.readSource(
+    source: Any,
+    type: KType,
+    readOptions: DataFrameReadOptions? = null,
+    formats: List<DataFrameReadSource> = dataframeReadSources,
+): AnyFrame =
+    readSourceImpl(
+        source = source,
+        // the static type may be declared nullable; dispatch compares against the non-null type
+        sourceType = type.withNullability(false),
+        readOptions = readOptions,
+        formats = formats,
+        resultKind = "DataFrame",
+        doStringToUrlConversion = true,
+        read = DataFrameReadSource::readDataFrame,
+    ).getOrThrow()
+
+/** Reified convenience overload: captures the static type of [source] via [typeOf]. */
+public inline fun <reified R : Any> DataRow.Companion.readSource(
+    source: R,
+    readOptions: DataFrameReadOptions? = null,
+    formats: List<DataFrameReadSource> = dataframeReadSources,
+): AnyRow = readSource(source = source, type = typeOf<R>(), readOptions = readOptions, formats = formats)
+
+/**
+ * Single-row counterpart of [DataFrame.Companion.readSource]: each format reads a DataFrame and the result
+ * is narrowed with `single()`. A format whose DataFrame does not contain exactly one row counts as a failed
+ * attempt, so the dispatch moves on to the next format.
+ */
+public fun DataRow.Companion.readSource(
+    source: Any,
+    type: KType,
+    readOptions: DataFrameReadOptions? = null,
+    formats: List<DataFrameReadSource> = dataframeReadSources,
+): AnyRow =
+    readSourceImpl(
+        source = source,
+        sourceType = type.withNullability(false),
+        readOptions = readOptions,
+        formats = formats,
+        resultKind = "DataRow",
+        doStringToUrlConversion = true,
+        // mapCatching turns the exception thrown by single() into a failed Result
+        read = { src, info, opts ->
+            readDataFrame(src, info, opts).mapCatching { it.single() }
+        },
+    ).getOrThrow()
+
+/** Reified convenience overload: captures the static type of [source] via [typeOf]. */
+public inline fun <reified R : Any> DataFrame.Companion.readSource(
+    source: R,
+    readOptions: DataFrameReadOptions? = null,
+    formats: List<DataFrameReadSource> = dataframeReadSources,
+): AnyFrame =
+    readSource(
+        source = source,
+        type = typeOf<R>(),
+        readOptions = readOptions,
+        formats = formats,
+    )
+
+/**
+ * Schema-only counterpart of [DataFrame.Companion.readSource]: dispatches through every registered
+ * [DataFrameReadSource] and returns the resulting [DataFrameSchema] without materializing rows when the
+ * format supports it (e.g., JDBC). Formats with no fast schema path fall back to reading the full DataFrame
+ * and calling [DataFrame.schema].
+ */
+public fun DataFrameSchema.Companion.readSource(
+    source: Any,
+    type: KType,
+    readOptions: DataFrameReadOptions? = null,
+    formats: List<DataFrameReadSource> = dataframeReadSources,
+): DataFrameSchema =
+    readSourceImpl(
+        source = source,
+        sourceType = type.withNullability(false),
+        readOptions = readOptions,
+        formats = formats,
+        resultKind = "DataFrameSchema",
+        doStringToUrlConversion = true,
+        read = DataFrameReadSource::readDataFrameSchema,
+    ).getOrThrow()
+
+/** Reified convenience overload: captures the static type of [source] via [typeOf]. */
+public inline fun <reified R : Any> DataFrameSchema.Companion.readSource(
+    source: R,
+    readOptions: DataFrameReadOptions? = null,
+    formats: List<DataFrameReadSource> = dataframeReadSources,
+): DataFrameSchema =
+    readSource(
+        source = source,
+        type = typeOf<R>(),
+        readOptions = readOptions,
+        formats = formats,
+    )
+
+/**
+ * Code-generation counterpart of [DataFrame.Companion.readSource]: dispatches through every registered
+ * [DataFrameReadSource] and returns a [CodeString] containing the generated `@DataSchema` interface
+ * declarations (plus enums/typealiases for formats like OpenAPI). The [name] is the marker name used for
+ * the top-level generated interface.
+ *
+ * The default implementation in [DataFrameReadSource.readDataSchemaCode] runs
+ * [DataFrameSchema.generateInterfaces] on the format's [DataFrameReadSource.readDataFrameSchema]
+ * result; formats that produce richer code (OpenAPI markers, enums, typealiases) override the method
+ * directly.
+ */
+public fun CodeString.Companion.readSource(
+    source: Any,
+    type: KType,
+    name: String,
+    readOptions: DataFrameReadOptions? = null,
+    formats: List<DataFrameReadSource> = dataframeReadSources,
+): CodeString =
+    readSourceImpl(
+        source = source,
+        sourceType = type.withNullability(false),
+        readOptions = readOptions,
+        formats = formats,
+        resultKind = "CodeString",
+        doStringToUrlConversion = true,
+        read = { src, info, opts ->
+            readDataSchemaCode(src, info, name, opts)
+        },
+    ).getOrThrow()
+
+/** Reified convenience overload: captures the static type of [source] via [typeOf]. */
+public inline fun <reified R : Any> CodeString.Companion.readSource(
+    source: R,
+    name: String,
+    readOptions: DataFrameReadOptions? = null,
+    formats: List<DataFrameReadSource> = dataframeReadSources,
+): CodeString =
+    readSource(
+        source = source,
+        type = typeOf<R>(),
+        name = name,
+        readOptions = readOptions,
+        formats = formats,
+    )
+
+/**
+ * Writes this DataFrame to [target] using the single format in [formats] that accepts it.
+ *
+ * Throws the underlying failure when no format, or more than one format, accepts the target, or when the
+ * chosen format fails to write.
+ */
+public fun DataFrame<*>.write(
+    target: Any,
+    type: KType,
+    writeOptions: DataFrameWriteOptions? = null,
+    formats: List<DataFrameWriteTarget> = dataframeWriteTargets,
+) {
+    writeTargetImpl(
+        source = this,
+        target = target,
+        targetType = type.withNullability(false),
+        writeOptions = writeOptions,
+        formats = formats,
+        sourceKind = "DataFrame",
+        doStringToPathConversion = true,
+        write = DataFrameWriteTarget::writeDataFrame,
+    ).getOrThrow()
+}
+
+/** Reified convenience overload: captures the static type of [target] via [typeOf]. */
+public inline fun <reified W : Any> DataFrame<*>.write(
+    target: W,
+    writeOptions: DataFrameWriteOptions? = null,
+    formats: List<DataFrameWriteTarget> = dataframeWriteTargets,
+): Unit =
+    write(
+        target = target,
+        type = typeOf<W>(),
+        writeOptions = writeOptions,
+        formats = formats,
+    )
+
+/**
+ * Writes this DataRow to [target] using the single format in [formats] that accepts it.
+ *
+ * Throws the underlying failure when no format, or more than one format, accepts the target, or when the
+ * chosen format fails to write.
+ */
+public fun DataRow<*>.write(
+    target: Any,
+    type: KType,
+    writeOptions: DataFrameWriteOptions? = null,
+    formats: List<DataFrameWriteTarget> = dataframeWriteTargets,
+) {
+    writeTargetImpl(
+        source = this,
+        target = target,
+        targetType = type.withNullability(false),
+        writeOptions = writeOptions,
+        formats = formats,
+        sourceKind = "DataRow",
+        doStringToPathConversion = true,
+        write = DataFrameWriteTarget::writeDataRow,
+    ).getOrThrow()
+}
+
+/** Reified convenience overload: captures the static type of [target] via [typeOf]. */
+public inline fun <reified W : Any> DataRow<*>.write(
+    target: W,
+    writeOptions: DataFrameWriteOptions? = null,
+    formats: List<DataFrameWriteTarget> = dataframeWriteTargets,
+): Unit =
+    write(
+        target = target,
+        type = typeOf<W>(),
+        writeOptions = writeOptions,
+        formats = formats,
+    )
+
+// Created on first use and then shared by every detection call.
+private val tikaDetector by lazy { DefaultDetector() }
+
+/**
+ * Best-effort MIME type detection through Apache Tika.
+ *
+ * Supports [Path], [File], [URL], [InputStream] and [ByteArray] receivers; any other receiver yields `null`.
+ * Uninformative results (`application/octet-stream`, `text/plain`, empty) and I/O errors are also reported
+ * as `null`.
+ *
+ * Streams opened here (for paths, files, URLs and byte arrays) are closed before returning. A
+ * caller-supplied [InputStream] is only wrapped and is left open for the caller.
+ */
+internal fun Any.mimeTypeOrNull(): String? {
+    val inputStream = try {
+        when (this) {
+            is Path -> TikaInputStream.get(this)
+
+            is File ->
+                @Suppress("DEPRECATION")
+                TikaInputStream.get(this)
+
+            is URL -> TikaInputStream.get(this)
+
+            is InputStream -> TikaInputStream.get(this)
+
+            is ByteArray -> TikaInputStream.get(this)
+
+            else -> null
+        }
+    } catch (_: IOException) {
+        null
+    } ?: return null
+
+    // Closing the wrapper would also close a caller-supplied stream, so only close what was opened here.
+    val ownsStream = this !is InputStream
+
+    return try {
+        // The file name lets Tika refine its guess; `path` may do I/O, hence inside the try.
+        val metadata = Metadata().apply {
+            if (inputStream.hasFile()) {
+                add(TikaCoreProperties.RESOURCE_NAME_KEY, inputStream.path.name)
+            }
+        }
+        val detected = tikaDetector.detect(inputStream, metadata)
+        when {
+            detected == MediaType.OCTET_STREAM -> null
+            detected == MediaType.TEXT_PLAIN -> null
+            detected == MediaType.EMPTY -> null
+            detected.toString().isEmpty() -> null
+            else -> detected.toString()
+        }
+    } catch (_: IOException) {
+        null
+    } finally {
+        if (ownsStream) {
+            try {
+                inputStream.close()
+            } catch (_: IOException) {
+                // detection is best-effort; a failing close must not mask the result
+            }
+        }
+    }
+}
+
+/**
+ * Lower-cased file extension of this source, if one can be derived.
+ *
+ * [Path] and [File] use their `extension` property (empty when there is none). A [URL] uses the part after
+ * the last `.` of its last path segment and yields `null` when that segment has no extension. A [String]
+ * is resolved through [asUrl] and yields `null` when that fails. Any other receiver yields `null`.
+ */
+internal fun Any.extensionOrNull(): String? =
+    when (this) {
+        is Path -> extension
+
+        is File -> extension
+
+        // Only the last segment can carry an extension: "/api/data" and "/v1.2/data" have none.
+        is URL ->
+            path
+                .substringAfterLast('/')
+                .substringAfterLast('.', missingDelimiterValue = "")
+                .ifEmpty { null }
+
+        is String -> try {
+            asUrl(this).extensionOrNull()
+        } catch (_: Exception) {
+            null
+        }
+
+        else -> null
+    }?.lowercase()
+
+/**
+ * Non-throwing variant of [asUrl]: returns the [URL] iff [string] is a recognized URL (`http`/`https`/`ftp`)
+ * or an existing file path.
Used by [readSource] to decide whether a [String] should be treated as a reference
+ * or as raw content.
+ */
+internal fun asUrlOrNull(string: String): URL? {
+    // Recognized URL scheme: parse it, treating any parse error as "not a reference".
+    if (isUrl(string)) {
+        return try {
+            URI(string).toURL()
+        } catch (_: Exception) {
+            null
+        }
+    }
+
+    // Otherwise the string only counts as a reference when it names an existing regular file.
+    val candidate = try {
+        File(string)
+    } catch (_: Exception) {
+        null
+    } ?: return null
+
+    return if (candidate.exists() && candidate.isFile) candidate.toURI().toURL() else null
+}
diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/Guess2.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/Guess2.kt
new file mode 100644
index 0000000000..0c8927c139
--- /dev/null
+++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/Guess2.kt
@@ -0,0 +1,797 @@
+package org.jetbrains.kotlinx.dataframe.io
+
+import io.kotest.assertions.throwables.shouldThrow
+import io.kotest.matchers.shouldBe
+import io.kotest.matchers.string.shouldContain
+import kotlinx.serialization.json.Json
+import kotlinx.serialization.json.JsonArray
+import kotlinx.serialization.json.JsonElement
+import kotlinx.serialization.json.JsonObject
+import org.apache.poi.ss.usermodel.WorkbookFactory
+import org.jetbrains.kotlinx.dataframe.DataFrame
+import org.jetbrains.kotlinx.dataframe.DataRow
+import org.jetbrains.kotlinx.dataframe.api.CodeString
+import org.jetbrains.kotlinx.dataframe.api.columnOf
+import org.jetbrains.kotlinx.dataframe.api.convert
+import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
+import org.jetbrains.kotlinx.dataframe.api.named
+import org.jetbrains.kotlinx.dataframe.api.schema
+import org.jetbrains.kotlinx.dataframe.api.single
+import org.jetbrains.kotlinx.dataframe.api.toDataFrame
+import org.jetbrains.kotlinx.dataframe.io.Json.WriteOptions
+import org.jetbrains.kotlinx.dataframe.io.db.H2
+import org.jetbrains.kotlinx.dataframe.schema.DataFrameSchema
+import org.junit.Test
+import java.io.ByteArrayOutputStream
+import java.io.File
+import java.nio.file.Files
+import java.sql.Connection
+import java.sql.DriverManager +import javax.sql.DataSource +import kotlin.io.path.Path +import kotlin.io.path.absolute + +class Guess2 { + + @Test + fun `read JSON reference`() { + val expected = DataFrame.readJson("../data/participants.json") + + DataFrame.readSource("../data/participants.json") shouldBe expected + DataFrame.readSource(Path("../data/participants.json")) shouldBe expected + DataFrame.readSource(File("../data/participants.json")) shouldBe expected + DataFrame.readSource( + Path("../data/participants.json").absolute().normalize().toUri().toURL(), + ) shouldBe expected + + val readOptions = org.jetbrains.kotlinx.dataframe.io.Json.ReadOptions( + typeClashTactic = JSON.TypeClashTactic.ANY_COLUMNS, + ) + + DataFrame.readSource("../data/participants.json", readOptions) shouldBe expected + DataFrame.readSource(Path("../data/participants.json"), readOptions) shouldBe expected + DataFrame.readSource(File("../data/participants.json"), readOptions) shouldBe expected + DataFrame.readSource( + Path("../data/participants.json").absolute().normalize().toUri().toURL(), + readOptions, + ) shouldBe expected + } + + @Test + fun `read JSON in memory`() { + val expected = DataFrame.readJson("../data/participants.json") + + val file = File("../data/participants.json") + + DataFrame.readSource(file.readText()) shouldBe expected + DataFrame.readSource(file.inputStream()) shouldBe expected + DataFrame.readSource(Json.decodeFromString(file.readText())) shouldBe expected + + val readOptions = DataFrameReadOptions.Json( + typeClashTactic = JSON.TypeClashTactic.ANY_COLUMNS, + ) + + DataFrame.readSource(file.readText(), readOptions) shouldBe expected + DataFrame.readSource(file.inputStream(), readOptions) shouldBe expected + DataFrame.readSource(Json.decodeFromString(file.readText()), readOptions) shouldBe expected + } + + @Test + fun `read CSV reference`() { + val csvPath = "../data/movies.csv" + val expected = DataFrame.readCsv(csvPath) + + DataFrame.readSource(csvPath) 
shouldBe expected + DataFrame.readSource(Path(csvPath)) shouldBe expected + DataFrame.readSource(File(csvPath)) shouldBe expected + DataFrame.readSource( + Path(csvPath).absolute().normalize().toUri().toURL(), + ) shouldBe expected + + val options = Csv.ReadOptions(delimiter = ',') + + DataFrame.readSource(csvPath, options) shouldBe expected + DataFrame.readSource(Path(csvPath), options) shouldBe expected + DataFrame.readSource(File(csvPath), options) shouldBe expected + DataFrame.readSource( + Path(csvPath).absolute().normalize().toUri().toURL(), + options, + ) shouldBe expected + } + + @Test + fun `read CSV in memory`() { + val file = File("../data/movies.csv") + val expected = DataFrame.readCsv(file) + + // String content has no extension hint, so we pin the format via options. + val options = Csv.ReadOptions(delimiter = ',') + + DataFrame.readSource(file.readText(), options) shouldBe expected + DataFrame.readSource(file.inputStream(), options) shouldBe expected + } + + @Test + fun `read TSV reference`() { + val tsvFile = File("src/test/resources/abc.tsv") + val expected = DataFrame.readTsv(tsvFile) + + DataFrame.readSource(tsvFile.path) shouldBe expected + DataFrame.readSource(Path(tsvFile.path)) shouldBe expected + DataFrame.readSource(tsvFile) shouldBe expected + DataFrame.readSource( + Path(tsvFile.path).absolute().normalize().toUri().toURL(), + ) shouldBe expected + + val options = DataFrameReadOptions.Tsv(delimiter = '\t') + + DataFrame.readSource(tsvFile.path, options) shouldBe expected + DataFrame.readSource(Path(tsvFile.path), options) shouldBe expected + DataFrame.readSource(tsvFile, options) shouldBe expected + DataFrame.readSource( + Path(tsvFile.path).absolute().normalize().toUri().toURL(), + options, + ) shouldBe expected + } + + @Test + fun `read TSV in memory`() { + val tsvFile = File("src/test/resources/abc.tsv") + val expected = DataFrame.readTsv(tsvFile) + val options = Tsv.ReadOptions(delimiter = '\t') + + // Binary/text without extension — 
options pin Tsv over Csv/Json/Xlsx. + DataFrame.readSource(tsvFile.readText(), options) shouldBe expected + DataFrame.readSource(tsvFile.inputStream(), options) shouldBe expected + } + + @Test + fun `read XLSX reference`() { + val xlsxFile = File("src/test/resources/sample2.xlsx") + val expected = DataFrame.readExcel(xlsxFile) + + DataFrame.readSource(xlsxFile.path) shouldBe expected + DataFrame.readSource(Path(xlsxFile.path)) shouldBe expected + DataFrame.readSource(xlsxFile) shouldBe expected + DataFrame.readSource( + Path(xlsxFile.path).absolute().normalize().toUri().toURL(), + ) shouldBe expected + + val options = ExcelNEW.ReadOptions(sheetName = "Sheet1") + + DataFrame.readSource(xlsxFile.path, options) shouldBe expected + DataFrame.readSource(Path(xlsxFile.path), options) shouldBe expected + DataFrame.readSource(xlsxFile, options) shouldBe expected + DataFrame.readSource( + Path(xlsxFile.path).absolute().normalize().toUri().toURL(), + options, + ) shouldBe expected + } + + @Test + fun `read XLS reference`() { + val xlsFile = File("src/test/resources/sample.xls") + val expected = DataFrame.readExcel(xlsFile) + + DataFrame.readSource(xlsFile.path) shouldBe expected + DataFrame.readSource(Path(xlsFile.path)) shouldBe expected + DataFrame.readSource(xlsFile) shouldBe expected + DataFrame.readSource( + Path(xlsFile.path).absolute().normalize().toUri().toURL(), + ) shouldBe expected + } + + @Test + fun `read XLSX in memory`() { + val xlsxFile = File("src/test/resources/sample2.xlsx") + val expected = DataFrame.readExcel(xlsxFile) + + // Workbook and Sheet are exclusive to ExcelNEW, so type-based dispatch works without options. 
+ WorkbookFactory.create(xlsxFile.inputStream()).use { wb -> + DataFrame.readSource(wb) shouldBe expected + DataFrame.readSource(wb.getSheetAt(0)) shouldBe expected + } + + val options = DataFrameReadOptions.Excel() + + // Binary streams have no extension and are accepted by every format, + // so options are needed to pin ExcelNEW for the InputStream variant. + DataFrame.readSource(xlsxFile.inputStream(), options) shouldBe expected + + WorkbookFactory.create(xlsxFile.inputStream()).use { wb -> + DataFrame.readSource(wb, options) shouldBe expected + DataFrame.readSource(wb.getSheetAt(0), options) shouldBe expected + } + } + + @Test + fun `read XLS in memory`() { + val xlsFile = File("src/test/resources/sample.xls") + val expected = DataFrame.readExcel(xlsFile) + + WorkbookFactory.create(xlsFile.inputStream()).use { wb -> + DataFrame.readSource(wb) shouldBe expected + } + DataFrame.readSource(xlsFile.inputStream()) shouldBe expected + WorkbookFactory.create(xlsFile.inputStream()).use { wb -> + DataFrame.readSource(wb) shouldBe expected + } + } + + private fun h2Url(name: String) = "jdbc:h2:mem:$name;DB_CLOSE_DELAY=-1" + + private fun seed(connection: Connection) { + connection.createStatement().use { st -> + st.execute("CREATE TABLE Customer (id INT, name VARCHAR(255), age INT)") + st.execute("INSERT INTO Customer (id, name, age) VALUES (1, 'John', 40), (2, 'Alice', 25), (3, 'Bob', 47)") + } + } + + @Test + fun `read JDBC in memory`() { + val url = h2Url("guess2_inmem") + DriverManager.getConnection(url).use { conn -> + seed(conn) + + val expected = DataFrame.readSqlTable(conn, "Customer") + val tableOpts = Jdbc2.ReadOptions(sqlQueryOrTableName = "Customer") + val queryOpts = Jdbc2.ReadOptions(sqlQueryOrTableName = "SELECT * FROM Customer") + + // Connection — exclusive type, but query/table name must come from options. 
+ DataFrame.readSource(conn, tableOpts) shouldBe expected + DataFrame.readSource(conn, queryOpts) shouldBe expected + + // DbConnectionConfig as InMemory. + val config = DbConnectionConfig(url = url) + DataFrame.readSource(config, tableOpts) shouldBe expected + DataFrame.readSource(config, queryOpts) shouldBe expected + + // DataSource — opens a fresh connection each call (DataSource.readDataFrame closes it via `use`). + val dataSource = object : DataSource { + override fun getConnection() = DriverManager.getConnection(url) + + override fun getConnection(u: String?, p: String?) = DriverManager.getConnection(url) + + override fun getLogWriter() = null + + override fun setLogWriter(out: java.io.PrintWriter?) {} + + override fun setLoginTimeout(seconds: Int) {} + + override fun getLoginTimeout() = 0 + + override fun getParentLogger() = throw UnsupportedOperationException() + + override fun unwrap(iface: Class?): T = throw UnsupportedOperationException() + + override fun isWrapperFor(iface: Class<*>?) = false + } + DataFrame.readSource(dataSource, tableOpts) shouldBe expected + + // ResultSet — no sqlQueryOrTableName needed; just dbType (or a Connection to derive it). 
+ conn.prepareStatement("SELECT * FROM Customer").use { ps -> + ps.executeQuery().use { rs -> + DataFrame.readSource( + rs, + Jdbc2.ReadOptions(dbType = H2()), + ) shouldBe expected + } + } + conn.prepareStatement("SELECT * FROM Customer").use { ps -> + ps.executeQuery().use { rs -> + DataFrame.readSource( + rs, + Jdbc2.ReadOptions(resultSetConnection = conn), + ) shouldBe expected + } + } + } + } + + @Test + fun `unified readSource auto-detects references vs content`() { + // String that points to an existing file → routed through URL → JSON wins on extension + val jsonExpected = DataFrame.readJson("../data/participants.json") + DataFrame.readSource("../data/participants.json") shouldBe jsonExpected + + // Same idea for CSV/XLSX + val csvExpected = DataFrame.readCsv("../data/movies.csv") + DataFrame.readSource("../data/movies.csv") shouldBe csvExpected + + val xlsxExpected = DataFrame.readExcel(File("src/test/resources/sample2.xlsx")) + DataFrame.readSource("src/test/resources/sample2.xlsx") shouldBe xlsxExpected + + // String that doesn't resolve to a file → treated as raw content (JSON content here) + val file = File("../data/participants.json") + DataFrame.readSource(file.readText()) shouldBe jsonExpected + + // Non-String types: still work, no special handling needed + DataFrame.readSource(file) shouldBe jsonExpected + DataFrame.readSource(Path("../data/participants.json")) shouldBe jsonExpected + } + + @Test + fun `read JDBC reference`() { + val url = h2Url("guess2_ref") + DriverManager.getConnection(url).use { conn -> seed(conn) } + + val config = DbConnectionConfig(url = url) + val expected = DataFrame.readSqlTable(config, "Customer") + val tableOpts = Jdbc2.ReadOptions(sqlQueryOrTableName = "Customer") + + DataFrame.readSource(config, tableOpts) shouldBe expected + DataFrame.readSource(config, Jdbc2.ReadOptions(sqlQueryOrTableName = "SELECT * FROM Customer")) shouldBe expected + } + + @Test + fun `read schema via default fallback (file-based formats)`() { 
+ // JSON + val jsonExpected = DataFrame.readJson("../data/participants.json").schema() + DataFrameSchema.readSource( + File("../data/participants.json"), + ) shouldBe jsonExpected + DataFrameSchema.readSource( + "../data/participants.json", + ) shouldBe jsonExpected + + // CSV + val csvExpected = DataFrame.readCsv("../data/movies.csv").schema() + DataFrameSchema.readSource( + File("../data/movies.csv"), + ) shouldBe csvExpected + + // TSV + val tsvFile = File("src/test/resources/abc.tsv") + val tsvExpected = DataFrame.readTsv(tsvFile).schema() + DataFrameSchema.readSource(tsvFile) shouldBe tsvExpected + + // XLSX + val xlsxFile = File("src/test/resources/sample2.xlsx") + val xlsxExpected = DataFrame.readExcel(xlsxFile).schema() + DataFrameSchema.readSource(xlsxFile) shouldBe xlsxExpected + } + + @Test + fun `read JDBC schema via override`() { + val url = h2Url("guess2_schema") + DriverManager.getConnection(url).use { conn -> + seed(conn) + val expected = DataFrameSchema.readSqlTable(conn, "Customer") + val tableOpts = Jdbc2.ReadOptions(sqlQueryOrTableName = "Customer") + val queryOpts = Jdbc2.ReadOptions(sqlQueryOrTableName = "SELECT * FROM Customer") + + DataFrameSchema.readSource(conn, tableOpts) shouldBe expected + DataFrameSchema.readSource(conn, queryOpts) shouldBe expected + + val config = DbConnectionConfig(url = url) + DataFrameSchema.readSource(config, tableOpts) shouldBe expected + } + } + + @Test + fun `read JDBC schema from ResultSet does not advance cursor`() { + val url = h2Url("guess2_rs_schema") + DriverManager.getConnection(url).use { conn -> + seed(conn) + + conn.prepareStatement("SELECT * FROM Customer").use { ps -> + ps.executeQuery().use { rs -> + // Schema-from-ResultSet uses JDBC metadata only — no rows are fetched, so the + // cursor stays at "before first row". 
(And nullability comes from the column metadata, + // which is conservatively nullable for columns without NOT NULL constraints; this is + // why we don't compare against the data-inferred schema directly.) + val expected = DataFrameSchema.readResultSet( + conn.prepareStatement("SELECT * FROM Customer").executeQuery(), + H2(), + ) + val schema = DataFrameSchema.readSource(rs, Jdbc2.ReadOptions(dbType = H2())) + schema shouldBe expected + rs.isBeforeFirst shouldBe true + } + } + } + } + + @Test + fun `read Arrow Feather reference`() { + val featherFile = File("src/test/resources/test.feather") + val expected = DataFrame.readArrowFeather(featherFile) + + DataFrame.readSource(featherFile.path) shouldBe expected + DataFrame.readSource(Path(featherFile.path)) shouldBe expected + DataFrame.readSource(featherFile) shouldBe expected + DataFrame.readSource( + Path(featherFile.path).absolute().normalize().toUri().toURL(), + ) shouldBe expected + + val options = ArrowFeatherNEW.ReadOptions() + + DataFrame.readSource(featherFile.path, options) shouldBe expected + DataFrame.readSource(featherFile, options) shouldBe expected + } + + @Test + fun `read Arrow Feather in memory`() { + val featherFile = File("src/test/resources/test.feather") + val expected = DataFrame.readArrowFeather(featherFile) + val options = ArrowFeatherNEW.ReadOptions() + + // ByteArray, InputStream, SeekableByteChannel all need options to disambiguate (no extension). 
+ DataFrame.readSource(featherFile.readBytes(), options) shouldBe expected + DataFrame.readSource(featherFile.inputStream(), options) shouldBe expected + java.nio.file.Files.newByteChannel(featherFile.toPath()).use { channel -> + DataFrame.readSource(channel, options) shouldBe expected + } + } + + @Test + fun `read Arrow IPC reference`() { + val ipcFile = File("src/test/resources/test.arrow") + val expected = DataFrame.readArrowIPC(ipcFile) + + DataFrame.readSource(ipcFile.path) shouldBe expected + DataFrame.readSource(Path(ipcFile.path)) shouldBe expected + DataFrame.readSource(ipcFile) shouldBe expected + DataFrame.readSource( + Path(ipcFile.path).absolute().normalize().toUri().toURL(), + ) shouldBe expected + + val options = ArrowIPC.ReadOptions() + DataFrame.readSource(ipcFile, options) shouldBe expected + } + + @Test + fun `read Parquet reference`() { + val parquetFile = File("src/test/resources/test.parquet") + val expected = DataFrame.readParquet(parquetFile) + + DataFrame.readSource(parquetFile.path) shouldBe expected + DataFrame.readSource(Path(parquetFile.path)) shouldBe expected + DataFrame.readSource(parquetFile) shouldBe expected + DataFrame.readSource( + Path(parquetFile.path).absolute().normalize().toUri().toURL(), + ) shouldBe expected + + val options = Parquet.ReadOptions() + DataFrame.readSource(parquetFile, options) shouldBe expected + } + + @Test + fun `read OpenAPI yaml as code`() { + val openApiFile = File("src/test/resources/petstore.yaml") + + // The reference call from the existing helper, used as the ground truth. + val expected = readOpenApiAsString( + openApiAsString = openApiFile.readText(), + name = "Petstore", + extensionProperties = false, + generateHelperCompanionObject = false, + ) + + // String path / File / Path / URL all route through readSourceImpl to OpenApi2. 
+ CodeString.readSource(openApiFile.path, name = "Petstore").value shouldBe expected + CodeString.readSource(openApiFile, name = "Petstore").value shouldBe expected + CodeString.readSource(Path(openApiFile.path), name = "Petstore").value shouldBe expected + CodeString.readSource( + Path(openApiFile.path).absolute().normalize().toUri().toURL(), + name = "Petstore", + ).value shouldBe expected + + // String content path (raw spec text) also works. + CodeString.readSource(openApiFile.readText(), name = "Petstore").value shouldBe expected + } + + @Test + fun `OpenAPI does not steal plain JSON DataFrame reads`() { + // A regular JSON file (not an OpenAPI spec) still goes to Json, even though OpenApi2 runs first. + // OpenApi2.readDataSchemaCode returns a failed Result for non-OpenAPI content, but more importantly + // OpenApi2.readDataFrame returns a failed Result, so DataFrame reads fall through. + val expected = DataFrame.readJson("../data/participants.json") + DataFrame.readSource(File("../data/participants.json")) shouldBe expected + } + + @Test + fun `default DataSchema code generation works for JSON via interface default`() { + // The interface default reads the schema and calls generateInterfaces — exercise it on a JSON file. + val jsonFile = File("../data/participants.json") + val schemaCode = CodeString.readSource(jsonFile, name = "Participants") + // The output is non-empty and includes the marker name. 
+ schemaCode.value shouldContain "Participants" + } + + // region DataRow.readSource — single-row inputs across formats + + @Test + fun `read DataRow from CSV string`() { + val csvText = "a,b,c\n1,2,3" + val expected = DataFrame.readCsvStr(csvText).single() + DataRow.readSource(csvText, Csv.ReadOptions()) shouldBe expected + } + + @Test + fun `read DataRow from TSV string`() { + val tsvText = "a\tb\tc\n1\t2\t3" + val expected = DataFrame.readTsvStr(tsvText).single() + DataRow.readSource(tsvText, Tsv.ReadOptions()) shouldBe expected + } + + @Test + fun `read DataRow from JSON string`() { + // A single-element JSON array yields a one-row DataFrame. + val jsonText = """[{"a": 1, "b": 2}]""" + val expected = DataFrame.readJsonStr(jsonText).single() + DataRow.readSource(jsonText) shouldBe expected + } + + @Test + fun `read DataRow from single-row XLSX file`() { + // sample2.xlsx has exactly one data row. + val xlsxFile = File("src/test/resources/sample2.xlsx") + val expected = DataFrame.readExcel(xlsxFile).single() + DataRow.readSource(xlsxFile) shouldBe expected + } + + @Test + fun `read DataRow from JDBC with single-row query`() { + val url = h2Url("guess2_datarow") + DriverManager.getConnection(url).use { conn -> + seed(conn) + val query = "SELECT * FROM Customer WHERE id = 1" + val expected = DataFrame.readSqlQuery(conn, query).single() + DataRow.readSource(conn, Jdbc2.ReadOptions(sqlQueryOrTableName = query)) shouldBe expected + } + } + + @Test + fun `read DataRow throws when source has multiple rows`() { + // movies.csv has many rows — DataRow.single() should fail, surfaced as the framework's + // "Unknown DataRow source" since the exception is caught and converted. 
+ val movies = File("../data/movies.csv") + try { + DataRow.readSource(movies) + error("Expected DataRow.readSource to fail on a multi-row CSV") + } catch (_: IllegalArgumentException) { + // expected + } + } + + // endregion + + // region convert API integration — convert { col }.to() + // + // Frame columns are typed by their schema, so each column being converted must contain sources of the + // same shape. Mixing, say, a CSV-shaped source and a JSON-shaped source in the same column would yield + // a FrameColumn with no coherent single schema — these tests keep each column homogeneous and put + // differently-shaped sources into separate columns. + + @Test + fun `convert column of CSV files to DataFrame`() { + // Two cells, both pointing at the same CSV → uniform shape in the resulting FrameColumn. + val csvFile = File("../data/movies.csv") + val df = dataFrameOf("source")(csvFile, csvFile) + + val converted = df.convert("source").to>() + + val expected = DataFrame.readCsv(csvFile) + converted["source"][0] shouldBe expected + converted["source"][1] shouldBe expected + } + + @Test + fun `convert column of CSV files to DataFrameSchema`() { + val csvFile = File("../data/movies.csv") + val df = dataFrameOf("source")(csvFile, csvFile) + + val converted = df.convert("source").to() + + val expected = DataFrame.readCsv(csvFile).schema() + converted["source"][0] shouldBe expected + converted["source"][1] shouldBe expected + } + + @Test + fun `convert column of single-row XLSX files to DataRow`() { + // sample2.xlsx has exactly one data row, so .to>() works for each cell. 
+ val xlsxFile = File("src/test/resources/sample2.xlsx") + val df = dataFrameOf("source")(xlsxFile, xlsxFile) + + val converted = df.convert("source").to>() + + val expected = DataFrame.readExcel(xlsxFile).single() + converted["source"][0] shouldBe expected + converted["source"][1] shouldBe expected + } + + @Test + fun `convert column of String content to DataFrame`() { + // Multiple parallel JSON content strings (same shape) → uniform FrameColumn. + val text = """[{"a": 1, "b": 2}]""" + val df = dataFrameOf("source")(text, text) + + val converted = df.convert("source").to>() + + val expected = DataFrame.readJsonStr(text) + converted["source"][0] shouldBe expected + converted["source"][1] shouldBe expected + } + + @Test + fun `convert two homogeneous source columns at once`() { + // Each column is internally uniform: csvCol has CSV-shaped cells, jsonCol has JSON-shaped cells. + // The result is two FrameColumns, each with its own coherent schema. + val csvFile = File("../data/movies.csv") + val jsonFile = File("../data/participants.json") + val df = dataFrameOf("csvCol", "jsonCol")(csvFile, jsonFile, csvFile, jsonFile) + + val converted = df.convert("csvCol", "jsonCol").to>() + + val expectedCsv = DataFrame.readCsv(csvFile) + val expectedJson = DataFrame.readJson(jsonFile) + converted["csvCol"][0] shouldBe expectedCsv + converted["csvCol"][1] shouldBe expectedCsv + converted["jsonCol"][0] shouldBe expectedJson + converted["jsonCol"][1] shouldBe expectedJson + } + + @Test + fun `convert column of URLs to DataFrame`() { + // Two URLs pointing at the same JSON file → uniform schema in the FrameColumn. 
+ val jsonUrl = File("../data/participants.json").toURI().toURL() + val urls = columnOf(jsonUrl, jsonUrl) named "source" + val df = urls.toDataFrame() + + val converted = df.convert("source").to>() + val expected = DataFrame.readJson(jsonUrl) + converted["source"][0] shouldBe expected + converted["source"][1] shouldBe expected + } + + // endregion + + // region DataFrame.write / DataRow.write — write to various JSON targets + + @Test + fun `write DataFrame as JSON to Path`() { + val df = DataFrame.readJson("../data/participants.json") + val tempPath = Files.createTempFile("guess2-write-df", ".json") + .also { it.toFile().deleteOnExit() } + df.write(tempPath) + DataFrame.readJson(tempPath) shouldBe df + } + + @Test + fun `write DataFrame as JSON to File`() { + val df = DataFrame.readJson("../data/participants.json") + val tempFile = Files.createTempFile("guess2-write-df", ".json").toFile() + .also { it.deleteOnExit() } + df.write(tempFile) + DataFrame.readJson(tempFile) shouldBe df + } + + @Test + fun `write DataFrame as JSON to String pointing at existing file`() { + // doStringToPathConversion in writeTargetImpl only fires when the path already exists; + // createTempFile creates the file, so the String → Path routing kicks in. + val df = DataFrame.readJson("../data/participants.json") + val tempFile = Files.createTempFile("guess2-write-df-str", ".json").toFile() + .also { it.deleteOnExit() } + df.write(tempFile.path) + DataFrame.readJson(tempFile) shouldBe df + } + + @Test + fun `write DataFrame as JSON to Appendable`() { + val df = DataFrame.readJson("../data/participants.json") + val sb = StringBuilder() + // StringBuilder is reified — pin Appendable so the framework dispatches to that branch. 
+ df.write(sb) + DataFrame.readJsonStr(sb.toString()) shouldBe df + } + + @Test + fun `write DataFrame as JSON to OutputStream`() { + val df = DataFrame.readJson("../data/participants.json") + val baos = ByteArrayOutputStream() + df.write(baos) + DataFrame.readJsonStr(baos.toString()) shouldBe df + } + + @Test + fun `write DataFrame as JSON to Function1 of JsonArray`() { + val df = DataFrame.readJson("../data/participants.json") + var captured: JsonArray? = null + df.write({ it: JsonArray -> captured = it }) + captured shouldBe df.toJsonElement() + } + + @Test + fun `write DataFrame as JSON to Function1 of String`() { + val df = DataFrame.readJson("../data/participants.json") + var captured: String? = null + df.write({ it: String -> captured = it }) + captured shouldBe df.toJson() + } + + @Test + fun `write DataFrame as JSON to Function1 of JsonObject fails`() { + // A DataFrame can only be converted to a JsonArray, not a JsonObject. + val df = DataFrame.readJson("../data/participants.json") + shouldThrow { df.write({ _: JsonObject -> }) } + } + + @Test + fun `write DataRow as JSON to Path`() { + val row = DataFrame.readJsonStr("""[{"a": 1, "b": "x"}]""").single() + val tempPath = Files.createTempFile("guess2-write-row", ".json") + .also { it.toFile().deleteOnExit() } + row.write(tempPath) + DataRow.readJson(tempPath) shouldBe row + } + + @Test + fun `write DataRow as JSON to File`() { + val row = DataFrame.readJsonStr("""[{"a": 1, "b": "x"}]""").single() + val tempFile = Files.createTempFile("guess2-write-row", ".json").toFile() + .also { it.deleteOnExit() } + row.write(tempFile) + DataRow.readJson(tempFile) shouldBe row + } + + @Test + fun `write DataRow as JSON to Appendable`() { + val row = DataFrame.readJsonStr("""[{"a": 1, "b": "x"}]""").single() + val sb = StringBuilder() + row.write(sb) + sb.toString() shouldBe row.toJson() + } + + @Test + fun `write DataRow as JSON to OutputStream`() { + val row = DataFrame.readJsonStr("""[{"a": 1, "b": "x"}]""").single() 
+ val baos = ByteArrayOutputStream() + row.write(baos) + baos.toString() shouldBe row.toJson() + } + + @Test + fun `write DataRow as JSON to Function1 of JsonObject`() { + val row = DataFrame.readJsonStr("""[{"a": 1, "b": "x"}]""").single() + var captured: JsonObject? = null + row.write({ it: JsonObject -> captured = it }) + captured shouldBe row.toJsonElement() + } + + @Test + fun `write DataRow as JSON to Function1 of String`() { + val row = DataFrame.readJsonStr("""[{"a": 1, "b": "x"}]""").single() + var captured: String? = null + row.write({ it: String -> captured = it }) + captured shouldBe row.toJson() + } + + @Test + fun `write DataRow as JSON to Function1 of JsonArray fails`() { + // A single DataRow can only be turned into a JsonObject, not a JsonArray. + val row = DataFrame.readJsonStr("""[{"a": 1, "b": "x"}]""").single() + shouldThrow { row.write({ _: JsonArray -> }) } + } + + @Test + fun `write DataFrame as JSON with prettyPrint option produces multi-line output`() { + val df = DataFrame.readJsonStr("""[{"a": 1, "b": "x"}]""") + val sb = StringBuilder() + df.write(sb, WriteOptions(prettyPrint = true)) + sb.toString() shouldContain "\n" + DataFrame.readJsonStr(sb.toString()) shouldBe df + } + + @Test + fun `write DataFrame with unsupported target type fails`() { + // Int is not a supported writing type for any registered format → no format accepts it, + // and writeTargetImpl reports "Failed to find a suitable format". 
+ val df = DataFrame.readJsonStr("""[{"a": 1}]""") + shouldThrow { df.write(42) } + } + + // endregion +} diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt index 5900956fcf..3ba90f39d1 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt @@ -464,7 +464,9 @@ class ParserTests { @Test fun `Mixing null and json`() { val col by columnOf("[\"str\"]", "[]", "null") - val parsed = col.parse() + val parsed = col.parse( + ParserOptions(parseToDataFrameReadSource = true), + ) parsed.type() shouldBe typeOf() parsed.kind() shouldBe ColumnKind.Frame require(parsed.isFrameColumn()) diff --git a/core/src/test/resources/petstore.yaml b/core/src/test/resources/petstore.yaml new file mode 100644 index 0000000000..89255de8e9 --- /dev/null +++ b/core/src/test/resources/petstore.yaml @@ -0,0 +1,304 @@ +# DEMO for DataFrame, this might differ from the actual API (it's updated a bit) +openapi: 3.0.0 +info: + version: 2.0.2 + title: APIs.guru + description: > + Wikipedia for Web APIs. Repository of API specs in OpenAPI format. + + + **Warning**: If you want to be notified about changes in advance please join our [Slack channel](https://join.slack.com/t/mermade/shared_invite/zt-g78g7xir-MLE_CTCcXCdfJfG3CJe9qA). 
+ + + Client sample: [[Demo]](https://apis.guru/simple-ui) [[Repo]](https://github.com/APIs-guru/simple-ui) + contact: + name: APIs.guru + url: https://APIs.guru + email: mike.ralphson@gmail.com + license: + name: CC0 1.0 + url: https://github.com/APIs-guru/openapi-directory#licenses + x-logo: + url: https://apis.guru/branding/logo_vertical.svg +externalDocs: + url: https://github.com/APIs-guru/openapi-directory/blob/master/API.md +security: [ ] +tags: + - name: APIs + description: Actions relating to APIs in the collection +paths: + /list.json: + get: + operationId: listAPIs + tags: + - APIs + summary: List all APIs + description: > + List all APIs in the directory. + + Returns links to OpenAPI specification for each API in the directory. + + If API exist in multiple versions `preferred` one is explicitly marked. + + + Some basic info from OpenAPI spec is cached inside each object. + + This allows to generate some simple views without need to fetch OpenAPI spec for each API. + responses: + "200": + description: OK + content: + application/json; charset=utf-8: + schema: + $ref: "#/components/schemas/APIs" + application/json: + schema: + $ref: "#/components/schemas/APIs" + /metrics.json: + get: + operationId: getMetrics + summary: Get basic metrics + description: > + Some basic metrics for the entire directory. + + Just stunning numbers to put on a front page and are intended purely for WoW effect :) + tags: + - APIs + responses: + "200": + description: OK + content: + application/json; charset=utf-8: + schema: + $ref: "#/components/schemas/Metrics" + application/json: + schema: + $ref: "#/components/schemas/Metrics" +components: + schemas: + APIs: + description: | + List of API details. + It is a JSON object with API IDs(`[:]`) as keys. 
+ type: object + additionalProperties: + $ref: "#/components/schemas/API" + minProperties: 1 + example: + googleapis.com:drive: + added: 2015-02-22T20:00:45.000Z + preferred: v3 + versions: + v2: + added: 2015-02-22T20:00:45.000Z + info: + title: Drive + version: v2 + x-apiClientRegistration: + url: https://console.developers.google.com + x-logo: + url: https://api.apis.guru/v2/cache/logo/https_www.gstatic.com_images_icons_material_product_2x_drive_32dp.png + x-origin: + format: google + url: https://www.googleapis.com/discovery/v1/apis/drive/v2/rest + version: v1 + x-preferred: false + x-providerName: googleapis.com + x-serviceName: drive + swaggerUrl: https://api.apis.guru/v2/specs/googleapis.com/drive/v2/swagger.json + swaggerYamlUrl: https://api.apis.guru/v2/specs/googleapis.com/drive/v2/swagger.yaml + updated: 2016-06-17T00:21:44.000Z + v3: + added: 2015-12-12T00:25:13.000Z + info: + title: Drive + version: v3 + x-apiClientRegistration: + url: https://console.developers.google.com + x-logo: + url: https://api.apis.guru/v2/cache/logo/https_www.gstatic.com_images_icons_material_product_2x_drive_32dp.png + x-origin: + format: google + url: https://www.googleapis.com/discovery/v1/apis/drive/v3/rest + version: v1 + x-preferred: true + x-providerName: googleapis.com + x-serviceName: drive + swaggerUrl: https://api.apis.guru/v2/specs/googleapis.com/drive/v3/swagger.json + swaggerYamlUrl: https://api.apis.guru/v2/specs/googleapis.com/drive/v3/swagger.yaml + updated: 2016-06-17T00:21:44.000Z + API: + description: Meta information about API + type: object + required: + - added + - preferred + - versions + properties: + added: + description: Timestamp when the API was first added to the directory + type: string + format: date-time + preferred: + description: Recommended version + type: string + versions: + description: List of supported versions of the API + type: object + additionalProperties: + $ref: "#/components/schemas/ApiVersion" + minProperties: 1 + 
additionalProperties: false + ApiVersion: + type: object + required: + - added + # - updated apparently not required! + - swaggerUrl + - swaggerYamlUrl + - info + - openapiVer + properties: + added: + description: Timestamp when the version was added + type: string + format: date-time + updated: # apparently not required! + description: Timestamp when the version was updated + type: string + format: date-time + swaggerUrl: + description: URL to OpenAPI definition in JSON format + type: string + format: url + swaggerYamlUrl: + description: URL to OpenAPI definition in YAML format + type: string + format: url + info: + description: Copy of `info` section from OpenAPI definition + type: object + minProperties: 1 + externalDocs: + description: Copy of `externalDocs` section from OpenAPI definition + type: object + minProperties: 1 + openapiVer: + description: OpenAPI version + type: string + additionalProperties: false + + Metrics: + description: List of basic metrics + type: object + required: + - numSpecs + - numAPIs + - numEndpoints + - unreachable + - invalid + - unofficial + - fixes + - fixedPct + - datasets + - stars + - issues + - thisWeek + properties: + numSpecs: + description: Number of API specifications including different versions of the + same API + type: integer + minimum: 1 + numAPIs: + description: Number of APIs + type: integer + minimum: 1 + numEndpoints: + description: Total number of endpoints inside all specifications + type: integer + minimum: 1 + unreachable: + description: Number of unreachable specifications + type: integer + minimum: 0 + invalid: + description: Number of invalid specifications + type: integer + minimum: 0 + unofficial: + description: Number of unofficial specifications + type: integer + minimum: 0 + fixes: + description: Number of fixes applied to specifications + type: integer + minimum: 0 + fixedPct: + description: Percentage of fixed specifications + type: number + minimum: 0 + maximum: 100 + datasets: + description: An 
overview of the datasets used to gather the APIs + type: array + items: + description: A single metric per dataset + type: object + required: + - title + - data + properties: + title: + description: Title of the metric + type: string + data: + description: Value of the metric per dataset + type: object + additionalProperties: + type: integer + minimum: 0 + stars: + description: Number of stars on GitHub + type: integer + minimum: 0 + issues: + description: Number of issues on GitHub + type: integer + minimum: 0 + thisWeek: + description: Number of new specifications added/updated this week + type: object + required: + - added + - updated + properties: + added: + description: Number of new specifications added this week + type: integer + minimum: 0 + updated: + description: Number of specifications updated this week + type: integer + minimum: 0 + additionalProperties: false + example: + numSpecs: 1000 + numAPIs: 100 + numEndpoints: 10000 + unreachable: 10 + invalid: 10 + unofficial: 10 + fixes: 10 + fixedPct: 10 + datasets: + - title: providerCount + data: + "a.com": 10 + "b.com": 20 + "c.com": 30 + stars: 1000 + issues: 100 + thisWeek: + added: 10 + updated: 10 diff --git a/core/src/test/resources/sample.xls b/core/src/test/resources/sample.xls new file mode 100644 index 0000000000..fcdced4e34 Binary files /dev/null and b/core/src/test/resources/sample.xls differ diff --git a/core/src/test/resources/sample2.xlsx b/core/src/test/resources/sample2.xlsx new file mode 100644 index 0000000000..db7ef6786d Binary files /dev/null and b/core/src/test/resources/sample2.xlsx differ diff --git a/core/src/test/resources/test.arrow b/core/src/test/resources/test.arrow new file mode 100644 index 0000000000..61e8c31afa Binary files /dev/null and b/core/src/test/resources/test.arrow differ diff --git a/core/src/test/resources/test.feather b/core/src/test/resources/test.feather new file mode 100644 index 0000000000..4a348d1e2d Binary files /dev/null and 
b/core/src/test/resources/test.feather differ diff --git a/core/src/test/resources/test.parquet b/core/src/test/resources/test.parquet new file mode 100644 index 0000000000..cf78b1c255 Binary files /dev/null and b/core/src/test/resources/test.parquet differ diff --git a/dataframe-arrow/api/dataframe-arrow.api b/dataframe-arrow/api/dataframe-arrow.api index c3f0a80c3b..ffc8fe35d5 100644 --- a/dataframe-arrow/api/dataframe-arrow.api +++ b/dataframe-arrow/api/dataframe-arrow.api @@ -9,6 +9,64 @@ public final class org/jetbrains/kotlinx/dataframe/io/ArrowFeather : org/jetbrai public fun readDataFrame (Ljava/nio/file/Path;Ljava/util/List;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; } +public final class org/jetbrains/kotlinx/dataframe/io/ArrowFeatherNEW : org/jetbrains/kotlinx/dataframe/io/DataFrameReadSource { + public static final field Companion Lorg/jetbrains/kotlinx/dataframe/io/ArrowFeatherNEW$Companion; + public fun ()V + public fun acceptsSource (Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Z + public fun getSupportedTypes ()Ljava/util/Set; + public fun getTestOrder ()I + public fun readDataFrameOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; + public fun readDataFrameSchemaOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema; + public fun readDataSchemaCodeOrNull-myXLQ2E (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Ljava/lang/String; + public fun toString ()Ljava/lang/String; +} + +public final class org/jetbrains/kotlinx/dataframe/io/ArrowFeatherNEW$Companion { +} + +public final class 
org/jetbrains/kotlinx/dataframe/io/ArrowFeatherNEW$ReadOptions : org/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions { + public fun ()V + public fun (Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;)V + public synthetic fun (Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;ILkotlin/jvm/internal/DefaultConstructorMarker;)V + public final fun component1 ()Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions; + public final fun copy (Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;)Lorg/jetbrains/kotlinx/dataframe/io/ArrowFeatherNEW$ReadOptions; + public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/io/ArrowFeatherNEW$ReadOptions;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/ArrowFeatherNEW$ReadOptions; + public fun equals (Ljava/lang/Object;)Z + public final fun getNullability ()Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions; + public fun hashCode ()I + public fun toString ()Ljava/lang/String; +} + +public final class org/jetbrains/kotlinx/dataframe/io/ArrowIPC : org/jetbrains/kotlinx/dataframe/io/DataFrameReadSource { + public static final field Companion Lorg/jetbrains/kotlinx/dataframe/io/ArrowIPC$Companion; + public fun ()V + public fun acceptsSource (Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Z + public fun getSupportedTypes ()Ljava/util/Set; + public fun getTestOrder ()I + public fun readDataFrameOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; + public fun readDataFrameSchemaOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema; + public fun readDataSchemaCodeOrNull-myXLQ2E 
(Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Ljava/lang/String; + public fun toString ()Ljava/lang/String; +} + +public final class org/jetbrains/kotlinx/dataframe/io/ArrowIPC$Companion { +} + +public final class org/jetbrains/kotlinx/dataframe/io/ArrowIPC$ReadOptions : org/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions { + public fun ()V + public fun (Lorg/apache/arrow/memory/RootAllocator;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;)V + public synthetic fun (Lorg/apache/arrow/memory/RootAllocator;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;ILkotlin/jvm/internal/DefaultConstructorMarker;)V + public final fun component1 ()Lorg/apache/arrow/memory/RootAllocator; + public final fun component2 ()Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions; + public final fun copy (Lorg/apache/arrow/memory/RootAllocator;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;)Lorg/jetbrains/kotlinx/dataframe/io/ArrowIPC$ReadOptions; + public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/io/ArrowIPC$ReadOptions;Lorg/apache/arrow/memory/RootAllocator;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/ArrowIPC$ReadOptions; + public fun equals (Ljava/lang/Object;)Z + public final fun getAllocator ()Lorg/apache/arrow/memory/RootAllocator; + public final fun getNullability ()Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions; + public fun hashCode ()I + public fun toString ()Ljava/lang/String; +} + public final class org/jetbrains/kotlinx/dataframe/io/ArrowReadingKt { public static final fun readArrow (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Lorg/apache/arrow/vector/ipc/ArrowReader;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; public static synthetic fun readArrow$default 
(Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Lorg/apache/arrow/vector/ipc/ArrowReader;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; @@ -323,3 +381,33 @@ public final class org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$Widenin public fun toString ()Ljava/lang/String; } +public final class org/jetbrains/kotlinx/dataframe/io/Parquet : org/jetbrains/kotlinx/dataframe/io/DataFrameReadSource { + public static final field Companion Lorg/jetbrains/kotlinx/dataframe/io/Parquet$Companion; + public fun ()V + public fun acceptsSource (Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Z + public fun getSupportedTypes ()Ljava/util/Set; + public fun getTestOrder ()I + public fun readDataFrameOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; + public fun readDataFrameSchemaOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema; + public fun readDataSchemaCodeOrNull-myXLQ2E (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Ljava/lang/String; + public fun toString ()Ljava/lang/String; +} + +public final class org/jetbrains/kotlinx/dataframe/io/Parquet$Companion { +} + +public final class org/jetbrains/kotlinx/dataframe/io/Parquet$ReadOptions : org/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions { + public fun ()V + public fun (Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;J)V + public synthetic fun (Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;JILkotlin/jvm/internal/DefaultConstructorMarker;)V + public final fun component1 
()Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions; + public final fun component2 ()J + public final fun copy (Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;J)Lorg/jetbrains/kotlinx/dataframe/io/Parquet$ReadOptions; + public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/io/Parquet$ReadOptions;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;JILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/Parquet$ReadOptions; + public fun equals (Ljava/lang/Object;)Z + public final fun getBatchSize ()J + public final fun getNullability ()Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions; + public fun hashCode ()I + public fun toString ()Ljava/lang/String; +} + diff --git a/dataframe-arrow/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/arrowReading.kt b/dataframe-arrow/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/arrowReading.kt index 5cf884d73d..5dfe90ad21 100644 --- a/dataframe-arrow/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/arrowReading.kt +++ b/dataframe-arrow/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/arrowReading.kt @@ -18,6 +18,9 @@ import java.nio.channels.ReadableByteChannel import java.nio.channels.SeekableByteChannel import java.nio.file.Files import java.nio.file.Path +import kotlin.reflect.KType +import kotlin.reflect.full.isSubtypeOf +import kotlin.reflect.typeOf public class ArrowFeather : SupportedDataFrameFormat { override fun readDataFrame(stream: InputStream, header: List): AnyFrame = @@ -36,6 +39,298 @@ public class ArrowFeather : SupportedDataFrameFormat { DefaultReadArrowMethod(pathRepresentation) } +/** + * [DataFrameReadSource] for [Arrow Feather files][DataFrame.readArrowFeather] (random-access IPC format). + * + * Supported source types: + * - References: [URL], [Path], [File] + * - In-memory: [SeekableByteChannel], [ByteArray], [InputStream], [ArrowReader] + * + * Default-accepts the `.feather` extension. 
To read with no extension hint (e.g., an [InputStream]) pass + * an [ReadOptions] instance to disambiguate from text formats. + */ +public class ArrowFeatherNEW : DataFrameReadSource { + + public data class ReadOptions(val nullability: NullabilityOptions) : DataFrameReadOptions { + public companion object { + public operator fun invoke( + nullability: NullabilityOptions = NullabilityOptions.Infer, + ): ReadOptions = + ReadOptions( + nullability = nullability, + ) + } + } + + override val supportedReadingTypes: Set = + setOf( + typeOf(), + typeOf(), + typeOf(), + typeOf(), + typeOf(), + typeOf(), + typeOf(), + ) + + public companion object { + internal const val EXTENSION: String = "feather" + } + + override fun acceptsSource(sourceInfo: DataSourceInfo, options: DataFrameReadOptions?): Boolean { + if (options != null && options !is ReadOptions) return false + if (sourceInfo.extension?.lowercase()?.equals(EXTENSION) == false) return false + return supportedReadingTypes.any { sourceInfo.kType.isSubtypeOf(it) } + } + + override fun readDataFrame( + source: Any, + sourceInfo: DataSourceInfo, + options: DataFrameReadOptions?, + ): Result> = + runCatching { + val opts = (options ?: ReadOptions()) as ReadOptions + val kType = sourceInfo.kType + + // ArrowReader is exclusive; check before more general types. + if (kType.isSubTypeOf()) { + return@runCatching DataFrame.readArrow(source as ArrowReader, opts.nullability) + } + + val url: URL? = when { + kType.isSubTypeOf() -> source as? URL + kType.isSubTypeOf() -> (source as? Path)?.toUri()?.toURL() + kType.isSubTypeOf() -> (source as? 
File)?.toPath()?.toUri()?.toURL() + else -> null + } + if (url != null) { + return@runCatching DataFrame.readArrowFeather(url, opts.nullability) + } + + return@runCatching when { + kType.isSubTypeOf() -> + DataFrame.readArrowFeather(source as SeekableByteChannel, nullability = opts.nullability) + + kType.isSubTypeOf() -> + DataFrame.readArrowFeather(source as ByteArray, opts.nullability) + + kType.isSubTypeOf() -> + DataFrame.readArrowFeather(source as InputStream, opts.nullability) + + else -> { + // return the exception without throwing it; cheaper + @Suppress("RedundantReturnKeyword") + return Result.failure(IllegalStateException("Cannot read source of type $kType as Arrow Feather")) + } + } + } + + override val testOrder: Int = 60_000 + + override fun toString(): String = "ArrowFeather" +} + +public val DataFrameReadOptions.Companion.ArrowFeather: + org.jetbrains.kotlinx.dataframe.io.ArrowFeatherNEW.ReadOptions.Companion + get() = org.jetbrains.kotlinx.dataframe.io.ArrowFeatherNEW.ReadOptions.Companion + +/** + * [DataFrameReadSource] for [Arrow IPC streaming files][DataFrame.readArrowIPC]. + * + * Supported source types: + * - References: [URL], [Path], [File] + * - In-memory: [InputStream], [ByteArray], [ReadableByteChannel], [ArrowReader] + * + * There's no widely-standardized extension for IPC streaming files (`.arrow` is most common but is also + * used for random-access Feather), so this format accepts the `.arrow` extension. [ArrowFeatherNEW] only + * accepts `.feather`, so a random-access file named `.arrow` should be read with [DataFrame.readArrowFeather] + * directly. Sources without an extension match both formats: [ArrowFeatherNEW] runs first by [testOrder] + * and a Feather read of a streaming-format file will throw, letting the framework fall through to [ArrowIPC]. 
+ */ +public class ArrowIPC : DataFrameReadSource { + + public data class ReadOptions( + val allocator: RootAllocator, + val nullability: NullabilityOptions, + ) : DataFrameReadOptions { + public companion object { + public operator fun invoke( + allocator: RootAllocator = Allocator.ROOT, + nullability: NullabilityOptions = NullabilityOptions.Infer, + ): ReadOptions = + ReadOptions( + allocator = allocator, + nullability = nullability, + ) + } + } + + override val supportedReadingTypes: Set = + setOf( + typeOf(), + typeOf(), + typeOf(), + typeOf(), + typeOf(), + typeOf(), + typeOf(), + ) + + public companion object { + internal const val EXTENSION: String = "arrow" + } + + override fun acceptsSource(sourceInfo: DataSourceInfo, options: DataFrameReadOptions?): Boolean { + if (options != null && options !is ReadOptions) return false + if (sourceInfo.extension?.lowercase()?.equals(EXTENSION) == false) return false + return supportedReadingTypes.any { sourceInfo.kType.isSubtypeOf(it) } + } + + override fun readDataFrame( + source: Any, + sourceInfo: DataSourceInfo, + options: DataFrameReadOptions?, + ): Result> = + runCatching { + val opts = (options ?: ReadOptions()) as ReadOptions + val kType = sourceInfo.kType + + if (kType.isSubTypeOf()) { + return@runCatching DataFrame.readArrow(source as ArrowReader, opts.nullability) + } + + val url: URL? = when { + kType.isSubTypeOf() -> source as? URL + kType.isSubTypeOf() -> (source as? Path)?.toUri()?.toURL() + kType.isSubTypeOf() -> (source as? 
File)?.toPath()?.toUri()?.toURL() + else -> null + } + if (url != null) { + return@runCatching DataFrame.readArrowIPC(url, opts.nullability) + } + + return@runCatching when { + kType.isSubTypeOf() -> + DataFrame.readArrowIPC(source as ReadableByteChannel, opts.allocator, opts.nullability) + + kType.isSubTypeOf() -> + DataFrame.readArrowIPC(source as ByteArray, opts.nullability) + + kType.isSubTypeOf() -> + DataFrame.readArrowIPC(source as InputStream, opts.nullability) + + else -> { + // return the exception without throwing it; cheaper + @Suppress("RedundantReturnKeyword") + return Result.failure(IllegalStateException("Cannot read source of type $kType as Arrow IPC")) + } + } + } + + // Runs after ArrowFeatherNEW: sources without an extension hint are accepted by both formats, so the + // random-access reader is tried first; if Feather reading throws on an IPC streaming file the framework + // falls through to here. + override val testOrder: Int = 60_100 + + override fun toString(): String = "ArrowIPC" +} + +public val DataFrameReadOptions.Companion.ArrowIPC: org.jetbrains.kotlinx.dataframe.io.ArrowIPC.ReadOptions.Companion + get() = org.jetbrains.kotlinx.dataframe.io.ArrowIPC.ReadOptions.Companion + +/** + * [DataFrameReadSource] for Apache Parquet files (read via Arrow Dataset). + * + * Arrow Dataset only consumes URIs, so only reference-style sources are supported: + * - References: [URL], [Path], [File] + * + * TODO? Multi-file Parquet datasets (vararg in [DataFrame.readParquet]) aren't covered by this single-source API; + * use [DataFrame.readParquet] directly for those. 
+ */ +public class Parquet : DataFrameReadSource { + + public data class ReadOptions( + val nullability: NullabilityOptions, + val batchSize: Long, + ) : DataFrameReadOptions { + public companion object { + public operator fun invoke( + nullability: NullabilityOptions = NullabilityOptions.Infer, + batchSize: Long = ARROW_PARQUET_DEFAULT_BATCH_SIZE, + ): ReadOptions = + ReadOptions( + nullability = nullability, + batchSize = batchSize, + ) + } + } + + override val supportedReadingTypes: Set = + setOf(typeOf(), typeOf(), typeOf()) + + public companion object { + internal const val EXTENSION: String = "parquet" + internal val MIME_TYPES = setOf( + "application/x-parquet", + "application/parquet", + ) + } + + override fun acceptsSource(sourceInfo: DataSourceInfo, options: DataFrameReadOptions?): Boolean { + if (options != null && options !is ReadOptions) return false + if (sourceInfo.extension?.lowercase()?.equals(EXTENSION) == false) return false + if (sourceInfo.mimeType != null && sourceInfo.mimeType !in MIME_TYPES) return false + return supportedReadingTypes.any { sourceInfo.kType.isSubtypeOf(it) } + } + + override fun readDataFrame( + source: Any, + sourceInfo: DataSourceInfo, + options: DataFrameReadOptions?, + ): Result> = + runCatching { + val opts = (options ?: ReadOptions()) as ReadOptions + val kType = sourceInfo.kType + return@runCatching when { + kType.isSubTypeOf() -> + DataFrame.readParquet( + source as URL, + nullability = opts.nullability, + batchSize = opts.batchSize, + ) + + kType.isSubTypeOf() -> + DataFrame.readParquet( + source as Path, + nullability = opts.nullability, + batchSize = opts.batchSize, + ) + + kType.isSubTypeOf() -> + DataFrame.readParquet( + source as File, + nullability = opts.nullability, + batchSize = opts.batchSize, + ) + + else -> { + // return the exception without throwing it; cheaper + @Suppress("RedundantReturnKeyword") + return Result.failure(IllegalStateException("Cannot read source of type $kType as Parquet")) + } + } + 
} + + override val testOrder: Int = 60_500 + + override fun toString(): String = "Parquet" +} + +public val DataFrameReadOptions.Companion.Parquet: org.jetbrains.kotlinx.dataframe.io.Parquet.ReadOptions.Companion + get() = org.jetbrains.kotlinx.dataframe.io.Parquet.ReadOptions.Companion + +private inline fun KType.isSubTypeOf(): Boolean = this.isSubtypeOf(typeOf()) + private const val READ_ARROW_FEATHER = "readArrowFeather" internal const val ARROW_PARQUET_DEFAULT_BATCH_SIZE = 32768L diff --git a/dataframe-arrow/src/main/resources/META-INF/services/org.jetbrains.kotlinx.dataframe.io.DataFrameReadSource b/dataframe-arrow/src/main/resources/META-INF/services/org.jetbrains.kotlinx.dataframe.io.DataFrameReadSource new file mode 100644 index 0000000000..ef8466f22f --- /dev/null +++ b/dataframe-arrow/src/main/resources/META-INF/services/org.jetbrains.kotlinx.dataframe.io.DataFrameReadSource @@ -0,0 +1,3 @@ +org.jetbrains.kotlinx.dataframe.io.ArrowIPC +org.jetbrains.kotlinx.dataframe.io.ArrowFeatherNEW +org.jetbrains.kotlinx.dataframe.io.Parquet diff --git a/dataframe-csv/api/dataframe-csv.api b/dataframe-csv/api/dataframe-csv.api index a9a964783c..978124af85 100644 --- a/dataframe-csv/api/dataframe-csv.api +++ b/dataframe-csv/api/dataframe-csv.api @@ -1,3 +1,57 @@ +public final class org/jetbrains/kotlinx/dataframe/io/Csv : org/jetbrains/kotlinx/dataframe/io/DataFrameReadSource { + public static final field Companion Lorg/jetbrains/kotlinx/dataframe/io/Csv$Companion; + public fun ()V + public fun acceptsSource (Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Z + public fun getSupportedTypes ()Ljava/util/Set; + public fun getTestOrder ()I + public fun readDataFrameOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; + public fun readDataFrameSchemaOrNull 
(Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema; + public fun readDataSchemaCodeOrNull-myXLQ2E (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Ljava/lang/String; + public fun toString ()Ljava/lang/String; +} + +public final class org/jetbrains/kotlinx/dataframe/io/Csv$Companion { +} + +public final class org/jetbrains/kotlinx/dataframe/io/Csv$ReadOptions : org/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions { + public fun ()V + public fun (CLjava/util/List;Ljava/nio/charset/Charset;Ljava/util/Map;JLjava/lang/Long;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;ZZZCZZZ)V + public synthetic fun (CLjava/util/List;Ljava/nio/charset/Charset;Ljava/util/Map;JLjava/lang/Long;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;ZZZCZZZILkotlin/jvm/internal/DefaultConstructorMarker;)V + public final fun component1 ()C + public final fun component10 ()Z + public final fun component11 ()C + public final fun component12 ()Z + public final fun component13 ()Z + public final fun component14 ()Z + public final fun component2 ()Ljava/util/List; + public final fun component3 ()Ljava/nio/charset/Charset; + public final fun component4 ()Ljava/util/Map; + public final fun component5 ()J + public final fun component6 ()Ljava/lang/Long; + public final fun component7 ()Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions; + public final fun component8 ()Z + public final fun component9 ()Z + public final fun copy (CLjava/util/List;Ljava/nio/charset/Charset;Ljava/util/Map;JLjava/lang/Long;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;ZZZCZZZ)Lorg/jetbrains/kotlinx/dataframe/io/Csv$ReadOptions; + public static synthetic fun copy$default 
(Lorg/jetbrains/kotlinx/dataframe/io/Csv$ReadOptions;CLjava/util/List;Ljava/nio/charset/Charset;Ljava/util/Map;JLjava/lang/Long;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;ZZZCZZZILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/Csv$ReadOptions; + public fun equals (Ljava/lang/Object;)Z + public final fun getAllowMissingColumns ()Z + public final fun getCharset ()Ljava/nio/charset/Charset; + public final fun getColTypes ()Ljava/util/Map; + public final fun getDelimiter ()C + public final fun getHeader ()Ljava/util/List; + public final fun getIgnoreEmptyLines ()Z + public final fun getIgnoreExcessColumns ()Z + public final fun getIgnoreSurroundingSpaces ()Z + public final fun getParseParallel ()Z + public final fun getParserOptions ()Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions; + public final fun getQuote ()C + public final fun getReadLines ()Ljava/lang/Long; + public final fun getSkipLines ()J + public final fun getTrimInsideQuoted ()Z + public fun hashCode ()I + public fun toString ()Ljava/lang/String; +} + public final class org/jetbrains/kotlinx/dataframe/io/CsvDeephaven : org/jetbrains/kotlinx/dataframe/io/SupportedDataFrameFormat { public fun ()V public fun (C)V @@ -121,6 +175,60 @@ public final class org/jetbrains/kotlinx/dataframe/io/ToTsvStrKt { public static synthetic fun toTsvStr$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame;ZCLjava/lang/Character;Lorg/jetbrains/kotlinx/dataframe/io/QuoteMode;Ljava/lang/Character;Ljava/lang/Character;Ljava/util/List;Ljava/lang/String;ILjava/lang/Object;)Ljava/lang/String; } +public final class org/jetbrains/kotlinx/dataframe/io/Tsv : org/jetbrains/kotlinx/dataframe/io/DataFrameReadSource { + public static final field Companion Lorg/jetbrains/kotlinx/dataframe/io/Tsv$Companion; + public fun ()V + public fun acceptsSource (Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Z + public fun getSupportedTypes ()Ljava/util/Set; + public fun 
getTestOrder ()I + public fun readDataFrameOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; + public fun readDataFrameSchemaOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema; + public fun readDataSchemaCodeOrNull-myXLQ2E (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Ljava/lang/String; + public fun toString ()Ljava/lang/String; +} + +public final class org/jetbrains/kotlinx/dataframe/io/Tsv$Companion { +} + +public final class org/jetbrains/kotlinx/dataframe/io/Tsv$ReadOptions : org/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions { + public fun ()V + public fun (CLjava/util/List;Ljava/nio/charset/Charset;Ljava/util/Map;JLjava/lang/Long;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;ZZZCZZZ)V + public synthetic fun (CLjava/util/List;Ljava/nio/charset/Charset;Ljava/util/Map;JLjava/lang/Long;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;ZZZCZZZILkotlin/jvm/internal/DefaultConstructorMarker;)V + public final fun component1 ()C + public final fun component10 ()Z + public final fun component11 ()C + public final fun component12 ()Z + public final fun component13 ()Z + public final fun component14 ()Z + public final fun component2 ()Ljava/util/List; + public final fun component3 ()Ljava/nio/charset/Charset; + public final fun component4 ()Ljava/util/Map; + public final fun component5 ()J + public final fun component6 ()Ljava/lang/Long; + public final fun component7 ()Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions; + public final fun component8 ()Z + public final fun component9 ()Z + public final fun copy 
(CLjava/util/List;Ljava/nio/charset/Charset;Ljava/util/Map;JLjava/lang/Long;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;ZZZCZZZ)Lorg/jetbrains/kotlinx/dataframe/io/Tsv$ReadOptions; + public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/io/Tsv$ReadOptions;CLjava/util/List;Ljava/nio/charset/Charset;Ljava/util/Map;JLjava/lang/Long;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;ZZZCZZZILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/Tsv$ReadOptions; + public fun equals (Ljava/lang/Object;)Z + public final fun getAllowMissingColumns ()Z + public final fun getCharset ()Ljava/nio/charset/Charset; + public final fun getColTypes ()Ljava/util/Map; + public final fun getDelimiter ()C + public final fun getHeader ()Ljava/util/List; + public final fun getIgnoreEmptyLines ()Z + public final fun getIgnoreExcessColumns ()Z + public final fun getIgnoreSurroundingSpaces ()Z + public final fun getParseParallel ()Z + public final fun getParserOptions ()Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions; + public final fun getQuote ()C + public final fun getReadLines ()Ljava/lang/Long; + public final fun getSkipLines ()J + public final fun getTrimInsideQuoted ()Z + public fun hashCode ()I + public fun toString ()Ljava/lang/String; +} + public final class org/jetbrains/kotlinx/dataframe/io/TsvDeephaven : org/jetbrains/kotlinx/dataframe/io/SupportedDataFrameFormat { public fun ()V public fun (C)V diff --git a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/readDelim.kt b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/readDelim.kt index 34bf7aab0d..cd9230b610 100644 --- a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/readDelim.kt +++ b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/readDelim.kt @@ -231,8 +231,16 @@ private fun CsvReader.ResultColumn.toDataColumn( val givenSkipTypes = parserOptions?.skipTypes ?: DataFrame.parser.skipTypes // no need to check 
for types that Deephaven already parses, skip those too val adjustedSkipTypes = givenSkipTypes + typesDeephavenAlreadyParses + + val parseDfReadSource = parserOptions?.parseToDataFrameReadSource + ?: DataFrame.parser.parseToDataFrameReadSource.takeIf { it } // only take if adjusted to 'true' + ?: true + val adjustedParserOptions = (parserOptions ?: ParserOptions()) - .copy(skipTypes = adjustedSkipTypes) + .copy( + skipTypes = adjustedSkipTypes, + parseToDataFrameReadSource = parseDfReadSource, + ) column.tryParse(adjustedParserOptions) } diff --git a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/csv.kt b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/csv.kt index 801fd3a717..470062731c 100644 --- a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/csv.kt +++ b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/csv.kt @@ -3,12 +3,17 @@ package org.jetbrains.kotlinx.dataframe.io import org.jetbrains.kotlinx.dataframe.DataFrame +import org.jetbrains.kotlinx.dataframe.api.ParserOptions import org.jetbrains.kotlinx.dataframe.codeGen.AbstractDefaultReadMethod import org.jetbrains.kotlinx.dataframe.codeGen.DefaultReadDfMethod import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams import java.io.File import java.io.InputStream +import java.net.URL +import java.nio.charset.Charset import java.nio.file.Path +import kotlin.reflect.KType +import kotlin.reflect.full.isSubtypeOf import kotlin.reflect.typeOf public class CsvDeephaven(private val delimiter: Char = DelimParams.CSV_DELIMITER) : SupportedDataFrameFormat { @@ -33,6 +38,176 @@ public class CsvDeephaven(private val delimiter: Char = DelimParams.CSV_DELIMITE } } +public class Csv : DataFrameReadSource { + + public data class ReadOptions( + val delimiter: Char, + val header: List, + val charset: Charset?, + val colTypes: Map, + val skipLines: Long, + val readLines: Long?, + val parserOptions: ParserOptions?, + val ignoreEmptyLines: Boolean, + 
val allowMissingColumns: Boolean, + val ignoreExcessColumns: Boolean, + val quote: Char, + val ignoreSurroundingSpaces: Boolean, + val trimInsideQuoted: Boolean, + val parseParallel: Boolean, + ) : DataFrameReadOptions { + public companion object { + public operator fun invoke( + delimiter: Char = DelimParams.CSV_DELIMITER, + header: List = DelimParams.HEADER, + charset: Charset? = DelimParams.CHARSET, + colTypes: Map = DelimParams.COL_TYPES, + skipLines: Long = DelimParams.SKIP_LINES, + readLines: Long? = DelimParams.READ_LINES, + parserOptions: ParserOptions? = DelimParams.PARSER_OPTIONS, + ignoreEmptyLines: Boolean = DelimParams.IGNORE_EMPTY_LINES, + allowMissingColumns: Boolean = DelimParams.ALLOW_MISSING_COLUMNS, + ignoreExcessColumns: Boolean = DelimParams.IGNORE_EXCESS_COLUMNS, + quote: Char = DelimParams.QUOTE, + ignoreSurroundingSpaces: Boolean = DelimParams.IGNORE_SURROUNDING_SPACES, + trimInsideQuoted: Boolean = DelimParams.TRIM_INSIDE_QUOTED, + parseParallel: Boolean = DelimParams.PARSE_PARALLEL, + ): ReadOptions = + ReadOptions( + delimiter = delimiter, + header = header, + charset = charset, + colTypes = colTypes, + skipLines = skipLines, + readLines = readLines, + parserOptions = parserOptions, + ignoreEmptyLines = ignoreEmptyLines, + allowMissingColumns = allowMissingColumns, + ignoreExcessColumns = ignoreExcessColumns, + quote = quote, + ignoreSurroundingSpaces = ignoreSurroundingSpaces, + trimInsideQuoted = trimInsideQuoted, + parseParallel = parseParallel, + ) + } + } + + override val supportedReadingTypes: Set = + setOf(typeOf(), typeOf(), typeOf(), typeOf(), typeOf()) + + public companion object { + internal val EXTENSIONS = setOf("csv", "zip", "gz") + internal val MIME_TYPES = setOf( + "text/csv", + "application/zip", + "application/gzip", + ) + } + + override fun acceptsSource(sourceInfo: DataSourceInfo, options: DataFrameReadOptions?): Boolean { + if (options != null && options !is ReadOptions) return false + if (sourceInfo.extension != null 
&& sourceInfo.extension !in EXTENSIONS) return false + if (sourceInfo.mimeType != null && sourceInfo.mimeType !in MIME_TYPES) return false + return supportedReadingTypes.any { sourceInfo.kType.isSubtypeOf(it) } + } + + override fun readDataFrame( + source: Any, + sourceInfo: DataSourceInfo, + options: DataFrameReadOptions?, + ): Result> = + runCatching { + val opts = (options ?: ReadOptions()) as ReadOptions + val kType = sourceInfo.kType + + val url: URL? = when { + kType.isSubTypeOf() -> source as? URL + kType.isSubTypeOf() -> (source as? Path)?.toUri()?.toURL() + kType.isSubTypeOf() -> (source as? File)?.toPath()?.toUri()?.toURL() + else -> null + } + if (url != null) { + return@runCatching DataFrame.readCsv( + url = url, + delimiter = opts.delimiter, + header = opts.header, + charset = opts.charset, + colTypes = opts.colTypes, + skipLines = opts.skipLines, + readLines = opts.readLines, + parserOptions = opts.parserOptions, + ignoreEmptyLines = opts.ignoreEmptyLines, + allowMissingColumns = opts.allowMissingColumns, + ignoreExcessColumns = opts.ignoreExcessColumns, + quote = opts.quote, + ignoreSurroundingSpaces = opts.ignoreSurroundingSpaces, + trimInsideQuoted = opts.trimInsideQuoted, + parseParallel = opts.parseParallel, + ) + } + + @Suppress("RedundantReturnKeyword") + return@runCatching when { + kType.isSubTypeOf() -> { + DataFrame.readCsv( + inputStream = source as InputStream, + delimiter = opts.delimiter, + header = opts.header, + charset = opts.charset, + colTypes = opts.colTypes, + skipLines = opts.skipLines, + readLines = opts.readLines, + parserOptions = opts.parserOptions, + ignoreEmptyLines = opts.ignoreEmptyLines, + allowMissingColumns = opts.allowMissingColumns, + ignoreExcessColumns = opts.ignoreExcessColumns, + quote = opts.quote, + ignoreSurroundingSpaces = opts.ignoreSurroundingSpaces, + trimInsideQuoted = opts.trimInsideQuoted, + parseParallel = opts.parseParallel, + ) + } + + kType.isSubTypeOf() -> { + // early fail + if (opts.delimiter !in 
source as String) { + return Result.failure( + IllegalStateException("String does not contain delimiter '${opts.delimiter}'"), + ) + } + + DataFrame.readCsvStr( + text = source, + delimiter = opts.delimiter, + header = opts.header, + colTypes = opts.colTypes, + skipLines = opts.skipLines, + readLines = opts.readLines, + parserOptions = opts.parserOptions, + ignoreEmptyLines = opts.ignoreEmptyLines, + allowMissingColumns = opts.allowMissingColumns, + ignoreExcessColumns = opts.ignoreExcessColumns, + quote = opts.quote, + ignoreSurroundingSpaces = opts.ignoreSurroundingSpaces, + trimInsideQuoted = opts.trimInsideQuoted, + parseParallel = opts.parseParallel, + ) + } + + else -> return Result.failure(IllegalStateException("Cannot read source of type $kType as CSV")) + } + } + + override val testOrder: Int = 20_000 + + override fun toString(): String = "Csv" +} + +public val DataFrameReadOptions.Companion.Csv: org.jetbrains.kotlinx.dataframe.io.Csv.ReadOptions.Companion + get() = org.jetbrains.kotlinx.dataframe.io.Csv.ReadOptions.Companion + +private inline fun KType.isSubTypeOf(): Boolean = this.isSubtypeOf(typeOf()) + private const val READ_CSV = "readCsv" internal class DefaultReadCsvMethod(path: String?, arguments: MethodArguments) : diff --git a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/tsv.kt b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/tsv.kt index ecb123b93c..ab34461559 100644 --- a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/tsv.kt +++ b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/tsv.kt @@ -3,12 +3,17 @@ package org.jetbrains.kotlinx.dataframe.io import org.jetbrains.kotlinx.dataframe.DataFrame +import org.jetbrains.kotlinx.dataframe.api.ParserOptions import org.jetbrains.kotlinx.dataframe.codeGen.AbstractDefaultReadMethod import org.jetbrains.kotlinx.dataframe.codeGen.DefaultReadDfMethod import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams import java.io.File 
import java.io.InputStream +import java.net.URL +import java.nio.charset.Charset import java.nio.file.Path +import kotlin.reflect.KType +import kotlin.reflect.full.isSubtypeOf import kotlin.reflect.typeOf public class TsvDeephaven(private val delimiter: Char = DelimParams.TSV_DELIMITER) : SupportedDataFrameFormat { @@ -33,6 +38,175 @@ public class TsvDeephaven(private val delimiter: Char = DelimParams.TSV_DELIMITE } } +public class Tsv : DataFrameReadSource { + + public data class ReadOptions( + val delimiter: Char, + val header: List, + val charset: Charset?, + val colTypes: Map, + val skipLines: Long, + val readLines: Long?, + val parserOptions: ParserOptions?, + val ignoreEmptyLines: Boolean, + val allowMissingColumns: Boolean, + val ignoreExcessColumns: Boolean, + val quote: Char, + val ignoreSurroundingSpaces: Boolean, + val trimInsideQuoted: Boolean, + val parseParallel: Boolean, + ) : DataFrameReadOptions { + public companion object { + public operator fun invoke( + delimiter: Char = DelimParams.TSV_DELIMITER, + header: List = DelimParams.HEADER, + charset: Charset? = DelimParams.CHARSET, + colTypes: Map = DelimParams.COL_TYPES, + skipLines: Long = DelimParams.SKIP_LINES, + readLines: Long? = DelimParams.READ_LINES, + parserOptions: ParserOptions? 
= DelimParams.PARSER_OPTIONS, + ignoreEmptyLines: Boolean = DelimParams.IGNORE_EMPTY_LINES, + allowMissingColumns: Boolean = DelimParams.ALLOW_MISSING_COLUMNS, + ignoreExcessColumns: Boolean = DelimParams.IGNORE_EXCESS_COLUMNS, + quote: Char = DelimParams.QUOTE, + ignoreSurroundingSpaces: Boolean = DelimParams.IGNORE_SURROUNDING_SPACES, + trimInsideQuoted: Boolean = DelimParams.TRIM_INSIDE_QUOTED, + parseParallel: Boolean = DelimParams.PARSE_PARALLEL, + ): ReadOptions = + ReadOptions( + delimiter = delimiter, + header = header, + charset = charset, + colTypes = colTypes, + skipLines = skipLines, + readLines = readLines, + parserOptions = parserOptions, + ignoreEmptyLines = ignoreEmptyLines, + allowMissingColumns = allowMissingColumns, + ignoreExcessColumns = ignoreExcessColumns, + quote = quote, + ignoreSurroundingSpaces = ignoreSurroundingSpaces, + trimInsideQuoted = trimInsideQuoted, + parseParallel = parseParallel, + ) + } + } + + override val supportedReadingTypes: Set = + setOf(typeOf(), typeOf(), typeOf(), typeOf(), typeOf()) + + public companion object { + internal val EXTENSIONS = setOf("tsv", "zip", "gz") + internal val MIME_TYPE = setOf( + "text/tab-separated-values", + "application/zip", + "application/gzip", + ) + } + + override fun acceptsSource(sourceInfo: DataSourceInfo, options: DataFrameReadOptions?): Boolean { + if (options != null && options !is ReadOptions) return false + if (sourceInfo.extension != null && sourceInfo.extension !in EXTENSIONS) return false + if (sourceInfo.mimeType != null && sourceInfo.mimeType !in MIME_TYPE) return false + return supportedReadingTypes.any { sourceInfo.kType.isSubtypeOf(it) } + } + + override fun readDataFrame( + source: Any, + sourceInfo: DataSourceInfo, + options: DataFrameReadOptions?, + ): Result> = + runCatching { + val opts = (options ?: ReadOptions()) as ReadOptions + val kType = sourceInfo.kType + + val url: URL? = when { + kType.isSubTypeOf() -> source as? URL + kType.isSubTypeOf() -> (source as? 
Path)?.toUri()?.toURL() + kType.isSubTypeOf() -> (source as? File)?.toPath()?.toUri()?.toURL() + else -> null + } + if (url != null) { + return@runCatching DataFrame.readTsv( + url = url, + delimiter = opts.delimiter, + header = opts.header, + charset = opts.charset, + colTypes = opts.colTypes, + skipLines = opts.skipLines, + readLines = opts.readLines, + parserOptions = opts.parserOptions, + ignoreEmptyLines = opts.ignoreEmptyLines, + allowMissingColumns = opts.allowMissingColumns, + ignoreExcessColumns = opts.ignoreExcessColumns, + quote = opts.quote, + ignoreSurroundingSpaces = opts.ignoreSurroundingSpaces, + trimInsideQuoted = opts.trimInsideQuoted, + parseParallel = opts.parseParallel, + ) + } + + when { + kType.isSubTypeOf() -> { + DataFrame.readTsv( + inputStream = source as InputStream, + delimiter = opts.delimiter, + header = opts.header, + charset = opts.charset, + colTypes = opts.colTypes, + skipLines = opts.skipLines, + readLines = opts.readLines, + parserOptions = opts.parserOptions, + ignoreEmptyLines = opts.ignoreEmptyLines, + allowMissingColumns = opts.allowMissingColumns, + ignoreExcessColumns = opts.ignoreExcessColumns, + quote = opts.quote, + ignoreSurroundingSpaces = opts.ignoreSurroundingSpaces, + trimInsideQuoted = opts.trimInsideQuoted, + parseParallel = opts.parseParallel, + ) + } + + kType.isSubTypeOf() -> { + // early fail + if (opts.delimiter !in source as String) { + return Result.failure( + IllegalStateException("String does not contain delimiter '${opts.delimiter}'"), + ) + } + + DataFrame.readTsvStr( + text = source, + delimiter = opts.delimiter, + header = opts.header, + colTypes = opts.colTypes, + skipLines = opts.skipLines, + readLines = opts.readLines, + parserOptions = opts.parserOptions, + ignoreEmptyLines = opts.ignoreEmptyLines, + allowMissingColumns = opts.allowMissingColumns, + ignoreExcessColumns = opts.ignoreExcessColumns, + quote = opts.quote, + ignoreSurroundingSpaces = opts.ignoreSurroundingSpaces, + trimInsideQuoted = 
opts.trimInsideQuoted, + parseParallel = opts.parseParallel, + ) + } + + else -> return Result.failure(IllegalStateException("Cannot read source of type $kType as TSV")) + } + } + + override val testOrder: Int = 30_000 + + override fun toString(): String = "Tsv" +} + +public val DataFrameReadOptions.Companion.Tsv: org.jetbrains.kotlinx.dataframe.io.Tsv.ReadOptions.Companion + get() = org.jetbrains.kotlinx.dataframe.io.Tsv.ReadOptions.Companion + +private inline fun KType.isSubTypeOf(): Boolean = this.isSubtypeOf(typeOf()) + private const val READ_TSV = "readTsv" internal class DefaultReadTsvMethod(path: String?, arguments: MethodArguments) : diff --git a/dataframe-csv/src/main/resources/META-INF/services/org.jetbrains.kotlinx.dataframe.io.DataFrameReadSource b/dataframe-csv/src/main/resources/META-INF/services/org.jetbrains.kotlinx.dataframe.io.DataFrameReadSource new file mode 100644 index 0000000000..ebcae7710f --- /dev/null +++ b/dataframe-csv/src/main/resources/META-INF/services/org.jetbrains.kotlinx.dataframe.io.DataFrameReadSource @@ -0,0 +1,2 @@ +org.jetbrains.kotlinx.dataframe.io.Csv +org.jetbrains.kotlinx.dataframe.io.Tsv diff --git a/dataframe-excel/api/dataframe-excel.api b/dataframe-excel/api/dataframe-excel.api index 06541cf3c9..9d29cb5790 100644 --- a/dataframe-excel/api/dataframe-excel.api +++ b/dataframe-excel/api/dataframe-excel.api @@ -9,6 +9,47 @@ public final class org/jetbrains/kotlinx/dataframe/io/Excel : org/jetbrains/kotl public fun readDataFrame (Ljava/nio/file/Path;Ljava/util/List;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; } +public final class org/jetbrains/kotlinx/dataframe/io/ExcelNEW : org/jetbrains/kotlinx/dataframe/io/DataFrameReadSource { + public static final field Companion Lorg/jetbrains/kotlinx/dataframe/io/ExcelNEW$Companion; + public fun ()V + public fun acceptsSource (Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Z + public fun getSupportedTypes 
()Ljava/util/Set; + public fun getTestOrder ()I + public fun readDataFrameOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; + public fun readDataFrameSchemaOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema; + public fun readDataSchemaCodeOrNull-myXLQ2E (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Ljava/lang/String; + public fun toString ()Ljava/lang/String; +} + +public final class org/jetbrains/kotlinx/dataframe/io/ExcelNEW$Companion { +} + +public final class org/jetbrains/kotlinx/dataframe/io/ExcelNEW$ReadOptions : org/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions { + public synthetic fun (Ljava/lang/String;ILjava/lang/String;Ljava/lang/String;Ljava/lang/Integer;Lorg/jetbrains/kotlinx/dataframe/io/NameRepairStrategy;ZZILkotlin/jvm/internal/DefaultConstructorMarker;)V + public synthetic fun (Ljava/lang/String;ILjava/lang/String;Ljava/lang/String;Ljava/lang/Integer;Lorg/jetbrains/kotlinx/dataframe/io/NameRepairStrategy;ZZLkotlin/jvm/internal/DefaultConstructorMarker;)V + public final fun component1 ()Ljava/lang/String; + public final fun component2 ()I + public final fun component3 ()Ljava/lang/String; + public final fun component4-358K8uM ()Ljava/lang/String; + public final fun component5 ()Ljava/lang/Integer; + public final fun component6 ()Lorg/jetbrains/kotlinx/dataframe/io/NameRepairStrategy; + public final fun component7 ()Z + public final fun component8 ()Z + public final fun copy-vOPuZIo (Ljava/lang/String;ILjava/lang/String;Ljava/lang/String;Ljava/lang/Integer;Lorg/jetbrains/kotlinx/dataframe/io/NameRepairStrategy;ZZ)Lorg/jetbrains/kotlinx/dataframe/io/ExcelNEW$ReadOptions; + public 
static synthetic fun copy-vOPuZIo$default (Lorg/jetbrains/kotlinx/dataframe/io/ExcelNEW$ReadOptions;Ljava/lang/String;ILjava/lang/String;Ljava/lang/String;Ljava/lang/Integer;Lorg/jetbrains/kotlinx/dataframe/io/NameRepairStrategy;ZZILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/ExcelNEW$ReadOptions; + public fun equals (Ljava/lang/Object;)Z + public final fun getColumns ()Ljava/lang/String; + public final fun getFirstRowIsHeader ()Z + public final fun getNameRepairStrategy ()Lorg/jetbrains/kotlinx/dataframe/io/NameRepairStrategy; + public final fun getParseEmptyAsNull ()Z + public final fun getRowsCount ()Ljava/lang/Integer; + public final fun getSheetName ()Ljava/lang/String; + public final fun getSkipRows ()I + public final fun getStringColumns-358K8uM ()Ljava/lang/String; + public fun hashCode ()I + public fun toString ()Ljava/lang/String; +} + public final class org/jetbrains/kotlinx/dataframe/io/FormattingOptions { public fun (Ljava/lang/String;Lorg/apache/poi/ss/usermodel/DataFormatter;)V public synthetic fun (Ljava/lang/String;Lorg/apache/poi/ss/usermodel/DataFormatter;ILkotlin/jvm/internal/DefaultConstructorMarker;)V diff --git a/dataframe-excel/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/xlsx.kt b/dataframe-excel/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/xlsx.kt index e76332bfc7..87f0a182f5 100644 --- a/dataframe-excel/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/xlsx.kt +++ b/dataframe-excel/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/xlsx.kt @@ -45,6 +45,9 @@ import kotlin.io.path.exists import kotlin.io.path.fileSize import kotlin.io.path.inputStream import kotlin.io.path.outputStream +import kotlin.reflect.KType +import kotlin.reflect.full.isSubtypeOf +import kotlin.reflect.typeOf import java.time.LocalDate as JavaLocalDate import java.time.LocalDateTime as JavaLocalDateTime import java.util.Date as JavaDate @@ -64,6 +67,160 @@ public class Excel : SupportedDataFrameFormat { 
DefaultReadExcelMethod(pathRepresentation) } +public class ExcelNEW : DataFrameReadSource { + + public data class ReadOptions( + val sheetName: String?, + val skipRows: Int, + val columns: String?, + val stringColumns: StringColumns?, + val rowsCount: Int?, + val nameRepairStrategy: NameRepairStrategy, + val firstRowIsHeader: Boolean, + val parseEmptyAsNull: Boolean, + ) : DataFrameReadOptions { + public companion object { + public operator fun invoke( + sheetName: String? = null, + skipRows: Int = 0, + columns: String? = null, + stringColumns: StringColumns? = null, + rowsCount: Int? = null, + nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE, + firstRowIsHeader: Boolean = true, + parseEmptyAsNull: Boolean = true, + ): ReadOptions = + ReadOptions( + sheetName = sheetName, + skipRows = skipRows, + columns = columns, + stringColumns = stringColumns, + rowsCount = rowsCount, + nameRepairStrategy = nameRepairStrategy, + firstRowIsHeader = firstRowIsHeader, + parseEmptyAsNull = parseEmptyAsNull, + ) + } + } + + // String reference paths are normalized to URL by readSourceImpl, so no String entry here; + // Excel is binary, so raw String content isn't a meaningful input either. 
+ override val supportedReadingTypes: Set = + setOf( + typeOf(), + typeOf(), + typeOf(), + typeOf(), + typeOf(), + typeOf(), + ) + + public companion object { + internal val EXTENSIONS: Set = setOf("xls", "xlsx") + internal val MIME_TYPES: Set = setOf( + "application/vnd.ms-excel", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "application/x-tika-ooxml", + "application/x-tika-msoffice", + ) + } + + override fun acceptsSource(sourceInfo: DataSourceInfo, options: DataFrameReadOptions?): Boolean { + if (options != null && options !is ReadOptions) return false + val ext = sourceInfo.extension?.lowercase() + if (ext != null && ext !in EXTENSIONS) return false + val mime = sourceInfo.mimeType?.lowercase() + if (mime != null && mime !in MIME_TYPES) return false + return supportedReadingTypes.any { sourceInfo.kType.isSubtypeOf(it) } + } + + override fun readDataFrame( + source: Any, + sourceInfo: DataSourceInfo, + options: DataFrameReadOptions?, + ): Result> = + runCatching { + val opts = (options ?: ReadOptions()) as ReadOptions + val kType = sourceInfo.kType + + val url: URL? = when { + kType.isSubTypeOf() -> source as? URL + kType.isSubTypeOf() -> (source as? Path)?.toUri()?.toURL() + kType.isSubTypeOf() -> (source as? 
File)?.toPath()?.toUri()?.toURL() + else -> null + } + if (url != null) { + return@runCatching DataFrame.readExcel( + url = url, + sheetName = opts.sheetName, + skipRows = opts.skipRows, + columns = opts.columns, + stringColumns = opts.stringColumns, + rowsCount = opts.rowsCount, + nameRepairStrategy = opts.nameRepairStrategy, + firstRowIsHeader = opts.firstRowIsHeader, + parseEmptyAsNull = opts.parseEmptyAsNull, + ) + } + + @Suppress("RedundantReturnKeyword") + return@runCatching when { + kType.isSubTypeOf() -> { + DataFrame.readExcel( + inputStream = source as InputStream, + sheetName = opts.sheetName, + skipRows = opts.skipRows, + columns = opts.columns, + stringColumns = opts.stringColumns, + rowsCount = opts.rowsCount, + nameRepairStrategy = opts.nameRepairStrategy, + firstRowIsHeader = opts.firstRowIsHeader, + parseEmptyAsNull = opts.parseEmptyAsNull, + ) + } + + kType.isSubTypeOf() -> { + DataFrame.readExcel( + wb = source as Workbook, + sheetName = opts.sheetName, + skipRows = opts.skipRows, + columns = opts.columns, + formattingOptions = opts.stringColumns?.toFormattingOptions(), + rowsCount = opts.rowsCount, + nameRepairStrategy = opts.nameRepairStrategy, + firstRowIsHeader = opts.firstRowIsHeader, + parseEmptyAsNull = opts.parseEmptyAsNull, + ) + } + + kType.isSubTypeOf() -> { + // readExcel(Sheet) has no sheetName parameter — the sheet is already selected. 
+ DataFrame.readExcel( + sheet = source as Sheet, + columns = opts.columns, + formattingOptions = opts.stringColumns?.toFormattingOptions(), + skipRows = opts.skipRows, + rowsCount = opts.rowsCount, + nameRepairStrategy = opts.nameRepairStrategy, + firstRowIsHeader = opts.firstRowIsHeader, + parseEmptyAsNull = opts.parseEmptyAsNull, + ) + } + + else -> return Result.failure(IllegalStateException("Cannot read source of type $kType as Excel")) + } + } + + override val testOrder: Int = 40_000 + + override fun toString(): String = "Xlsx" +} + +public val DataFrameReadOptions.Companion.Excel: org.jetbrains.kotlinx.dataframe.io.ExcelNEW.ReadOptions.Companion + get() = org.jetbrains.kotlinx.dataframe.io.ExcelNEW.ReadOptions.Companion + +private inline fun KType.isSubTypeOf(): Boolean = this.isSubtypeOf(typeOf()) + private const val MESSAGE_REMOVE_1_1 = "Will be removed in 1.1." internal const val READ_EXCEL_OLD = "This function is only here for binary compatibility. $MESSAGE_REMOVE_1_1" diff --git a/dataframe-excel/src/main/resources/META-INF/services/org.jetbrains.kotlinx.dataframe.io.DataFrameReadSource b/dataframe-excel/src/main/resources/META-INF/services/org.jetbrains.kotlinx.dataframe.io.DataFrameReadSource new file mode 100644 index 0000000000..9ceeadf039 --- /dev/null +++ b/dataframe-excel/src/main/resources/META-INF/services/org.jetbrains.kotlinx.dataframe.io.DataFrameReadSource @@ -0,0 +1 @@ +org.jetbrains.kotlinx.dataframe.io.ExcelNEW diff --git a/dataframe-jdbc/api/dataframe-jdbc.api b/dataframe-jdbc/api/dataframe-jdbc.api index 43b8f1cf82..104fab5d6c 100644 --- a/dataframe-jdbc/api/dataframe-jdbc.api +++ b/dataframe-jdbc/api/dataframe-jdbc.api @@ -25,6 +25,42 @@ public final class org/jetbrains/kotlinx/dataframe/io/Jdbc : org/jetbrains/kotli public fun readDataFrame (Ljava/nio/file/Path;Ljava/util/List;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; } +public final class org/jetbrains/kotlinx/dataframe/io/Jdbc2 : 
org/jetbrains/kotlinx/dataframe/io/DataFrameReadSource { + public fun ()V + public fun acceptsSource (Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Z + public fun getSupportedTypes ()Ljava/util/Set; + public fun getTestOrder ()I + public fun readDataFrameOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; + public fun readDataFrameSchemaOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema; + public fun readDataSchemaCodeOrNull-myXLQ2E (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Ljava/lang/String; + public fun toString ()Ljava/lang/String; +} + +public final class org/jetbrains/kotlinx/dataframe/io/Jdbc2$ReadOptions : org/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions { + public fun ()V + public fun (Ljava/lang/String;Ljava/lang/Integer;ZLorg/jetbrains/kotlinx/dataframe/io/db/DbType;ZLkotlin/jvm/functions/Function1;Ljava/sql/Connection;)V + public synthetic fun (Ljava/lang/String;Ljava/lang/Integer;ZLorg/jetbrains/kotlinx/dataframe/io/db/DbType;ZLkotlin/jvm/functions/Function1;Ljava/sql/Connection;ILkotlin/jvm/internal/DefaultConstructorMarker;)V + public final fun component1 ()Ljava/lang/String; + public final fun component2 ()Ljava/lang/Integer; + public final fun component3 ()Z + public final fun component4 ()Lorg/jetbrains/kotlinx/dataframe/io/db/DbType; + public final fun component5 ()Z + public final fun component6 ()Lkotlin/jvm/functions/Function1; + public final fun component7 ()Ljava/sql/Connection; + public final fun copy 
(Ljava/lang/String;Ljava/lang/Integer;ZLorg/jetbrains/kotlinx/dataframe/io/db/DbType;ZLkotlin/jvm/functions/Function1;Ljava/sql/Connection;)Lorg/jetbrains/kotlinx/dataframe/io/Jdbc2$ReadOptions; + public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/io/Jdbc2$ReadOptions;Ljava/lang/String;Ljava/lang/Integer;ZLorg/jetbrains/kotlinx/dataframe/io/db/DbType;ZLkotlin/jvm/functions/Function1;Ljava/sql/Connection;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/Jdbc2$ReadOptions; + public fun equals (Ljava/lang/Object;)Z + public final fun getConfigureStatement ()Lkotlin/jvm/functions/Function1; + public final fun getDbType ()Lorg/jetbrains/kotlinx/dataframe/io/db/DbType; + public final fun getInferNullability ()Z + public final fun getLimit ()Ljava/lang/Integer; + public final fun getResultSetConnection ()Ljava/sql/Connection; + public final fun getSqlQueryOrTableName ()Ljava/lang/String; + public final fun getStrictValidation ()Z + public fun hashCode ()I + public fun toString ()Ljava/lang/String; +} + public final class org/jetbrains/kotlinx/dataframe/io/JdbcSchemaKt { public static final fun buildCodeForDB (Ljava/net/URL;Ljava/lang/String;)Ljava/lang/String; public static final fun getDatabaseCodeGenReader (Lorg/jetbrains/kotlinx/dataframe/codeGen/CodeGenerator$Companion;)Lkotlin/jvm/functions/Function2; diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/Jdbc.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/Jdbc.kt index a5307b96d9..a670e8b8a6 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/Jdbc.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/Jdbc.kt @@ -5,9 +5,19 @@ import org.jetbrains.kotlinx.dataframe.DataFrame import org.jetbrains.kotlinx.dataframe.codeGen.AbstractDefaultReadMethod import org.jetbrains.kotlinx.dataframe.codeGen.Code import org.jetbrains.kotlinx.dataframe.codeGen.DefaultReadDfMethod +import 
org.jetbrains.kotlinx.dataframe.io.db.DbType +import org.jetbrains.kotlinx.dataframe.io.db.extractDBTypeFromConnection +import org.jetbrains.kotlinx.dataframe.schema.DataFrameSchema import java.io.File import java.io.InputStream import java.nio.file.Path +import java.sql.Connection +import java.sql.PreparedStatement +import java.sql.ResultSet +import javax.sql.DataSource +import kotlin.reflect.KType +import kotlin.reflect.full.isSubtypeOf +import kotlin.reflect.typeOf // TODO: https://github.com/Kotlin/dataframe/issues/450 public class Jdbc : @@ -36,6 +46,201 @@ public class Jdbc : DefaultReadJdbcMethod(pathRepresentation) } +/** + * [DataFrameReadSource] for JDBC. + * + * Reading from JDBC always needs a "what" (a SQL query or table name) — unlike a file, a [Connection] doesn't + * carry that instruction. Provide it via [ReadOptions.sqlQueryOrTableName]. The only exception is [ResultSet], + * which is already an executed query. + * + * Supported source types: [Connection], [DataSource], [DbConnectionConfig], [ResultSet]. + * + * `readAllSqlTables` returns a `Map` and doesn't fit the single-DataFrame contract; it + * remains as a direct API call. + */ +public class Jdbc2 : DataFrameReadSource { + + public data class ReadOptions( + /** + * SQL query (e.g. `"SELECT * FROM users"`) or table name (e.g. `"users"`). + * Required for [Connection], [DataSource], and [DbConnectionConfig] sources. + * Ignored for [ResultSet] (it's already an executed query). + */ + val sqlQueryOrTableName: String?, + val limit: Int?, + val inferNullability: Boolean, + /** Optional, auto-detected from the source when `null`. */ + val dbType: DbType?, + val strictValidation: Boolean, + val configureStatement: (PreparedStatement) -> Unit, + /** + * Only used when the source is a [ResultSet] and [dbType] is `null`; provides a [Connection] + * to auto-detect the database type. Ignored otherwise. 
+ */ + val resultSetConnection: Connection?, + ) : DataFrameReadOptions { + public companion object { + public operator fun invoke( + sqlQueryOrTableName: String? = null, + limit: Int? = null, + inferNullability: Boolean = true, + dbType: DbType? = null, + strictValidation: Boolean = true, + configureStatement: (PreparedStatement) -> Unit = {}, + resultSetConnection: Connection? = null, + ): ReadOptions = + ReadOptions( + sqlQueryOrTableName = sqlQueryOrTableName, + limit = limit, + inferNullability = inferNullability, + dbType = dbType, + strictValidation = strictValidation, + configureStatement = configureStatement, + resultSetConnection = resultSetConnection, + ) + } + } + + override val supportedReadingTypes: Set = + setOf( + typeOf(), + typeOf(), + typeOf(), + typeOf(), + ) + + override fun acceptsSource(sourceInfo: DataSourceInfo, options: DataFrameReadOptions?): Boolean { + if (options != null && options !is ReadOptions) return false + return supportedReadingTypes.any { sourceInfo.kType.isSubtypeOf(it) } + } + + override fun readDataFrame( + source: Any, + sourceInfo: DataSourceInfo, + options: DataFrameReadOptions?, + ): Result> = + runCatching { + val opts = (options ?: ReadOptions()) as ReadOptions + @Suppress("RedundantReturnKeyword") + return@runCatching when (source) { + is ResultSet -> when { + opts.dbType != null -> + DataFrame.readResultSet(source, opts.dbType, opts.limit, opts.inferNullability) + + opts.resultSetConnection != null -> + DataFrame.readResultSet( + source, + opts.resultSetConnection, + opts.limit, + opts.inferNullability, + ) + + // Without dbType or a connection we can't read a ResultSet — fall through. 
+ else -> return Result.failure( + IllegalArgumentException( + "ResultSet read requires either ReadOptions.dbType or ReadOptions.resultSetConnection", + ), + ) + } + + is Connection -> { + val query = opts.sqlQueryOrTableName + ?: return Result.failure( + IllegalArgumentException("Connection read requires ReadOptions.sqlQueryOrTableName"), + ) + source.readDataFrame( + sqlQueryOrTableName = query, + limit = opts.limit, + inferNullability = opts.inferNullability, + dbType = opts.dbType, + strictValidation = opts.strictValidation, + configureStatement = opts.configureStatement, + ) + } + + is DataSource -> { + val query = opts.sqlQueryOrTableName + ?: return Result.failure( + IllegalArgumentException("DataSource read requires ReadOptions.sqlQueryOrTableName"), + ) + source.readDataFrame( + sqlQueryOrTableName = query, + limit = opts.limit, + inferNullability = opts.inferNullability, + dbType = opts.dbType, + strictValidation = opts.strictValidation, + configureStatement = opts.configureStatement, + ) + } + + is DbConnectionConfig -> { + val query = opts.sqlQueryOrTableName + ?: return Result.failure( + IllegalArgumentException("DbConnectionConfig read requires ReadOptions.sqlQueryOrTableName"), + ) + source.readDataFrame( + sqlQueryOrTableName = query, + limit = opts.limit, + inferNullability = opts.inferNullability, + dbType = opts.dbType, + strictValidation = opts.strictValidation, + configureStatement = opts.configureStatement, + ) + } + + else -> return Result.failure(IllegalStateException("Unsupported JDBC source type: ${source::class}")) + } + } + + override fun readDataFrameSchema( + source: Any, + sourceInfo: DataSourceInfo, + options: DataFrameReadOptions?, + ): Result = + runCatching { + val opts = (options ?: ReadOptions()) as ReadOptions + when (source) { + // ResultSet has a true zero-row metadata-only path. 
+ is ResultSet -> when { + opts.dbType != null -> + DataFrameSchema.readResultSet(source, opts.dbType) + + opts.resultSetConnection != null -> + DataFrameSchema.readResultSet(source, extractDBTypeFromConnection(opts.resultSetConnection)) + + else -> error("ResultSet schema read requires either ReadOptions.dbType or ReadOptions.resultSetConnection") + } + + is Connection -> { + val query = opts.sqlQueryOrTableName + ?: error("Connection schema read requires ReadOptions.sqlQueryOrTableName") + source.readDataFrameSchema(sqlQueryOrTableName = query, dbType = opts.dbType) + } + + is DataSource -> { + val query = opts.sqlQueryOrTableName + ?: error("DataSource schema read requires ReadOptions.sqlQueryOrTableName") + source.readDataFrameSchema(sqlQueryOrTableName = query, dbType = opts.dbType) + } + + is DbConnectionConfig -> { + val query = opts.sqlQueryOrTableName + ?: error("DbConnectionConfig schema read requires ReadOptions.sqlQueryOrTableName") + source.readDataFrameSchema(sqlQueryOrTableName = query, dbType = opts.dbType) + } + + else -> error("Unsupported source type: ${source::class}") + } + } + + override val testOrder: Int = 50_000 + + override fun toString(): String = "Jdbc" +} + +public val DataFrameReadOptions.Companion.Jdbc: org.jetbrains.kotlinx.dataframe.io.Jdbc2.ReadOptions.Companion + get() = org.jetbrains.kotlinx.dataframe.io.Jdbc2.ReadOptions.Companion + private fun DataFrame.Companion.readJDBC(stream: File): DataFrame<*> { TODO("Not yet implemented") } diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt index e454dec406..80b54229b3 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt @@ -43,6 +43,7 @@ import org.duckdb.JsonNode import org.jetbrains.kotlinx.dataframe.AnyRow import 
org.jetbrains.kotlinx.dataframe.DataFrame import org.jetbrains.kotlinx.dataframe.api.Infer +import org.jetbrains.kotlinx.dataframe.api.ParserOptions import org.jetbrains.kotlinx.dataframe.api.asColumnGroup import org.jetbrains.kotlinx.dataframe.api.asDataColumn import org.jetbrains.kotlinx.dataframe.api.inferType @@ -158,7 +159,7 @@ public object DuckDb : AdvancedDbType("duckdb") { .withColumnBuilder(targetSchema = null) { name, values, inferNullability -> values .toColumn(name, if (inferNullability) Infer.Nulls else Infer.None) - .tryParse() + .tryParse(ParserOptions(parseToDataFrameReadSource = true)) .inferType() } diff --git a/dataframe-jdbc/src/main/resources/META-INF/services/org.jetbrains.kotlinx.dataframe.io.DataFrameReadSource b/dataframe-jdbc/src/main/resources/META-INF/services/org.jetbrains.kotlinx.dataframe.io.DataFrameReadSource new file mode 100644 index 0000000000..9cb5a85457 --- /dev/null +++ b/dataframe-jdbc/src/main/resources/META-INF/services/org.jetbrains.kotlinx.dataframe.io.DataFrameReadSource @@ -0,0 +1 @@ +org.jetbrains.kotlinx.dataframe.io.Jdbc2 diff --git a/dataframe-json/api/dataframe-json.api b/dataframe-json/api/dataframe-json.api index 368faf4a12..0f04740b98 100644 --- a/dataframe-json/api/dataframe-json.api +++ b/dataframe-json/api/dataframe-json.api @@ -40,6 +40,40 @@ public final class org/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic : jav public static fun values ()[Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic; } +public final class org/jetbrains/kotlinx/dataframe/io/Json : org/jetbrains/kotlinx/dataframe/io/DataFrameReadSource { + public static final field Companion Lorg/jetbrains/kotlinx/dataframe/io/Json$Companion; + public fun ()V + public fun acceptsSource (Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Z + public fun getSupportedTypes ()Ljava/util/Set; + public fun getTestOrder ()I + public fun readDataFrameOrNull 
(Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; + public fun readDataFrameSchemaOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema; + public fun readDataSchemaCodeOrNull-myXLQ2E (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Ljava/lang/String; + public fun toString ()Ljava/lang/String; +} + +public final class org/jetbrains/kotlinx/dataframe/io/Json$Companion { +} + +public final class org/jetbrains/kotlinx/dataframe/io/Json$ReadOptions : org/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions { + public fun ()V + public fun (Ljava/util/List;Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic;Ljava/util/List;Z)V + public synthetic fun (Ljava/util/List;Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic;Ljava/util/List;ZILkotlin/jvm/internal/DefaultConstructorMarker;)V + public final fun component1 ()Ljava/util/List; + public final fun component2 ()Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic; + public final fun component3 ()Ljava/util/List; + public final fun component4 ()Z + public final fun copy (Ljava/util/List;Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic;Ljava/util/List;Z)Lorg/jetbrains/kotlinx/dataframe/io/Json$ReadOptions; + public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/io/Json$ReadOptions;Ljava/util/List;Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic;Ljava/util/List;ZILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/Json$ReadOptions; + public fun equals (Ljava/lang/Object;)Z + public final fun getHeader ()Ljava/util/List; + public final fun getKeyValuePaths ()Ljava/util/List; + public final fun getTypeClashTactic 
()Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic; + public final fun getUnifyNumbers ()Z + public fun hashCode ()I + public fun toString ()Ljava/lang/String; +} + public final class org/jetbrains/kotlinx/dataframe/io/JsonKt { public static final fun readJson (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/io/File;Ljava/util/List;Ljava/util/List;Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic;Z)Lorg/jetbrains/kotlinx/dataframe/DataFrame; public static final fun readJson (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/io/InputStream;Ljava/util/List;Ljava/util/List;Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic;Z)Lorg/jetbrains/kotlinx/dataframe/DataFrame; diff --git a/dataframe-json/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/json.kt b/dataframe-json/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/json.kt index 4949d1f104..15f9ba1d69 100644 --- a/dataframe-json/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/json.kt +++ b/dataframe-json/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/json.kt @@ -2,7 +2,9 @@ package org.jetbrains.kotlinx.dataframe.io import kotlinx.serialization.ExperimentalSerializationApi import kotlinx.serialization.json.Json +import kotlinx.serialization.json.JsonArray import kotlinx.serialization.json.JsonElement +import kotlinx.serialization.json.JsonObject import kotlinx.serialization.json.decodeFromStream import org.intellij.lang.annotations.Language import org.jetbrains.kotlinx.dataframe.AnyFrame @@ -23,15 +25,281 @@ import org.jetbrains.kotlinx.dataframe.impl.io.encodeDataFrameWithMetadata import org.jetbrains.kotlinx.dataframe.impl.io.encodeFrame import org.jetbrains.kotlinx.dataframe.impl.io.encodeRow import org.jetbrains.kotlinx.dataframe.impl.io.readJsonImpl +import org.jetbrains.kotlinx.dataframe.io.Base64ImageEncodingOptions.Companion.GZIP_ON +import org.jetbrains.kotlinx.dataframe.io.Base64ImageEncodingOptions.Companion.LIMIT_SIZE_ON import 
org.jetbrains.kotlinx.dataframe.io.JSON.TypeClashTactic import org.jetbrains.kotlinx.dataframe.io.JSON.TypeClashTactic.ARRAY_AND_VALUE_COLUMNS import java.io.File import java.io.InputStream +import java.io.OutputStream import java.net.URL import java.nio.file.Path import kotlin.io.path.writeText +import kotlin.reflect.KType +import kotlin.reflect.full.isSubtypeOf +import kotlin.reflect.full.isSupertypeOf import kotlin.reflect.typeOf +public class Json : + DataFrameReadSource, + DataFrameWriteTarget { + + public data class ReadOptions( + val header: List, + val typeClashTactic: TypeClashTactic, + val keyValuePaths: List, + val unifyNumbers: Boolean, + ) : DataFrameReadOptions { + public companion object { + public operator fun invoke( + header: List = emptyList(), + typeClashTactic: TypeClashTactic = ARRAY_AND_VALUE_COLUMNS, + keyValuePaths: List = emptyList(), + unifyNumbers: Boolean = true, + ): ReadOptions = + ReadOptions( + header = header, + typeClashTactic = typeClashTactic, + keyValuePaths = keyValuePaths, + unifyNumbers = unifyNumbers, + ) + } + } + + override val supportedReadingTypes: Set = + setOf( + typeOf(), + typeOf(), + typeOf(), + typeOf(), + typeOf(), + typeOf(), + ) + + public data class WriteOptions(val prettyPrint: Boolean) : DataFrameWriteOptions { + public companion object { + public operator fun invoke(prettyPrint: Boolean = false): WriteOptions = + WriteOptions(prettyPrint = prettyPrint) + } + } + + override val supportedWritingTypes: Set = + setOf( + typeOf(), + typeOf(), + typeOf(), + typeOf(), + // used like df.write({ json: JsonElement -> }) + typeOf>(), + typeOf>(), + typeOf>(), + ) + + public companion object { + internal const val EXTENSION = "json" + internal val MIME_TYPES = setOf( + "application/json", + "application/x-json", + "text/json", + "text/x-json", + ) + } + + override fun acceptsTarget(sourceInfo: DataSourceInfo, options: DataFrameWriteOptions?): Boolean { + if (options != null && options !is WriteOptions) return false + 
if (sourceInfo.extension?.lowercase()?.equals(EXTENSION) == false) return false + if (sourceInfo.mimeType != null && sourceInfo.mimeType !in MIME_TYPES) return false + return supportedWritingTypes.any { sourceInfo.kType.isSubtypeOf(it) } + } + + override fun writeDataRow( + dataRow: DataRow<*>, + target: Any, + targetInfo: DataSourceInfo, + options: DataFrameWriteOptions?, + ): Result = + runCatching { + val opts = (options ?: WriteOptions()) as WriteOptions + val kType = targetInfo.kType + + @Suppress("RedundantReturnKeyword") + return@runCatching when { + kType.isSubTypeOf() -> + dataRow.writeJson(path = target as Path, prettyPrint = opts.prettyPrint) + + kType.isSubTypeOf() -> + dataRow.writeJson(path = (target as File).toPath(), prettyPrint = opts.prettyPrint) + + kType.isSubTypeOf() -> + dataRow.writeJson(writer = target as Appendable, prettyPrint = opts.prettyPrint) + + kType.isSubTypeOf() -> + dataRow.writeJson(stream = target as OutputStream, prettyPrint = opts.prettyPrint) + + kType.isSubTypeOf>() -> { + (target as Function1).invoke( + dataRow.toJsonElement(prettyPrint = opts.prettyPrint), + ) + Unit + } + + kType.isSubTypeOf>() -> + return Result.failure( + IllegalArgumentException( + "Can only turn a single DataRow into a JsonObject. 
A DataFrame can only be converted to a JsonArray.", + ), + ) + + kType.isSubTypeOf>() -> { + (target as Function1).invoke( + dataRow.toJson(prettyPrint = opts.prettyPrint), + ) + Unit + } + + else -> return Result.failure( + IllegalStateException("Unsupported target type for JSON writing: $kType"), + ) + } + } + + override fun writeDataFrame( + dataFrame: DataFrame<*>, + target: Any, + targetInfo: DataSourceInfo, + options: DataFrameWriteOptions?, + ): Result = + runCatching { + val opts = (options ?: WriteOptions()) as WriteOptions + val kType = targetInfo.kType + + @Suppress("RedundantReturnKeyword") + return@runCatching when { + kType.isSubTypeOf() -> + dataFrame.writeJson(path = target as Path, prettyPrint = opts.prettyPrint) + + kType.isSubTypeOf() -> + dataFrame.writeJson(path = (target as File).toPath(), prettyPrint = opts.prettyPrint) + + kType.isSubTypeOf() -> + dataFrame.writeJson(writer = target as Appendable, prettyPrint = opts.prettyPrint) + + kType.isSubTypeOf() -> + dataFrame.writeJson(stream = target as OutputStream, prettyPrint = opts.prettyPrint) + + kType.isSubTypeOf>() -> { + (target as Function1).invoke( + dataFrame.toJsonElement(prettyPrint = opts.prettyPrint), + ) + Unit + } + + kType.isSubTypeOf>() -> + return Result.failure( + IllegalArgumentException( + "Can only turn a single DataRow into a JsonObject. 
A DataFrame can only be converted to a JsonArray.", + ), + ) + + kType.isSubTypeOf>() -> { + (target as Function1).invoke( + dataFrame.toJson(prettyPrint = opts.prettyPrint), + ) + Unit + } + + else -> return Result.failure( + IllegalStateException("Unsupported target type for JSON writing: $kType"), + ) + } + } + + override fun acceptsSource(sourceInfo: DataSourceInfo, options: DataFrameReadOptions?): Boolean { + if (options != null && options !is ReadOptions) return false + if (sourceInfo.extension?.lowercase()?.equals(EXTENSION) == false) return false + if (sourceInfo.mimeType != null && sourceInfo.mimeType !in MIME_TYPES) return false + return supportedReadingTypes.any { sourceInfo.kType.isSubtypeOf(it) } + } + + @OptIn(ExperimentalSerializationApi::class) + override fun readDataFrame( + source: Any, + sourceInfo: DataSourceInfo, + options: DataFrameReadOptions?, + ): Result> = + runCatching { + val opts = (options ?: ReadOptions()) as ReadOptions + val kType = sourceInfo.kType + + val url: URL? = when { + kType.isSubTypeOf() -> source as? URL + kType.isSubTypeOf() -> (source as? Path)?.toUri()?.toURL() + kType.isSubTypeOf() -> (source as? 
File)?.toPath()?.toUri()?.toURL() + else -> null + } + if (url != null) { + return@runCatching DataFrame.readJson( + url = url, + header = opts.header, + typeClashTactic = opts.typeClashTactic, + keyValuePaths = opts.keyValuePaths, + unifyNumbers = opts.unifyNumbers, + ) + } + + val element: JsonElement = when { + kType.isSubTypeOf() -> + Json.decodeFromStream(source as InputStream) + + kType.isSubTypeOf() -> { + if ((source as String).isNotJson()) { + return Result.failure( + IllegalArgumentException("Source string is not valid JSON"), + ) + } + Json.decodeFromString(source) + } + + kType.isSubTypeOf() -> + source as JsonElement + + else -> return Result.failure(IllegalStateException("Unsupported JSON source type: $kType")) + } + + return@runCatching readJsonImpl( + parsed = element, + header = opts.header, + typeClashTactic = opts.typeClashTactic, + keyValuePaths = opts.keyValuePaths, + unifyNumbers = opts.unifyNumbers, + ) + } + + override val testOrder: Int = 10_000 + + override fun toString(): String = "Json" + + // early-exit check for String to see if it's definitely not json + private fun String.isNotJson(): Boolean = + trim().let { + it.isEmpty() || + !( + (it.startsWith('{') && it.endsWith('}')) || + (it.startsWith('[') && it.endsWith(']')) + ) + } +} + +public val DataFrameReadOptions.Companion.Json: org.jetbrains.kotlinx.dataframe.io.Json.ReadOptions.Companion + get() = org.jetbrains.kotlinx.dataframe.io.Json.ReadOptions.Companion + +public val DataFrameWriteOptions.Companion.Json: org.jetbrains.kotlinx.dataframe.io.Json.WriteOptions.Companion + get() = org.jetbrains.kotlinx.dataframe.io.Json.WriteOptions.Companion + +private inline fun KType.isSubTypeOf(): Boolean = this.isSubtypeOf(typeOf()) + +private inline fun KType.isSuperTypeOf(): Boolean = this.isSupertypeOf(typeOf()) + public class JSON( private val typeClashTactic: TypeClashTactic = ARRAY_AND_VALUE_COLUMNS, private val keyValuePaths: List = emptyList(), @@ -346,6 +614,29 @@ public fun 
AnyFrame.toJson(prettyPrint: Boolean = false): String { return json.encodeToString(JsonElement.serializer(), encodeFrame(this@toJson)) } +public fun AnyFrame.toJsonElement(prettyPrint: Boolean = false): JsonArray { + val json = Json { + this.prettyPrint = prettyPrint + isLenient = true + allowSpecialFloatingPointValues = true + } + val res = json.encodeToJsonElement(JsonElement.serializer(), encodeFrame(this@toJsonElement)) + return res as JsonArray +} + +public fun AnyRow.toJsonElement(prettyPrint: Boolean = false): JsonObject { + val json = Json { + this.prettyPrint = prettyPrint + isLenient = true + allowSpecialFloatingPointValues = true + } + val res = json.encodeToJsonElement( + JsonElement.serializer(), + encodeRow(this@toJsonElement.df(), this@toJsonElement.index()), + ) + return res as JsonObject +} + /** * Converts the DataFrame to a JSON string representation with additional metadata about serialized data. * It is heavily used to implement some integration features in Kotlin Notebook IntelliJ IDEA plugin. 
@@ -457,6 +748,10 @@ public fun AnyFrame.writeJson(writer: Appendable, prettyPrint: Boolean = false) writer.append(toJson(prettyPrint)) } +public fun AnyFrame.writeJson(stream: OutputStream, prettyPrint: Boolean = false) { + stream.write(toJson(prettyPrint).toByteArray()) +} + public fun AnyRow.writeJson(file: File, prettyPrint: Boolean = false) { writeJson(file.toPath(), prettyPrint) } @@ -473,6 +768,10 @@ public fun AnyRow.writeJson(writer: Appendable, prettyPrint: Boolean = false) { writer.append(toJson(prettyPrint)) } +public fun AnyRow.writeJson(stream: OutputStream, prettyPrint: Boolean = false) { + stream.write(toJson(prettyPrint).toByteArray()) +} + private const val READ_JSON = "readJson" internal class DefaultReadJsonMethod(path: String?, arguments: MethodArguments) : diff --git a/dataframe-json/src/main/resources/META-INF/services/org.jetbrains.kotlinx.dataframe.io.DataFrameIO b/dataframe-json/src/main/resources/META-INF/services/org.jetbrains.kotlinx.dataframe.io.DataFrameIO new file mode 100644 index 0000000000..bb9b992aea --- /dev/null +++ b/dataframe-json/src/main/resources/META-INF/services/org.jetbrains.kotlinx.dataframe.io.DataFrameIO @@ -0,0 +1 @@ +org.jetbrains.kotlinx.dataframe.io.Json diff --git a/dataframe-json/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParseJsonColumnTests.kt b/dataframe-json/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParseJsonColumnTests.kt new file mode 100644 index 0000000000..1a6f3430bc --- /dev/null +++ b/dataframe-json/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParseJsonColumnTests.kt @@ -0,0 +1,215 @@ +package org.jetbrains.kotlinx.dataframe.io + +import io.kotest.matchers.shouldBe +import org.intellij.lang.annotations.Language +import org.jetbrains.kotlinx.dataframe.DataFrame +import org.jetbrains.kotlinx.dataframe.DataRow +import org.jetbrains.kotlinx.dataframe.api.asColumnGroup +import org.jetbrains.kotlinx.dataframe.api.asFrameColumn +import org.jetbrains.kotlinx.dataframe.api.columnOf 
+import org.jetbrains.kotlinx.dataframe.api.dataFrameOf +import org.jetbrains.kotlinx.dataframe.api.getColumnGroup +import org.jetbrains.kotlinx.dataframe.api.getFrameColumn +import org.jetbrains.kotlinx.dataframe.api.isColumnGroup +import org.jetbrains.kotlinx.dataframe.api.isFrameColumn +import org.jetbrains.kotlinx.dataframe.api.parse +import org.jetbrains.kotlinx.dataframe.api.parser +import org.jetbrains.kotlinx.dataframe.columns.ColumnGroup +import org.jetbrains.kotlinx.dataframe.columns.FrameColumn +import org.junit.jupiter.api.AfterAll +import org.junit.jupiter.api.BeforeAll +import kotlin.reflect.typeOf +import kotlin.test.Test + +/** + * Tests that strings containing JSON in a [String] column can be parsed via [parse], + * where JSON arrays become [DataFrame]s (forming a [FrameColumn]) + * and JSON objects become [DataRow]s (forming a [ColumnGroup]). + */ +class ParseJsonColumnTests { + + companion object { + @[BeforeAll JvmStatic] + fun `setup ParserOptions`() { + DataFrame.parser.parseToDataFrameReadSource = true + } + + @[AfterAll JvmStatic] + fun `reset ParserOptions`() { + DataFrame.parser.resetToDefault() + } + } + + @Test + fun `parse column of json arrays into FrameColumn`() { + @Language("json") + val a = """[1, 2, 3]""" + + @Language("json") + val b = """[4, 5, 6]""" + + val col = columnOf(a, b) + val parsed = col.parse() + + parsed.isFrameColumn() shouldBe true + val frameCol = parsed.asFrameColumn() + frameCol.size() shouldBe 2 + frameCol[0]["value"].values().toList() shouldBe listOf(1, 2, 3) + frameCol[1]["value"].values().toList() shouldBe listOf(4, 5, 6) + } + + @Test + fun `parse column of json objects into ColumnGroup`() { + @Language("json") + val a = """{"x": 1, "y": "a"}""" + + @Language("json") + val b = """{"x": 2, "y": "b"}""" + + val col = columnOf(a, b) + val parsed = col.parse() + + parsed.isColumnGroup() shouldBe true + val group = parsed.asColumnGroup() + group.columnsCount() shouldBe 2 + group["x"].type() shouldBe typeOf() + 
group["y"].type() shouldBe typeOf() + group["x"].values().toList() shouldBe listOf(1, 2) + group["y"].values().toList() shouldBe listOf("a", "b") + } + + @Test + fun `parse dataframe column of json arrays into FrameColumn`() { + @Language("json") + val a = """[10, 20]""" + + @Language("json") + val b = """[30, 40, 50]""" + + val df = dataFrameOf("data")(a, b) + val parsed = df.parse("data") + + parsed.rowsCount() shouldBe 2 + parsed["data"].isFrameColumn() shouldBe true + val frameCol = parsed.getFrameColumn("data") + frameCol[0]["value"].values().toList() shouldBe listOf(10, 20) + frameCol[1]["value"].values().toList() shouldBe listOf(30, 40, 50) + } + + @Test + fun `parse dataframe column of json objects into ColumnGroup`() { + @Language("json") + val a = """{"name": "Alice", "age": 30}""" + + @Language("json") + val b = """{"name": "Bob", "age": 25}""" + + val df = dataFrameOf("person")(a, b) + val parsed = df.parse("person") + + parsed.rowsCount() shouldBe 2 + parsed["person"].isColumnGroup() shouldBe true + val group = parsed.getColumnGroup("person") + group.columnsCount() shouldBe 2 + group["name"].values().toList() shouldBe listOf("Alice", "Bob") + group["age"].values().toList() shouldBe listOf(30, 25) + group["name"].type() shouldBe typeOf() + group["age"].type() shouldBe typeOf() + } + + @Test + fun `parse column of json arrays of objects`() { + @Language("json") + val a = """[{"k": 1}, {"k": 2}]""" + + @Language("json") + val b = """[{"k": 3}, {"k": 4}, {"k": 5}]""" + + val parsed = columnOf(a, b).parse() + + parsed.isFrameColumn() shouldBe true + val frameCol = parsed.asFrameColumn() + frameCol.size() shouldBe 2 + frameCol[0]["k"].values().toList() shouldBe listOf(1, 2) + frameCol[1]["k"].values().toList() shouldBe listOf(3, 4, 5) + } + + @Test + fun `parse column of nested json objects`() { + @Language("json") + val a = """{"outer": {"inner": 1}}""" + + @Language("json") + val b = """{"outer": {"inner": 2}}""" + + val parsed = columnOf(a, b).parse() + + 
parsed.isColumnGroup() shouldBe true + val outer = parsed.asColumnGroup().getColumnGroup("outer") + outer["inner"].type() shouldBe typeOf() + outer["inner"].values().toList() shouldBe listOf(1, 2) + } + + @Test + fun `parse column of json objects containing arrays`() { + @Language("json") + val a = """{"name": "list1", "values": [1, 2, 3]}""" + + @Language("json") + val b = """{"name": "list2", "values": [4, 5]}""" + + val parsed = columnOf(a, b).parse() + + parsed.isColumnGroup() shouldBe true + val group = parsed.asColumnGroup() + group["name"].values().toList() shouldBe listOf("list1", "list2") + group["values"].type() shouldBe typeOf>() + group["values"].values().toList() shouldBe listOf(listOf(1, 2, 3), listOf(4, 5)) + } + + @Test + fun `parse column of json arrays with whitespace`() { + val col = columnOf(" [1, 2, 3] ", "\n[4, 5]\t") + val parsed = col.parse() + + parsed.isFrameColumn() shouldBe true + val frameCol = parsed.asFrameColumn() + frameCol.size() shouldBe 2 + frameCol[0]["value"].values().toList() shouldBe listOf(1, 2, 3) + frameCol[1]["value"].values().toList() shouldBe listOf(4, 5) + } + + @Test + fun `parse dataframe with multiple json columns`() { + @Language("json") + val obj1 = """{"a": 1}""" + + @Language("json") + val obj2 = """{"a": 2}""" + + @Language("json") + val arr1 = """[1, 2]""" + + @Language("json") + val arr2 = """[3, 4]""" + + val df = dataFrameOf("obj", "arr")( + obj1, + arr1, + obj2, + arr2, + ) + val parsed = df.parse() + + parsed.rowsCount() shouldBe 2 + parsed["obj"].isColumnGroup() shouldBe true + parsed["arr"].isFrameColumn() shouldBe true + + val objGroup = parsed.getColumnGroup("obj") + objGroup["a"].values().toList() shouldBe listOf(1, 2) + + val arrFrame = parsed.getFrameColumn("arr") + arrFrame[0]["value"].values().toList() shouldBe listOf(1, 2) + arrFrame[1]["value"].values().toList() shouldBe listOf(3, 4) + } +} diff --git a/dataframe-openapi-generator/api/dataframe-openapi-generator.api 
b/dataframe-openapi-generator/api/dataframe-openapi-generator.api index cc65f36f8e..6ad417358a 100644 --- a/dataframe-openapi-generator/api/dataframe-openapi-generator.api +++ b/dataframe-openapi-generator/api/dataframe-openapi-generator.api @@ -19,6 +19,42 @@ public final class org/jetbrains/kotlinx/dataframe/io/OpenApi : org/jetbrains/ko public static synthetic fun readCodeForGeneration$default (Lorg/jetbrains/kotlinx/dataframe/io/OpenApi;Ljava/lang/String;Ljava/lang/String;ZZILjava/lang/Object;)Ljava/lang/String; } +public final class org/jetbrains/kotlinx/dataframe/io/OpenApi2 : org/jetbrains/kotlinx/dataframe/io/DataFrameReadSource { + public static final field Companion Lorg/jetbrains/kotlinx/dataframe/io/OpenApi2$Companion; + public fun ()V + public fun acceptsSource (Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Z + public fun getSupportedTypes ()Ljava/util/Set; + public fun getTestOrder ()I + public fun readDataFrameOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; + public fun readDataFrameSchemaOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema; + public fun readDataSchemaCodeOrNull-myXLQ2E (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Ljava/lang/String; + public fun toString ()Ljava/lang/String; +} + +public final class org/jetbrains/kotlinx/dataframe/io/OpenApi2$Companion { +} + +public final class org/jetbrains/kotlinx/dataframe/io/OpenApi2$ReadOptions : org/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions { + public fun ()V + public fun 
(Ljava/util/List;Lio/swagger/v3/parser/core/models/ParseOptions;ZZLorg/jetbrains/kotlinx/dataframe/codeGen/MarkerVisibility;)V + public synthetic fun (Ljava/util/List;Lio/swagger/v3/parser/core/models/ParseOptions;ZZLorg/jetbrains/kotlinx/dataframe/codeGen/MarkerVisibility;ILkotlin/jvm/internal/DefaultConstructorMarker;)V + public final fun component1 ()Ljava/util/List; + public final fun component2 ()Lio/swagger/v3/parser/core/models/ParseOptions; + public final fun component3 ()Z + public final fun component4 ()Z + public final fun component5 ()Lorg/jetbrains/kotlinx/dataframe/codeGen/MarkerVisibility; + public final fun copy (Ljava/util/List;Lio/swagger/v3/parser/core/models/ParseOptions;ZZLorg/jetbrains/kotlinx/dataframe/codeGen/MarkerVisibility;)Lorg/jetbrains/kotlinx/dataframe/io/OpenApi2$ReadOptions; + public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/io/OpenApi2$ReadOptions;Ljava/util/List;Lio/swagger/v3/parser/core/models/ParseOptions;ZZLorg/jetbrains/kotlinx/dataframe/codeGen/MarkerVisibility;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/OpenApi2$ReadOptions; + public fun equals (Ljava/lang/Object;)Z + public final fun getAuth ()Ljava/util/List; + public final fun getExtensionProperties ()Z + public final fun getGenerateHelperCompanionObject ()Z + public final fun getParseOptions ()Lio/swagger/v3/parser/core/models/ParseOptions; + public final fun getVisibility ()Lorg/jetbrains/kotlinx/dataframe/codeGen/MarkerVisibility; + public fun hashCode ()I + public fun toString ()Ljava/lang/String; +} + public final class org/jetbrains/kotlinx/dataframe/io/ReadOpenapiKt { public static final fun readOpenApi (Ljava/lang/String;Ljava/lang/String;Ljava/util/List;Lio/swagger/v3/parser/core/models/ParseOptions;ZZLorg/jetbrains/kotlinx/dataframe/codeGen/MarkerVisibility;)Ljava/lang/String; public static synthetic fun readOpenApi$default 
(Ljava/lang/String;Ljava/lang/String;Ljava/util/List;Lio/swagger/v3/parser/core/models/ParseOptions;ZZLorg/jetbrains/kotlinx/dataframe/codeGen/MarkerVisibility;ILjava/lang/Object;)Ljava/lang/String; diff --git a/dataframe-openapi-generator/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/OpenApi.kt b/dataframe-openapi-generator/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/OpenApi.kt index d3650fa791..c5069409a3 100644 --- a/dataframe-openapi-generator/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/OpenApi.kt +++ b/dataframe-openapi-generator/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/OpenApi.kt @@ -1,10 +1,179 @@ package org.jetbrains.kotlinx.dataframe.io +import io.swagger.v3.parser.core.models.AuthorizationValue +import io.swagger.v3.parser.core.models.ParseOptions +import org.jetbrains.kotlinx.dataframe.DataFrame import org.jetbrains.kotlinx.dataframe.annotations.DataSchema +import org.jetbrains.kotlinx.dataframe.api.CodeString import org.jetbrains.kotlinx.dataframe.codeGen.Code import org.jetbrains.kotlinx.dataframe.codeGen.DefaultReadDfMethod +import org.jetbrains.kotlinx.dataframe.codeGen.MarkerVisibility +import org.jetbrains.kotlinx.dataframe.schema.DataFrameSchema import java.io.File import java.io.InputStream +import java.net.URL +import java.nio.file.Path +import kotlin.io.path.readText +import kotlin.reflect.KType +import kotlin.reflect.full.isSubtypeOf +import kotlin.reflect.typeOf + +/** + * [DataFrameReadSource] for OpenAPI specifications. + * + * OpenAPI doesn't produce a `DataFrame` or a single `DataFrameSchema` — its output is a multi-marker code + * blob (interfaces + enums + typealiases). Only [readDataSchemaCode] is overridden; the DataFrame + * and Schema methods return a failed [Result], so calling `DataFrame.readSource(openapiFile)` falls + * through to JSON, while `CodeString.readSource(openapiFile, name)` dispatches here. 
+ * + * `.yaml`/`.yml` files are unambiguously OpenAPI; `.json` files are disambiguated at read time by + * [isOpenApiStr] failing early when the JSON isn't actually an OpenAPI spec, letting the framework + * fall through to the JSON format for plain data. + */ +public class OpenApi2 : DataFrameReadSource { + + public data class ReadOptions( + val auth: List?, + val parseOptions: ParseOptions?, + val extensionProperties: Boolean, + val generateHelperCompanionObject: Boolean, + val visibility: MarkerVisibility, + ) : DataFrameReadOptions { + public companion object { + public operator fun invoke( + auth: List? = null, + parseOptions: ParseOptions? = null, + extensionProperties: Boolean = false, + generateHelperCompanionObject: Boolean = false, + visibility: MarkerVisibility = MarkerVisibility.IMPLICIT_PUBLIC, + ): ReadOptions = + ReadOptions( + auth = auth, + parseOptions = parseOptions, + extensionProperties = extensionProperties, + generateHelperCompanionObject = generateHelperCompanionObject, + visibility = visibility, + ) + } + } + + override val supportedReadingTypes: Set = + setOf(typeOf(), typeOf(), typeOf(), typeOf(), typeOf()) + + public companion object { + internal val EXTENSIONS: Set = setOf("yaml", "yml", "json") + internal val MIME_TYPES = setOf( + "application/vnd.oai.openapi", + "application/vnd.oai.openapi+json", + "application/vnd.oai.openapi.yaml", + "application/vnd.oai.openapi+yaml", + "text/x-yaml", + "text/yaml", + "application/x-yaml", + "application/yaml", + "application/x-json", + "application/json", + "text/x-json", + "text/json", + ) + } + + override fun acceptsSource(sourceInfo: DataSourceInfo, options: DataFrameReadOptions?): Boolean { + if (options != null && options !is ReadOptions) return false + val ext = sourceInfo.extension?.lowercase() + if (ext != null && ext !in EXTENSIONS) return false + if (sourceInfo.mimeType != null && sourceInfo.mimeType !in MIME_TYPES) return false + return supportedReadingTypes.any { 
sourceInfo.kType.isSubtypeOf(it) } + } + + // OpenAPI doesn't produce a DataFrame. + override fun readDataFrame( + source: Any, + sourceInfo: DataSourceInfo, + options: DataFrameReadOptions?, + ): Result> = Result.failure(UnsupportedOperationException("OpenAPI does not produce a DataFrame")) + + // ...nor a single DataFrameSchema, it can produce enums, typealiases, etc. + // so it only supports readDataSchemaCode() + override fun readDataFrameSchema( + source: Any, + sourceInfo: DataSourceInfo, + options: DataFrameReadOptions?, + ): Result = + Result.failure(UnsupportedOperationException("OpenAPI does not produce a single DataFrameSchema")) + + override fun readDataSchemaCode( + source: Any, + sourceInfo: DataSourceInfo, + name: String, + options: DataFrameReadOptions?, + ): Result = + runCatching { + val opts = (options ?: ReadOptions()) as ReadOptions + val kType = sourceInfo.kType + + // Resolve to OpenAPI-spec text, returning null if the content isn't OpenAPI. + val text: String = when { + kType.isSubtypeOf(typeOf()) -> { + if (!isOpenApi(source as URL)) { + return Result.failure(IllegalStateException("URL does not point to an OpenAPI spec")) + } + source.readText() + } + + kType.isSubtypeOf(typeOf()) -> { + if (!isOpenApi(source as Path)) { + return Result.failure(IllegalStateException("Path does not point to an OpenAPI spec")) + } + source.readText() + } + + kType.isSubtypeOf(typeOf()) -> { + if (!isOpenApi((source as File).toPath())) { + return Result.failure(IllegalStateException("File does not point to an OpenAPI spec")) + } + source.readText() + } + + kType.isSubtypeOf(typeOf()) -> { + if (!isOpenApiStr(source as String)) { + return Result.failure(IllegalStateException("String content is not an OpenAPI spec")) + } + source + } + + kType.isSubtypeOf(typeOf()) -> { + val text = (source as InputStream).bufferedReader().readText() + if (!isOpenApiStr(text)) { + return Result.failure(IllegalStateException("InputStream content is not an OpenAPI spec")) + } + 
text + } + + else -> error("Unsupported source type: $kType") + } + + CodeString( + readOpenApiAsString( + openApiAsString = text, + name = name, + auth = opts.auth, + options = opts.parseOptions, + extensionProperties = opts.extensionProperties, + generateHelperCompanionObject = opts.generateHelperCompanionObject, + visibility = opts.visibility, + ), + ) + } + + // Run before Json (10_000) so .json files get the OpenAPI content check first. + override val testOrder: Int = 9_000 + + override fun toString(): String = "OpenApi" +} + +public val DataFrameReadOptions.Companion.OpenApi: org.jetbrains.kotlinx.dataframe.io.OpenApi2.ReadOptions.Companion + get() = org.jetbrains.kotlinx.dataframe.io.OpenApi2.ReadOptions.Companion /** * Allows for OpenApi type schemas to be converted to [DataSchema] interfaces. diff --git a/dataframe-openapi-generator/src/main/resources/META-INF/services/org.jetbrains.kotlinx.dataframe.io.DataFrameReadSource b/dataframe-openapi-generator/src/main/resources/META-INF/services/org.jetbrains.kotlinx.dataframe.io.DataFrameReadSource new file mode 100644 index 0000000000..aaf49465bf --- /dev/null +++ b/dataframe-openapi-generator/src/main/resources/META-INF/services/org.jetbrains.kotlinx.dataframe.io.DataFrameReadSource @@ -0,0 +1 @@ +org.jetbrains.kotlinx.dataframe.io.OpenApi2 diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index ed04b6e669..5f5cd5aa86 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -34,6 +34,7 @@ commonsCsv = "1.14.1" commonsCompress = "1.28.0" commonsIo = "2.21.0" commonsStatistics = "1.2" +tika = "3.3.0" serialization = "1.11.0" poi = "5.5.1" mariadb = "3.5.8" @@ -100,6 +101,7 @@ commonsCsv = { group = "org.apache.commons", name = "commons-csv", version.ref = commonsCompress = { group = "org.apache.commons", name = "commons-compress", version.ref = "commonsCompress" } commonsIo = { group = "commons-io", name = "commons-io", version.ref = "commonsIo" } commonsStatisticsDescriptive = 
{ group = "org.apache.commons", name = "commons-statistics-descriptive", version.ref = "commonsStatistics" } +tika = { group = "org.apache.tika", name = "tika-core", version.ref = "tika" } # Serialization serialization-core = { group = "org.jetbrains.kotlinx", name = "kotlinx-serialization-core", version.ref = "serialization" }