diff --git a/marklogic-client-api/build.gradle b/marklogic-client-api/build.gradle index 3b888233e..f9a8556a5 100644 --- a/marklogic-client-api/build.gradle +++ b/marklogic-client-api/build.gradle @@ -76,6 +76,9 @@ dependencies { testImplementation 'org.skyscreamer:jsonassert:1.5.3' + // ArchUnit for verifying certain classes aren't used + testImplementation 'com.tngtech.archunit:archunit-junit5:1.4.1' + // Automatic loading of test framework implementation dependencies is deprecated. // https://docs.gradle.org/current/userguide/upgrading_version_8.html#test_framework_implementation_dependencies // Without this, once using JUnit 5.12 or higher, Gradle will not find any tests and report an error of: diff --git a/marklogic-client-api/src/main/java/com/marklogic/client/impl/OkHttpServices.java b/marklogic-client-api/src/main/java/com/marklogic/client/impl/OkHttpServices.java index 573f89b0b..716a48f66 100644 --- a/marklogic-client-api/src/main/java/com/marklogic/client/impl/OkHttpServices.java +++ b/marklogic-client-api/src/main/java/com/marklogic/client/impl/OkHttpServices.java @@ -37,8 +37,8 @@ import jakarta.mail.internet.ContentDisposition; import jakarta.mail.internet.MimeMultipart; import jakarta.mail.internet.ParseException; -import jakarta.mail.util.ByteArrayDataSource; import jakarta.xml.bind.DatatypeConverter; +import com.marklogic.client.impl.okhttp.InputStreamDataSource; import okhttp3.*; import okhttp3.MultipartBody.Part; import okhttp3.logging.HttpLoggingInterceptor; @@ -5305,7 +5305,9 @@ static private T getEntity(ResponseBody body, Class as) { } else if (as == MimeMultipart.class) { MediaType mediaType = body.contentType(); String contentType = (mediaType != null) ? mediaType.toString() : "application/x-unknown-content-type"; - ByteArrayDataSource dataSource = new ByteArrayDataSource(body.byteStream(), contentType); + // Use custom DataSource to avoid reading document into memory. Allows a user to use an + // InputStreamHandle to fetch the content without being surprised that all the data is in memory already. + InputStreamDataSource dataSource = new InputStreamDataSource(body.byteStream(), contentType); return (T) new MimeMultipart(dataSource); } else if (as == File.class) { // write out the response body to a temp file in the system temp folder @@ -6055,12 +6057,10 @@ void setResponse(Response response) { setNull(true); return; } - ByteArrayDataSource dataSource = new ByteArrayDataSource( - responseBody.byteStream(), contentType.toString() - ); + // Use custom DataSource to avoid reading document into memory. Allows a user to use an + // InputStreamHandle to fetch the content without being surprised that all the data is in memory already. + InputStreamDataSource dataSource = new InputStreamDataSource(responseBody.byteStream(), contentType.toString()); setMultipart(new MimeMultipart(dataSource)); - } catch (IOException e) { - throw new MarkLogicIOException(e); } catch (MessagingException e) { throw new MarkLogicIOException(e); } diff --git a/marklogic-client-api/src/main/java/com/marklogic/client/impl/okhttp/InputStreamDataSource.java b/marklogic-client-api/src/main/java/com/marklogic/client/impl/okhttp/InputStreamDataSource.java new file mode 100644 index 000000000..035c08cf4 --- /dev/null +++ b/marklogic-client-api/src/main/java/com/marklogic/client/impl/okhttp/InputStreamDataSource.java @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2010-2026 Progress Software Corporation and/or its subsidiaries or affiliates. All Rights Reserved. + */ +package com.marklogic.client.impl.okhttp; + +import jakarta.activation.DataSource; + +import java.io.InputStream; +import java.io.OutputStream; + +/** + * A streaming DataSource implementation that wraps an InputStream without buffering it into memory. + * This is a critical component for enabling true streaming of large documents from MarkLogic, + * avoiding OutOfMemoryErrors when processing large result sets. + *

+ * Unlike ByteArrayDataSource (which loads the entire stream into a byte array), this implementation + * preserves the streaming nature of the underlying InputStream, allowing documents to be processed + * incrementally as they are read from the network. + *

+ * Note: This DataSource is read-only. The getOutputStream() method throws UnsupportedOperationException. + */ +public class InputStreamDataSource implements DataSource { + + private final InputStream inputStream; + private final String contentType; + + /** + * Creates a new InputStreamDataSource. + * + * @param inputStream the InputStream to wrap (will not be buffered into memory) + * @param contentType the MIME type of the data + */ + public InputStreamDataSource(InputStream inputStream, String contentType) { + this.inputStream = inputStream; + this.contentType = contentType; + } + + @Override + public InputStream getInputStream() { + return inputStream; + } + + @Override + public OutputStream getOutputStream() { + throw new UnsupportedOperationException("InputStreamDataSource is read-only"); + } + + @Override + public String getContentType() { + return contentType; + } + + @Override + public String getName() { + return null; + } +} diff --git a/marklogic-client-api/src/test/java/com/marklogic/client/test/VerifyByteArrayDataSourceIsNotUsedTest.java b/marklogic-client-api/src/test/java/com/marklogic/client/test/VerifyByteArrayDataSourceIsNotUsedTest.java new file mode 100644 index 000000000..b873ab0b5 --- /dev/null +++ b/marklogic-client-api/src/test/java/com/marklogic/client/test/VerifyByteArrayDataSourceIsNotUsedTest.java @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2010-2026 Progress Software Corporation and/or its subsidiaries or affiliates. All Rights Reserved. + */ +package com.marklogic.client.test; + +import com.marklogic.client.impl.okhttp.PartIterator; +import com.tngtech.archunit.core.domain.JavaClasses; +import com.tngtech.archunit.core.importer.ClassFileImporter; +import com.tngtech.archunit.core.importer.ImportOption; +import com.tngtech.archunit.lang.ArchRule; +import jakarta.mail.util.ByteArrayDataSource; +import org.junit.jupiter.api.Test; + +import static com.tngtech.archunit.lang.syntax.ArchRuleDefinition.noClasses; + +/** + * Architecture tests to ensure streaming best practices are followed. + *

+ * ByteArrayDataSource defeats streaming by loading entire InputStreams into memory. + * We use InputStreamDataSource instead to enable true streaming for large documents. + */ +class VerifyByteArrayDataSourceIsNotUsedTest { + + private final JavaClasses classes = new ClassFileImporter() + .withImportOption(ImportOption.Predefined.DO_NOT_INCLUDE_TESTS) + .importPackages("com.marklogic.client"); + + @Test + void shouldNotUseByteArrayDataSourceInProduction() { + ArchRule rule = noClasses() + // PartIterator can use ByteArrayDataSource because for an eval/invoke use case, it's very likely the user + // wants the results in memory so they can perform some operation on them. If this proves false in the + // future, PartIterator can be adjusted to use InputStreamDataSource instead. + .that().doNotHaveSimpleName(PartIterator.class.getSimpleName()) + + .should().dependOnClassesThat().haveSimpleName(ByteArrayDataSource.class.getSimpleName()) + .because("MLE-27509 identifies a problem where a multipart response was having each part " + + "processed with the ByteArrayDataSource, which loads the entire stream into memory. This is " + + "surprising to any user using InputStreamHandle to access the content of a document, as that user " + + "is likely expecting to stream document from MarkLogic to some target. The InputStreamDataSource " + + "class was created to avoiding reading the contents of the document into an in-memory byte " + + "array, thus allowing for streaming reads to occur via a multipart response."); + + rule.check(classes); + } +}