apache
diff --git a/‎docs/modules/ROOT/nav.adoc‎
Lines changed: 1 addition & 0 deletions b/‎docs/modules/ROOT/nav.adoc‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎docs/modules/ROOT/pages/pipes/index.adoc‎
Lines changed: 1 addition & 0 deletions b/‎docs/modules/ROOT/pages/pipes/index.adoc‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎docs/modules/ROOT/pages/pipes/parse-modes.adoc‎
Lines changed: 139 additions & 0 deletions b/‎docs/modules/ROOT/pages/pipes/parse-modes.adoc‎
Lines changed: 139 additions & 0 deletions
diff --git a/‎docs/modules/ROOT/pages/using-tika/cli/index.adoc‎
Lines changed: 75 additions & 0 deletions b/‎docs/modules/ROOT/pages/using-tika/cli/index.adoc‎
Lines changed: 75 additions & 0 deletions
diff --git a/‎docs/modules/ROOT/pages/using-tika/java-api/index.adoc‎
Lines changed: 13 additions & 1 deletion b/‎docs/modules/ROOT/pages/using-tika/java-api/index.adoc‎
Lines changed: 13 additions & 1 deletion
diff --git a/‎docs/modules/ROOT/pages/using-tika/server/index.adoc‎
Lines changed: 54 additions & 0 deletions b/‎docs/modules/ROOT/pages/using-tika/server/index.adoc‎
Lines changed: 54 additions & 0 deletions
diff --git a/‎tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java‎
Lines changed: 10 additions & 0 deletions b/‎tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java‎
Lines changed: 10 additions & 0 deletions
@@ -21,6 +21,7 @@
 ** xref:using-tika/cli/index.adoc[Command Line]
 ** xref:using-tika/grpc/index.adoc[gRPC]
 * xref:pipes/index.adoc[Pipes]
+** xref:pipes/parse-modes.adoc[Parse Modes]
 ** xref:pipes/unpack-config.adoc[Extracting Embedded Bytes]
 * xref:configuration/index.adoc[Configuration]
 ** xref:configuration/parsers/pdf-parser.adoc[PDF Parser]
 
@@ -29,6 +29,7 @@ Tika Pipes provides a framework for processing large volumes of documents with:
 
 == Topics
 
+* xref:pipes/parse-modes.adoc[Parse Modes] - Control how documents are parsed and emitted (`RMETA`, `CONCATENATE`, `CONTENT_ONLY`, `NO_PARSE`, `UNPACK`)
 * xref:pipes/unpack-config.adoc[Extracting Embedded Bytes] - Extract raw bytes from embedded documents using `ParseMode.UNPACK`
 
 // Add links to specific topics as they are created
 
@@ -0,0 +1,139 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements.  See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License.  You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+= Parse Modes
+
+Tika Pipes uses `ParseMode` to control how documents are parsed and how results are emitted.
+The parse mode is set on the `ParseContext` or configured in `PipesConfig`.
+
+== Available Parse Modes
+
+[cols="1,3"]
+|===
+|Mode |Description
+
+|`RMETA`
+|Default mode. Each embedded document produces a separate `Metadata` object.
+Results are returned as a JSON array of metadata objects.
+
+|`CONCATENATE`
+|All content from embedded documents is concatenated into a single content field.
+Results are returned as a single `Metadata` object with all metadata preserved.
+
+|`CONTENT_ONLY`
+|Parses like `CONCATENATE` but emits only the raw extracted content — no JSON wrapper,
+no metadata fields. Useful when you want just the text, markdown, or HTML output.
+
+|`NO_PARSE`
+|Skips parsing entirely. Useful for pipelines that only need to fetch and emit raw bytes.
+
+|`UNPACK`
+|Extracts raw bytes from embedded documents. See xref:pipes/unpack-config.adoc[Extracting Embedded Bytes].
+|===
+
+== CONCATENATE Mode
+
+`CONCATENATE` merges all content from embedded documents into a single content field
+while preserving all metadata from parsing:
+
+[source,json]
+----
+{
+  "parseContext": {
+    "parseMode": "CONCATENATE"
+  }
+}
+----
+
+The result is a single `Metadata` object containing the concatenated content in
+`X-TIKA:content` along with all other metadata fields (title, author, content type, etc.).
+
+== CONTENT_ONLY Mode
+
+`CONTENT_ONLY` is designed for use cases where you want just the extracted content
+written to storage — no JSON wrapping, no metadata overhead. This is particularly
+useful for:
+
+* Extracting Markdown files from a document corpus
+* Building plain text search indexes
+* Generating HTML versions of documents
+
+[source,json]
+----
+{
+  "parseContext": {
+    "parseMode": "CONTENT_ONLY"
+  }
+}
+----
+
+=== How It Works
+
+1. Documents are parsed identically to `CONCATENATE` mode — all embedded content is
+   merged into a single content field.
+2. A metadata filter automatically strips all metadata except `X-TIKA:content` and
+   `X-TIKA:CONTAINER_EXCEPTION` (for error tracking).
+3. When the emitter is a `StreamEmitter` (such as the filesystem or S3 emitter), the
+   raw content string is written directly as bytes — no JSON serialization.
+
+=== Metadata Filtering
+
+By default, `CONTENT_ONLY` mode applies an `IncludeFieldMetadataFilter` that retains
+only `X-TIKA:content` and `X-TIKA:CONTAINER_EXCEPTION`. If you set your own
+`MetadataFilter` on the `ParseContext`, your filter takes priority.
+
+=== CLI Usage
+
+The `tika-async-cli` batch processor supports `CONTENT_ONLY` via the `--content-only`
+flag:
+
+[source,bash]
+----
+java -jar tika-async-cli.jar -i /input -o /output -h m --content-only
+----
+
+This produces `.md` files (when using the `m` handler type) containing only the
+extracted Markdown content.
+
+=== Content Handler Types
+
+The content format depends on the configured handler type:
+
+[cols="1,1,2"]
+|===
+|Handler |Extension |Description
+
+|`t` (text)
+|`.txt`
+|Plain text output
+
+|`h` (html)
+|`.html`
+|HTML output
+
+|`x` (xml)
+|`.xml`
+|XHTML output
+
+|`m` (markdown)
+|`.md`
+|Markdown output
+
+|`b` (body)
+|`.txt`
+|Body content handler output
+|===
@@ -83,6 +83,9 @@ java -jar tika-app.jar [option...] [file|port...]
 |`-t` or `--text`
 |Output plain text
 
+|`--md`
+|Output Markdown
+
 |`-m` or `--metadata`
 |Output metadata only
 
@@ -124,6 +127,13 @@ Process entire directories by specifying input and output paths:
 java -jar tika-app.jar -i /path/to/input -o /path/to/output
 ----
 
+=== Extract Markdown from a file
+
+[source,bash]
+----
+java -jar tika-app.jar --md document.docx
+----
+
 === Custom configuration
 
 Use a custom configuration file:
@@ -132,3 +142,68 @@ Use a custom configuration file:
 ----
 java -jar tika-app.jar --config=tika-config.json document.pdf
 ----
+
+== Batch Processing (tika-async-cli)
+
+For processing large numbers of files, use `tika-async-cli`. It uses the Tika Pipes
+architecture with forked JVM processes for fault tolerance.
+
+=== Basic Batch Usage
+
+[source,bash]
+----
+java -jar tika-async-cli.jar -i /path/to/input -o /path/to/output
+----
+
+This processes all files in the input directory and writes JSON metadata (RMETA format)
+to the output directory.
+
+=== Batch Options
+
+[cols="1,3"]
+|===
+|Option |Description
+
+|`-i`
+|Input directory
+
+|`-o`
+|Output directory
+
+|`-h` or `--handlerType`
+|Content handler type: `t`=text, `h`=html, `x`=xml, `m`=markdown, `b`=body, `i`=ignore (default: `t`)
+
+|`--concatenate`
+|Concatenate content from all embedded documents into a single content field
+
+|`--content-only`
+|Output only extracted content (no metadata, no JSON wrapper); implies `--concatenate`
+
+|`-T` or `--timeoutMs`
+|Timeout for each parse in milliseconds
+
+|`-n` or `--numClients`
+|Number of parallel forked processes
+
+|`-p` or `--pluginsDir`
+|Plugins directory
+|===
+
+=== Batch Examples
+
+Extract Markdown content only (no metadata) from all files:
+
+[source,bash]
+----
+java -jar tika-async-cli.jar -i /path/to/input -o /path/to/output -h m --content-only
+----
+
+This produces `.md` files in the output directory containing just the extracted Markdown
+content — no JSON wrappers, no metadata fields.
+
+Extract text with all metadata in concatenated mode:
+
+[source,bash]
+----
+java -jar tika-async-cli.jar -i /path/to/input -o /path/to/output --concatenate
+----
@@ -100,12 +100,24 @@ For example, use `TikaInputStream.get(path)` for a `Path`, or `TikaInputStream.g
 for a `byte[]`. This allows Tika to access the underlying resource efficiently and enables
 features like mark/reset support that many parsers and detectors require.
 
-=== Utility Classes
+=== Content Handlers
+
+Tika provides several content handlers that control the output format:
 
 **BodyContentHandler**:: Extracts and converts the body content to streams or strings.
 
+**ToTextContentHandler**:: Outputs plain text.
+
+**ToHTMLContentHandler**:: Outputs HTML.
+
+**ToXMLContentHandler**:: Outputs XHTML/XML.
+
+**ToMarkdownContentHandler**:: Outputs Markdown, preserving structural semantics like headings, lists, tables, code blocks, emphasis, and links.
+
 **ParsingReader**:: Uses background threading to return extracted text as character streams.
 
+Use `BasicContentHandlerFactory` to create handlers by type: `TEXT`, `HTML`, `XML`, `BODY`, `MARKDOWN`, `IGNORE`.
+
 === Key Metadata Properties
 
 * `TikaCoreProperties.RESOURCE_NAME_KEY` - filename or resource identifier
 
@@ -33,6 +33,60 @@ java -jar tika-server-standard.jar
 
 The server starts on port 9998 by default.
 
+== Endpoints
+
+=== Content Extraction (`/tika`)
+
+The `/tika` endpoint extracts content from a document as plain text.
+
+[source,bash]
+----
+curl -T document.pdf http://localhost:9998/tika
+----
+
+==== Markdown Output (`/tika/md`)
+
+The `/tika/md` endpoint extracts content as Markdown, preserving structural semantics
+like headings, lists, tables, and emphasis:
+
+[source,bash]
+----
+curl -T document.docx http://localhost:9998/tika/md
+----
+
+==== Custom Handler Type
+
+Use the `X-Tika-Handler` header to control the output format. Valid values: `text` (default),
+`html`, `xml`, `markdown`, `ignore`.
+
+[source,bash]
+----
+curl -T document.pdf -H "X-Tika-Handler: markdown" http://localhost:9998/tika
+----
+
+=== Recursive Metadata (`/rmeta`)
+
+The `/rmeta` endpoint returns metadata for the container document and all embedded documents
+as a JSON array of metadata objects.
+
+[source,bash]
+----
+curl -T document.pdf http://localhost:9998/rmeta
+----
+
+The content handler can be specified in the URL path:
+
+* `/rmeta/text` - plain text content (default)
+* `/rmeta/html` - HTML content
+* `/rmeta/xml` - XHTML content
+* `/rmeta/markdown` - Markdown content
+* `/rmeta/ignore` - metadata only, no content
+
+[source,bash]
+----
+curl -T document.docx http://localhost:9998/rmeta/markdown
+----
+
 == Topics
 
 * xref:using-tika/server/tls.adoc[TLS/SSL Configuration] - Secure your server with TLS and mutual authentication
 
@@ -100,6 +100,7 @@
 import org.apache.tika.sax.ContentHandlerFactory;
 import org.apache.tika.sax.ExpandedTitleContentHandler;
 import org.apache.tika.sax.RecursiveParserWrapperHandler;
+import org.apache.tika.sax.ToMarkdownContentHandler;
 import org.apache.tika.sax.WriteOutContentHandler;
 import org.apache.tika.sax.boilerpipe.BoilerpipeContentHandler;
 import org.apache.tika.serialization.JsonMetadata;
@@ -225,6 +226,12 @@ public void process(TikaInputStream tis, OutputStream output, Metadata metadata)
      * Fork mode plugins directory.
      */
     private String forkPluginsDir = null;
+    private final OutputType MARKDOWN = new OutputType() { // output type selected by the --md command-line option
+        @Override
+        protected ContentHandler getContentHandler(OutputStream output, Metadata metadata) throws Exception { // metadata is not used by this output type
+            return new BodyContentHandler(new ToMarkdownContentHandler(getOutputWriter(output, encoding))); // body content only, rendered as Markdown through getOutputWriter(output, encoding)
+        }
+    };
     private final OutputType XML = new OutputType() {
         @Override
         protected ContentHandler getContentHandler(OutputStream output, Metadata metadata) throws Exception {
@@ -483,6 +490,8 @@ public void process(String arg) throws Exception {
             type = XML;
         } else if (arg.equals("-h") || arg.equals("--html")) {
             type = HTML;
+        } else if (arg.equals("--md")) {
+            type = MARKDOWN;
         } else if (arg.equals("-t") || arg.equals("--text")) {
             type = TEXT;
         } else if (arg.equals("-T") || arg.equals("--text-main")) {
@@ -744,6 +753,7 @@ private void usage() {
         out.println("    -x  or --xml           Output XHTML content (default)");
         out.println("    -h  or --html          Output HTML content");
         out.println("    -t  or --text          Output plain text content (body)");
+        out.println("    --md                   Output Markdown content (body)");
         out.println("    -T  or --text-main     Output plain text content (main content only via boilerpipe handler)");
         out.println("    -A  or --text-all      Output all text content");
         out.println("    -m  or --metadata      Output only metadata");