From f8ae67d759bfd279a92ab9b7d89b1218f28171c9 Mon Sep 17 00:00:00 2001 From: Socrates Date: Fri, 27 Feb 2026 09:50:56 +0800 Subject: [PATCH 1/8] docs: add unified external meta cache usage (4.0.4+) --- docs/lakehouse/catalogs/hive-catalog.mdx | 46 ++++++++++ docs/lakehouse/catalogs/hudi-catalog.md | 46 ++++++++++ docs/lakehouse/catalogs/iceberg-catalog.mdx | 44 ++++++++++ docs/lakehouse/catalogs/maxcompute-catalog.md | 43 ++++++++++ docs/lakehouse/catalogs/paimon-catalog.mdx | 42 +++++++++ docs/lakehouse/meta-cache.md | 19 ++++- .../meta-cache/unified-meta-cache.md | 84 ++++++++++++++++++ .../lakehouse/catalogs/hive-catalog.mdx | 45 ++++++++++ .../lakehouse/catalogs/hudi-catalog.md | 46 +++++++++- .../lakehouse/catalogs/iceberg-catalog.mdx | 43 ++++++++++ .../lakehouse/catalogs/maxcompute-catalog.md | 42 +++++++++ .../lakehouse/catalogs/paimon-catalog.mdx | 41 +++++++++ .../current/lakehouse/meta-cache.md | 18 ++++ .../meta-cache/unified-meta-cache.md | 85 +++++++++++++++++++ sidebars.ts | 1 + 15 files changed, 643 insertions(+), 2 deletions(-) create mode 100644 docs/lakehouse/meta-cache/unified-meta-cache.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/meta-cache/unified-meta-cache.md diff --git a/docs/lakehouse/catalogs/hive-catalog.mdx b/docs/lakehouse/catalogs/hive-catalog.mdx index 7f6ecf352acf3..ed7a79fc09d47 100644 --- a/docs/lakehouse/catalogs/hive-catalog.mdx +++ b/docs/lakehouse/catalogs/hive-catalog.mdx @@ -76,6 +76,52 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( The CommonProperties section is for entering common attributes. Please see the "Common Properties" section in the [Catalog Overview](../catalog-overview.md). +## Metadata Cache (4.0.4+) {#meta-cache-404} + +Starting from Doris 4.0.4, Hive Catalog metadata caches are configured with the unified `meta.cache.*` properties. +This section focuses on **how to use** and **how to observe** the Hive-related cache modules. 
+ +For the unified property semantics, see: [Unified External Meta Cache (4.0.4+)](../meta-cache/unified-meta-cache.md). + +### Cache Modules {#meta-cache-404-modules} + +| Module | Property key prefix | Cached content (typical) | +|---|---|---| +| `partition-values` | `meta.cache.hive.partition-values.` | Partition values/names list used by partition pruning and partition enumeration. | +| `partition` | `meta.cache.hive.partition.` | Partition properties (location, input format, storage descriptor, etc.). | +| `file` | `meta.cache.hive.file.` | File listing under partition/table paths (reduces remote LIST overhead). | + +Example (disable file listing cache for freshness): + +```sql +ALTER CATALOG hive_ctl SET PROPERTIES ( + "meta.cache.hive.file.ttl-second" = "0" +); +``` + +### Observability {#meta-cache-404-observability} + +Hive cache metrics are available in `information_schema.catalog_meta_cache_statistics`. +For the table definition and metric meanings, see: [catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md). + +The `cache_name` values for Hive modules are: + +| Module | cache_name | +|---|---| +| `partition-values` | `hive_partition_values_cache` | +| `partition` | `hive_partition_cache` | +| `file` | `hive_file_cache` | + +Example query (filter one catalog and Hive caches): + +```sql +SELECT * +FROM information_schema.catalog_meta_cache_statistics +WHERE catalog_name = 'hive_ctl' + AND cache_name LIKE 'hive_%' +ORDER BY cache_name, metric_name; +``` + ### Supported Hive Versions Supports Hive 1.x, 2.x, 3.x, and 4.x. 
diff --git a/docs/lakehouse/catalogs/hudi-catalog.md b/docs/lakehouse/catalogs/hudi-catalog.md index 22b8f227ac30d..1dcd347cc566a 100644 --- a/docs/lakehouse/catalogs/hudi-catalog.md +++ b/docs/lakehouse/catalogs/hudi-catalog.md @@ -51,6 +51,52 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( | ------------------------------- | -------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | | `hudi.use_hive_sync_partition` | `use_hive_sync_partition` | Whether to use the partition information already synchronized by Hive Metastore. If true, partition information will be obtained directly from Hive Metastore. Otherwise, it will be obtained from the metadata file of the file system. Obtaining information from Hive Metastore is more efficient, but users need to ensure that the latest metadata has been synchronized to Hive Metastore. | false | +## Metadata Cache (4.0.4+) {#meta-cache-404} + +Starting from Doris 4.0.4, Hudi-related metadata caches are configured with the unified `meta.cache.*` properties. +This section focuses on **how to use** and **how to observe** the Hudi cache modules. + +For the unified property semantics, see: [Unified External Meta Cache (4.0.4+)](../meta-cache/unified-meta-cache.md). + +### Cache Modules {#meta-cache-404-modules} + +| Module | Property key prefix | Cached content (typical) | +|---|---|---| +| `partition` | `meta.cache.hudi.partition.` | Hudi partition-related metadata (used by partition discovery/pruning). | +| `fs-view` | `meta.cache.hudi.fs-view.` | Hudi filesystem view related metadata. | +| `meta-client` | `meta.cache.hudi.meta-client.` | Hudi meta client related metadata. 
| + +Example (reduce cache footprint by lowering capacity): + +```sql +ALTER CATALOG hudi_ctl SET PROPERTIES ( + "meta.cache.hudi.partition.capacity" = "2000" +); +``` + +### Observability {#meta-cache-404-observability} + +Hudi cache metrics are available in `information_schema.catalog_meta_cache_statistics`. +For the table definition and metric meanings, see: [catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md). + +The `cache_name` values for Hudi modules are: + +| Module | cache_name | +|---|---| +| `partition` | `hudi_partition_cache` | +| `fs-view` | `hudi_fs_view_cache` | +| `meta-client` | `hudi_meta_client_cache` | + +Example query: + +```sql +SELECT * +FROM information_schema.catalog_meta_cache_statistics +WHERE catalog_name = 'hudi_ctl' + AND cache_name LIKE 'hudi_%' +ORDER BY cache_name, metric_name; +``` + ### Supported Hudi Versions The current dependent Hudi version is 0.15. It is recommended to access Hudi data version 0.14 and above. diff --git a/docs/lakehouse/catalogs/iceberg-catalog.mdx b/docs/lakehouse/catalogs/iceberg-catalog.mdx index ce3418e3cc1c3..da090521381bf 100644 --- a/docs/lakehouse/catalogs/iceberg-catalog.mdx +++ b/docs/lakehouse/catalogs/iceberg-catalog.mdx @@ -85,6 +85,50 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( The CommonProperties section is for entering general properties. See the [Catalog Overview](../catalog-overview.md) for details on common properties. +## Metadata Cache (4.0.4+) {#meta-cache-404} + +Starting from Doris 4.0.4, Iceberg Catalog metadata caches are configured with the unified `meta.cache.*` properties. +This section focuses on **how to use** and **how to observe** the Iceberg-related cache modules. + +For the unified property semantics, see: [Unified External Meta Cache (4.0.4+)](../meta-cache/unified-meta-cache.md). 
+ +### Cache Modules {#meta-cache-404-modules} + +| Module | Property key prefix | Cached content (typical) | +|---|---|---| +| `table` | `meta.cache.iceberg.table.` | Iceberg table metadata object (reduces catalog/metastore round trips). | +| `manifest` | `meta.cache.iceberg.manifest.` | Manifest-related metadata (reduces repeated manifest access overhead). | + +Example (shorter TTL for manifest to prioritize freshness): + +```sql +ALTER CATALOG iceberg_ctl SET PROPERTIES ( + "meta.cache.iceberg.manifest.ttl-second" = "600" +); +``` + +### Observability {#meta-cache-404-observability} + +Iceberg cache metrics are available in `information_schema.catalog_meta_cache_statistics`. +For the table definition and metric meanings, see: [catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md). + +The `cache_name` values for Iceberg modules are: + +| Module | cache_name | +|---|---| +| `table` | `iceberg_table_cache` | +| `manifest` | `iceberg_manifest_cache` | + +Example query: + +```sql +SELECT * +FROM information_schema.catalog_meta_cache_statistics +WHERE catalog_name = 'iceberg_ctl' + AND cache_name LIKE 'iceberg_%' +ORDER BY cache_name, metric_name; +``` + ### Supported Iceberg Versions | Doris Version | Iceberg SDK Version | diff --git a/docs/lakehouse/catalogs/maxcompute-catalog.md b/docs/lakehouse/catalogs/maxcompute-catalog.md index e611f96f12f0e..cf22afa108dbd 100644 --- a/docs/lakehouse/catalogs/maxcompute-catalog.md +++ b/docs/lakehouse/catalogs/maxcompute-catalog.md @@ -111,6 +111,49 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( The CommonProperties section is used to fill in common properties. Please refer to the "Common Properties" section in [Catalog Overview](../catalog-overview.md). +## Metadata Cache (4.0.4+) {#meta-cache-404} + +Starting from Doris 4.0.4, MaxCompute Catalog metadata caches are configured with the unified `meta.cache.*` properties. 
+This section focuses on **how to use** and **how to observe** the MaxCompute-related cache module. + +For the unified property semantics, see: [Unified External Meta Cache (4.0.4+)](../meta-cache/unified-meta-cache.md). + +### Cache Modules {#meta-cache-404-modules} + +| Module | Property key prefix | Cached content (typical) | +|---|---|---| +| `partition-values` | `meta.cache.maxcompute.partition-values.` | Partition values list (reduces repeated remote listing overhead). | + +Example: + +```sql +ALTER CATALOG mc_ctl SET PROPERTIES ( + "meta.cache.maxcompute.partition-values.ttl-second" = "3600", + "meta.cache.maxcompute.partition-values.capacity" = "5000" +); +``` + +### Observability {#meta-cache-404-observability} + +MaxCompute cache metrics are available in `information_schema.catalog_meta_cache_statistics`. +For the table definition and metric meanings, see: [catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md). + +The `cache_name` value for MaxCompute module is: + +| Module | cache_name | +|---|---| +| `partition-values` | `maxcompute_partition_values_cache` | + +Example query: + +```sql +SELECT * +FROM information_schema.catalog_meta_cache_statistics +WHERE catalog_name = 'mc_ctl' + AND cache_name LIKE 'maxcompute_%' +ORDER BY cache_name, metric_name; +``` + ### Supported MaxCompute Versions Only the public cloud version of MaxCompute is supported. For private cloud version support, please contact Doris community support. diff --git a/docs/lakehouse/catalogs/paimon-catalog.mdx b/docs/lakehouse/catalogs/paimon-catalog.mdx index 440a48d04d149..0a6a183a40bdd 100644 --- a/docs/lakehouse/catalogs/paimon-catalog.mdx +++ b/docs/lakehouse/catalogs/paimon-catalog.mdx @@ -90,6 +90,48 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( The CommonProperties section is used to fill in common properties. Please refer to the [Catalog Overview](../catalog-overview.md) section on [Common Properties]. 
+## Metadata Cache (4.0.4+) {#meta-cache-404} + +Starting from Doris 4.0.4, Paimon Catalog metadata caches are configured with the unified `meta.cache.*` properties. +This section focuses on **how to use** and **how to observe** the Paimon-related cache modules. + +For the unified property semantics, see: [Unified External Meta Cache (4.0.4+)](../meta-cache/unified-meta-cache.md). + +### Cache Modules {#meta-cache-404-modules} + +| Module | Property key prefix | Cached content (typical) | +|---|---|---| +| `table` | `meta.cache.paimon.table.` | Paimon table metadata used for query planning (schema/snapshot/partition related metadata, depending on workload). | + +Example (disable module cache and always load on demand): + +```sql +ALTER CATALOG paimon_ctl SET PROPERTIES ( + "meta.cache.paimon.table.ttl-second" = "0" +); +``` + +### Observability {#meta-cache-404-observability} + +Paimon cache metrics are available in `information_schema.catalog_meta_cache_statistics`. +For the table definition and metric meanings, see: [catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md). + +The `cache_name` value for Paimon module is: + +| Module | cache_name | +|---|---| +| `table` | `paimon_table_cache` | + +Example query: + +```sql +SELECT * +FROM information_schema.catalog_meta_cache_statistics +WHERE catalog_name = 'paimon_ctl' + AND cache_name LIKE 'paimon_%' +ORDER BY cache_name, metric_name; +``` + ### Supported Paimon Versions The currently dependent Paimon version is 1.0.0. diff --git a/docs/lakehouse/meta-cache.md b/docs/lakehouse/meta-cache.md index 16985562ea399..def91a4f57826 100644 --- a/docs/lakehouse/meta-cache.md +++ b/docs/lakehouse/meta-cache.md @@ -18,6 +18,11 @@ For **data cache**, refer to the [data cache documentation](./data-cache.md). This document applies to versions after 2.1.6. 
::: +:::note +For Doris 4.0.4 and later, external meta cache has been refactored with unified configuration keys `meta.cache.*`. +See [Unified External Meta Cache (4.0.4+)](./meta-cache/unified-meta-cache.md). +::: + ## Cache Strategies Most caches have the following three strategy indicators: @@ -321,6 +326,12 @@ This section mainly introduces the cache behavior that users may be concerned ab For all types of External Catalogs, if you want to see the latest Table Schema in real time, you can disable the Schema Cache: +:::note +Starting from Doris 4.0.4, the legacy catalog-level cache property `schema.cache.ttl-second` is deprecated. +For 4.0.4+, keep using the FE config method below, and refer to: +[Unified External Meta Cache (4.0.4+)](./meta-cache/unified-meta-cache.md). +::: + - Disable globally ```text @@ -341,6 +352,13 @@ After setting, Doris will see the latest Table Schema in real time. However, thi For Hive Catalog, if you want to disable the cache to query real-time updated data, you can configure the following parameters: +:::note +Starting from Doris 4.0.4, the legacy catalog-level properties `file.meta.cache.ttl-second` and `partition.cache.ttl-second` +are deprecated. Use unified `meta.cache.hive.*` properties instead. See: +[Hive Catalog](./catalogs/hive-catalog.mdx#meta-cache-404) and +[Unified External Meta Cache (4.0.4+)](./meta-cache/unified-meta-cache.md). +::: + - Disable globally ```text @@ -363,4 +381,3 @@ After setting the above parameters: - Changes in partition data files can be queried in real time. But this will increase the access pressure on external data sources (such as Hive Metastore and HDFS), which may cause unstable metadata access latency and other phenomena. 
- diff --git a/docs/lakehouse/meta-cache/unified-meta-cache.md b/docs/lakehouse/meta-cache/unified-meta-cache.md new file mode 100644 index 0000000000000..32a7579f539f2 --- /dev/null +++ b/docs/lakehouse/meta-cache/unified-meta-cache.md @@ -0,0 +1,84 @@ +--- +{ + "title": "Unified External Meta Cache (4.0.4+)", + "language": "en", + "description": "User guide for unified external metadata cache: unified meta.cache.* properties, what is cached, and where to configure per catalog." +} +--- + +Starting from **Doris 4.0.4**, external metadata caching is unified for major External Catalog engines. As a user, you only need to know: + +| You want to know | Where in docs | +|---|---| +| Where to configure | Catalog `PROPERTIES` with `meta.cache.*` keys (see the catalog pages linked below). | +| What it affects | Depends on catalog engine (partitions, file listing, table metadata, manifests, etc.). | +| How to observe | `information_schema.catalog_meta_cache_statistics` (see the observability section below). | + +:::tip +Applies to Doris 4.0.4 and later. +::: + +## Unified Property Model + +All engine cache modules share the same property key pattern: + +`meta.cache.<engine>.<module>.{enable,ttl-second,capacity}` + +The following table describes the property semantics: + +| Property | Example | Meaning | +|---|---|---| +| `enable` | `true/false` | Whether this cache module is enabled. | +| `ttl-second` | `600`, `0`, `-1` | `0` disables the module; `-1` means no expiration; any other value is the TTL in seconds, counted from the last access. | +| `capacity` | `10000` | Max entry count (count-based). `0` disables the module. | + +Example (edit catalog properties): + +```sql +ALTER CATALOG hive_ctl SET PROPERTIES ( + "meta.cache.hive.file.ttl-second" = "0" +); +``` + +## What External Meta Cache Includes + +External meta cache covers different kinds of metadata. 
Some are configured by unified catalog properties, and some are controlled by FE configs: + +| Category | Examples | How to configure | +|---|---|---| +| Engine module caches | Hive partitions/files, Iceberg manifests, Paimon table metadata, etc. | Catalog `PROPERTIES`: `meta.cache.<engine>.<module>.*` | +| Schema cache | Table schema, isolated by schema version token | FE configs (for example: `max_external_schema_cache_num`) | + +## Catalog-Specific Configuration (Links) + +For each catalog engine, the supported cache modules and the recommended properties are documented in its catalog page: + +| Catalog engine | Where to configure module caches | +|---|---| +| Hive | [Hive Catalog](../catalogs/hive-catalog.mdx#meta-cache-404) | +| Iceberg | [Iceberg Catalog](../catalogs/iceberg-catalog.mdx#meta-cache-404) | +| Paimon | [Paimon Catalog](../catalogs/paimon-catalog.mdx#meta-cache-404) | +| Hudi | [Hudi Catalog](../catalogs/hudi-catalog.md#meta-cache-404) | +| MaxCompute | [MaxCompute Catalog](../catalogs/maxcompute-catalog.md#meta-cache-404) | + +## Observability + +Use the system table to observe cache metrics: + +```sql +SELECT * +FROM information_schema.catalog_meta_cache_statistics +ORDER BY catalog_name, cache_name, metric_name; +``` + +This table is documented at: [catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md). + +Naming convention: + +| Field | Convention | +|---|---| +| `cache_name` | `<engine>_<module>_cache` (any `-` in the module name is converted to `_`) | + +## Migration Note (Legacy Properties) + +Starting from Doris 4.0.4, legacy catalog cache properties (for example, `schema.cache.ttl-second`, `file.meta.cache.ttl-second`) are deprecated. Use `meta.cache.*` properties instead and follow the catalog-specific pages above. 
diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/hive-catalog.mdx b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/hive-catalog.mdx index 119a293d68c82..ce1e582e851c3 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/hive-catalog.mdx +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/hive-catalog.mdx @@ -78,6 +78,51 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( CommonProperties 部分用于填写通用属性。请参阅[ 数据目录概述 ](../catalog-overview.md)中【通用属性】部分。 +## 元数据缓存(4.0.4+) {#meta-cache-404} + +从 Doris 4.0.4 开始,Hive Catalog 的外表元数据缓存使用统一键 `meta.cache.*` 进行配置。本节只介绍**如何使用**与**如何观测**。 + +统一属性语义可参阅:[统一外表元数据缓存(4.0.4+)](../meta-cache/unified-meta-cache.md)。 + +### 缓存模块 {#meta-cache-404-modules} + +| 模块 | 属性键前缀 | 典型缓存内容 | +|---|---|---| +| `partition-values` | `meta.cache.hive.partition-values.` | 分区值/分区名称列表(常用于分区剪枝与分区枚举)。 | +| `partition` | `meta.cache.hive.partition.` | 分区属性(location、输入格式、存储描述等)。 | +| `file` | `meta.cache.hive.file.` | 分区/表路径下的文件列表(减少远端 LIST 开销)。 | + +示例(为保证新鲜度,关闭文件列表缓存): + +```sql +ALTER CATALOG hive_ctl SET PROPERTIES ( + "meta.cache.hive.file.ttl-second" = "0" +); +``` + +### 可观测性 {#meta-cache-404-observability} + +Hive 缓存指标可通过 `information_schema.catalog_meta_cache_statistics` 查询。 +系统表字段与指标说明见:[catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md)。 + +Hive 各模块对应的 `cache_name` 如下: + +| 模块 | cache_name | +|---|---| +| `partition-values` | `hive_partition_values_cache` | +| `partition` | `hive_partition_cache` | +| `file` | `hive_file_cache` | + +示例(只看某个 catalog 的 Hive 缓存): + +```sql +SELECT * +FROM information_schema.catalog_meta_cache_statistics +WHERE catalog_name = 'hive_ctl' + AND cache_name LIKE 'hive_%' +ORDER BY cache_name, metric_name; +``` + ### 支持的 Hive 版本 支持 Hive 1.x,2.x,3.x,4.x。 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/hudi-catalog.md 
b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/hudi-catalog.md index cb4ac7cc702bc..0e52dfcdca59b 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/hudi-catalog.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/hudi-catalog.md @@ -51,6 +51,51 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( | ------------------------------- | -------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | ----- | | `hudi.use_hive_sync_partition` | `use_hive_sync_partition` | 是否使用 Hive Metastore 已同步的分区信息。如果为 true,则会直接从 Hive Metastore 中获取分区信息。否则,会从文件系统的元数据文件中获取分区信息。通过 Hive Metastore 获取信息性能更好,但需要用户保证最新的元数据已经同步到了 Hive Metastore。 | false | +## 元数据缓存(4.0.4+) {#meta-cache-404} + +从 Doris 4.0.4 开始,Hudi 相关外表元数据缓存使用统一键 `meta.cache.*` 进行配置。本节只介绍**如何使用**与**如何观测**。 + +统一属性语义可参阅:[统一外表元数据缓存(4.0.4+)](../meta-cache/unified-meta-cache.md)。 + +### 缓存模块 {#meta-cache-404-modules} + +| 模块 | 属性键前缀 | 典型缓存内容 | +|---|---|---| +| `partition` | `meta.cache.hudi.partition.` | Hudi 分区相关元数据(用于分区发现/剪枝等)。 | +| `fs-view` | `meta.cache.hudi.fs-view.` | Hudi FS View 相关元数据。 | +| `meta-client` | `meta.cache.hudi.meta-client.` | Hudi Meta Client 相关元数据。 | + +示例(通过降低 capacity 控制缓存规模): + +```sql +ALTER CATALOG hudi_ctl SET PROPERTIES ( + "meta.cache.hudi.partition.capacity" = "2000" +); +``` + +### 可观测性 {#meta-cache-404-observability} + +Hudi 缓存指标可通过 `information_schema.catalog_meta_cache_statistics` 查询。 +系统表字段与指标说明见:[catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md)。 + +Hudi 各模块对应的 `cache_name` 如下: + +| 模块 | cache_name | +|---|---| +| `partition` | `hudi_partition_cache` | +| `fs-view` | `hudi_fs_view_cache` | +| `meta-client` | `hudi_meta_client_cache` | + +示例: + +```sql +SELECT * +FROM information_schema.catalog_meta_cache_statistics +WHERE 
catalog_name = 'hudi_ctl' + AND cache_name LIKE 'hudi_%' +ORDER BY cache_name, metric_name; +``` + ### 支持的 Hudi 版本 当前依赖的 Hudi 版本为 0.15。推荐访问 0.14 版本以上的 Hudi 数据。 @@ -226,4 +271,3 @@ SELECT * from hudi_table@incr('beginTime'='xxx', ['endTime'='xxx'], ['hoodie.rea | Doris 版本 | 功能支持 | | ----------- | ----------------------------------------- | | 2.1.8/3.0.4 | Hudi 依赖升级到 0.15。新增 Hadoop Hudi JNI Scanner。 | - diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/iceberg-catalog.mdx b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/iceberg-catalog.mdx index 018bc76a690be..743b88306ed2d 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/iceberg-catalog.mdx +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/iceberg-catalog.mdx @@ -87,6 +87,49 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( CommonProperties 部分用于填写通用属性。请参阅[数据目录概述](../catalog-overview.md)中【通用属性】部分。 +## 元数据缓存(4.0.4+) {#meta-cache-404} + +从 Doris 4.0.4 开始,Iceberg Catalog 的外表元数据缓存使用统一键 `meta.cache.*` 进行配置。本节只介绍**如何使用**与**如何观测**。 + +统一属性语义可参阅:[统一外表元数据缓存(4.0.4+)](../meta-cache/unified-meta-cache.md)。 + +### 缓存模块 {#meta-cache-404-modules} + +| 模块 | 属性键前缀 | 典型缓存内容 | +|---|---|---| +| `table` | `meta.cache.iceberg.table.` | Iceberg 表元数据对象(减少 catalog/metastore 往返)。 | +| `manifest` | `meta.cache.iceberg.manifest.` | manifest 相关元数据(减少重复读取 manifest 的开销)。 | + +示例(缩短 manifest TTL,优先新鲜度): + +```sql +ALTER CATALOG iceberg_ctl SET PROPERTIES ( + "meta.cache.iceberg.manifest.ttl-second" = "600" +); +``` + +### 可观测性 {#meta-cache-404-observability} + +Iceberg 缓存指标可通过 `information_schema.catalog_meta_cache_statistics` 查询。 +系统表字段与指标说明见:[catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md)。 + +Iceberg 各模块对应的 `cache_name` 如下: + +| 模块 | cache_name | +|---|---| +| `table` | `iceberg_table_cache` | +| `manifest` | `iceberg_manifest_cache` | + +示例: + +```sql 
+SELECT * +FROM information_schema.catalog_meta_cache_statistics +WHERE catalog_name = 'iceberg_ctl' + AND cache_name LIKE 'iceberg_%' +ORDER BY cache_name, metric_name; +``` + ### 支持的 Iceberg 版本 | Doris 版本 | Iceberg SDK 版本 | diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/maxcompute-catalog.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/maxcompute-catalog.md index d86e2b4c12be1..e9f1436005945 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/maxcompute-catalog.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/maxcompute-catalog.md @@ -111,6 +111,48 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( CommonProperties 部分用于填写通用属性。请参阅[数据目录概述](../catalog-overview.md)中「通用属性」部分。 +## 元数据缓存(4.0.4+) {#meta-cache-404} + +从 Doris 4.0.4 开始,MaxCompute Catalog 的外表元数据缓存使用统一键 `meta.cache.*` 进行配置。本节只介绍**如何使用**与**如何观测**。 + +统一属性语义可参阅:[统一外表元数据缓存(4.0.4+)](../meta-cache/unified-meta-cache.md)。 + +### 缓存模块 {#meta-cache-404-modules} + +| 模块 | 属性键前缀 | 典型缓存内容 | +|---|---|---| +| `partition-values` | `meta.cache.maxcompute.partition-values.` | 分区值列表(减少重复的远端枚举开销)。 | + +示例: + +```sql +ALTER CATALOG mc_ctl SET PROPERTIES ( + "meta.cache.maxcompute.partition-values.ttl-second" = "3600", + "meta.cache.maxcompute.partition-values.capacity" = "5000" +); +``` + +### 可观测性 {#meta-cache-404-observability} + +MaxCompute 缓存指标可通过 `information_schema.catalog_meta_cache_statistics` 查询。 +系统表字段与指标说明见:[catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md)。 + +MaxCompute 模块对应的 `cache_name` 如下: + +| 模块 | cache_name | +|---|---| +| `partition-values` | `maxcompute_partition_values_cache` | + +示例: + +```sql +SELECT * +FROM information_schema.catalog_meta_cache_statistics +WHERE catalog_name = 'mc_ctl' + AND cache_name LIKE 'maxcompute_%' +ORDER BY cache_name, metric_name; +``` + ### 支持的 MaxCompute 版本 仅支持公有云版本的 
MaxCompute。私有云版本支持请联系 Doris 社区支持。 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/paimon-catalog.mdx b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/paimon-catalog.mdx index e313fd451c644..01ce87bf4f65b 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/paimon-catalog.mdx +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/paimon-catalog.mdx @@ -90,6 +90,47 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( CommonProperties 部分用于填写通用属性。请参阅[数据目录概述](../catalog-overview.md)中【通用属性】部分。 +## 元数据缓存(4.0.4+) {#meta-cache-404} + +从 Doris 4.0.4 开始,Paimon Catalog 的外表元数据缓存使用统一键 `meta.cache.*` 进行配置。本节只介绍**如何使用**与**如何观测**。 + +统一属性语义可参阅:[统一外表元数据缓存(4.0.4+)](../meta-cache/unified-meta-cache.md)。 + +### 缓存模块 {#meta-cache-404-modules} + +| 模块 | 属性键前缀 | 典型缓存内容 | +|---|---|---| +| `table` | `meta.cache.paimon.table.` | Paimon 表元数据(用于查询规划,实际涉及 schema/snapshot/partition 等元数据加载)。 | + +示例(关闭 module 缓存,按需实时加载): + +```sql +ALTER CATALOG paimon_ctl SET PROPERTIES ( + "meta.cache.paimon.table.ttl-second" = "0" +); +``` + +### 可观测性 {#meta-cache-404-observability} + +Paimon 缓存指标可通过 `information_schema.catalog_meta_cache_statistics` 查询。 +系统表字段与指标说明见:[catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md)。 + +Paimon 模块对应的 `cache_name` 如下: + +| 模块 | cache_name | +|---|---| +| `table` | `paimon_table_cache` | + +示例: + +```sql +SELECT * +FROM information_schema.catalog_meta_cache_statistics +WHERE catalog_name = 'paimon_ctl' + AND cache_name LIKE 'paimon_%' +ORDER BY cache_name, metric_name; +``` + ### 支持的 Paimon 版本 当前依赖的 Paimon 版本为 1.0.0。 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/meta-cache.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/meta-cache.md index 1c35945043b6d..0cb9d3664ce65 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/meta-cache.md +++ 
b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/meta-cache.md @@ -18,6 +18,11 @@ 该文档适用于 2.1.6 之后的版本。 ::: +:::note +对于 Doris 4.0.4 及之后版本,外表元数据缓存已重构并使用统一配置键 `meta.cache.*`。 +请参阅[统一外表元数据缓存(4.0.4+)](./meta-cache/unified-meta-cache.md)。 +::: + ## 缓存策略 大多数缓存都有如下三个策略指标: @@ -321,6 +326,12 @@ CREATE CATALOG hive PROPERTIES ( 对于所有类型的 External Catalog,如果希望实时可见最新的 Table Schema,可以关闭 Schema Cache: +:::note +从 Doris 4.0.4 开始,旧的 catalog 级缓存参数 `schema.cache.ttl-second` 已不再推荐使用。 +对于 4.0.4+,仍可使用下面的 FE 配置方式进行全局控制,并参考: +[统一外表元数据缓存(4.0.4+)](./meta-cache/unified-meta-cache.md)。 +::: + - 全局关闭 ```text @@ -341,6 +352,13 @@ CREATE CATALOG hive PROPERTIES ( 针对 Hive Catalog,如果想关闭缓存来查询到实时更新的数据,可以配置以下参数: +:::note +从 Doris 4.0.4 开始,旧的 catalog 级参数 `file.meta.cache.ttl-second` 和 `partition.cache.ttl-second` +已不再推荐使用。请改用统一键 `meta.cache.hive.*`,并参考: +[Hive Catalog](./catalogs/hive-catalog.mdx#meta-cache-404) 与 +[统一外表元数据缓存(4.0.4+)](./meta-cache/unified-meta-cache.md)。 +::: + - 全局关闭 ```text diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/meta-cache/unified-meta-cache.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/meta-cache/unified-meta-cache.md new file mode 100644 index 0000000000000..e47a53f213732 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/meta-cache/unified-meta-cache.md @@ -0,0 +1,85 @@ +--- +{ + "title": "统一外表元数据缓存(4.0.4+)", + "language": "zh-CN", + "description": "面向用户的统一外表元数据缓存使用说明:统一配置键 meta.cache.*、缓存覆盖范围、以及各类 Catalog 的配置入口。" +} +--- + +从 **Doris 4.0.4** 开始,External Catalog 的外表元数据缓存能力进行了统一化重构。对用户来说,主要关注三件事: + +| 你需要关心的问题 | 对应入口 | +|---|---| +| 在哪里配置 | 在 Catalog `PROPERTIES` 里使用统一键 `meta.cache.*`(具体 module 见下方各 catalog 文档)。 | +| 影响哪些内容 | 取决于不同 catalog 引擎(分区信息、文件列表、表元数据、manifest 等)。 | +| 如何观测 | 通过 `information_schema.catalog_meta_cache_statistics` 查看指标(见本文观测章节)。 | + +:::tip +适用于 Doris 4.0.4 及之后版本。 +::: + +## 统一属性模型 + +各引擎缓存 module 使用统一的配置键格式: + +`meta.cache.<engine>.<module>.{enable,ttl-second,capacity}` + +下表说明属性语义: 
+ +| 属性 | 示例 | 含义 | +|---|---|---| +| `enable` | `true/false` | 是否启用该缓存 module。 | +| `ttl-second` | `600`、`0`、`-1` | `0` 表示关闭;`-1` 表示永不过期;其他值表示按访问时间计算 TTL。 | +| `capacity` | `10000` | 最大缓存条目数(按条目数量计)。`0` 表示关闭。 | + +示例(修改 catalog properties): + +```sql +ALTER CATALOG hive_ctl SET PROPERTIES ( + "meta.cache.hive.file.ttl-second" = "0" +); +``` + +## 外表 Meta Cache 覆盖范围 + +外表元数据缓存覆盖多种元数据类型。其中一部分由统一 `meta.cache.*` 键配置,另一部分由 FE 配置控制: + +| 类别 | 示例 | 配置方式 | +|---|---|---| +| 引擎 module 缓存 | Hive 分区/文件、Iceberg manifest、Paimon 表元数据等 | Catalog `PROPERTIES`:`meta.cache.<engine>.<module>.*` | +| Schema cache | 表 schema(按版本 token 隔离) | FE 配置(例如:`max_external_schema_cache_num`) | + +## 各类 Catalog 的配置入口(链接) + +不同 Catalog 引擎支持的缓存 module 不同,具体 module、推荐配置与可观测性请参考对应 Catalog 文档: + +| Catalog 引擎 | module 缓存配置与可观测性 | +|---|---| +| Hive | [Hive Catalog](../catalogs/hive-catalog.mdx#meta-cache-404) | +| Iceberg | [Iceberg Catalog](../catalogs/iceberg-catalog.mdx#meta-cache-404) | +| Paimon | [Paimon Catalog](../catalogs/paimon-catalog.mdx#meta-cache-404) | +| Hudi | [Hudi Catalog](../catalogs/hudi-catalog.md#meta-cache-404) | +| MaxCompute | [MaxCompute Catalog](../catalogs/maxcompute-catalog.md#meta-cache-404) | + +## 观测方式 + +通过系统表统一观测缓存指标: + +```sql +SELECT * +FROM information_schema.catalog_meta_cache_statistics +ORDER BY catalog_name, cache_name, metric_name; +``` + +该系统表文档见:[catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md)。 + +约定与常见指标: + +| 内容 | 说明 | +|---|---| +| `cache_name` | `<engine>_<module>_cache`(module 中的 `-` 会被替换为 `_`) | +| 常见指标 | `hit_ratio`、`hit_count`、`read_count`、`eviction_count`、`average_load_penalty`、`estimated_size` | + +## 旧参数迁移说明 + +从 Doris 4.0.4 开始,旧版 catalog cache 参数(例如 `schema.cache.ttl-second`、`file.meta.cache.ttl-second`)已不再推荐使用。请改用 `meta.cache.*` 统一键,并参考上文对应的 catalog 文档。 diff --git a/sidebars.ts b/sidebars.ts index 48c1d55f75218..9517e108c0942 100644 --- a/sidebars.ts +++ b/sidebars.ts @@ -455,6 +455,7 @@ const sidebars: 
SidebarsConfig = { }, 'lakehouse/data-cache', 'lakehouse/meta-cache', + 'lakehouse/meta-cache/unified-meta-cache', 'lakehouse/compute-node', 'lakehouse/statistics', { From f68ea937132b54480ed06918b68af488226a67da Mon Sep 17 00:00:00 2001 From: Socrates Date: Tue, 10 Mar 2026 15:05:25 +0800 Subject: [PATCH 2/8] docs: clarify unified external meta cache docs --- docs/lakehouse/catalogs/hive-catalog.mdx | 10 +++++----- docs/lakehouse/catalogs/hudi-catalog.md | 10 +++++----- docs/lakehouse/catalogs/iceberg-catalog.mdx | 10 +++++----- docs/lakehouse/catalogs/maxcompute-catalog.md | 10 +++++----- docs/lakehouse/catalogs/paimon-catalog.mdx | 10 +++++----- docs/lakehouse/meta-cache.md | 16 ++++++++-------- .../lakehouse/meta-cache/unified-meta-cache.md | 18 +++++++++--------- .../lakehouse/catalogs/hive-catalog.mdx | 10 +++++----- .../current/lakehouse/catalogs/hudi-catalog.md | 10 +++++----- .../lakehouse/catalogs/iceberg-catalog.mdx | 10 +++++----- .../lakehouse/catalogs/maxcompute-catalog.md | 10 +++++----- .../lakehouse/catalogs/paimon-catalog.mdx | 10 +++++----- .../current/lakehouse/meta-cache.md | 16 ++++++++-------- .../lakehouse/meta-cache/unified-meta-cache.md | 18 +++++++++--------- 14 files changed, 84 insertions(+), 84 deletions(-) diff --git a/docs/lakehouse/catalogs/hive-catalog.mdx b/docs/lakehouse/catalogs/hive-catalog.mdx index ed7a79fc09d47..f4224aa49a74b 100644 --- a/docs/lakehouse/catalogs/hive-catalog.mdx +++ b/docs/lakehouse/catalogs/hive-catalog.mdx @@ -76,14 +76,14 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( The CommonProperties section is for entering common attributes. Please see the "Common Properties" section in the [Catalog Overview](../catalog-overview.md). -## Metadata Cache (4.0.4+) {#meta-cache-404} +## Metadata Cache (4.1.x+) {#meta-cache-unified} -Starting from Doris 4.0.4, Hive Catalog metadata caches are configured with the unified `meta.cache.*` properties. 
+Starting from Doris 4.1.x, Hive Catalog metadata caches are configured with the unified `meta.cache.*` properties. This section focuses on **how to use** and **how to observe** the Hive-related cache modules. -For the unified property semantics, see: [Unified External Meta Cache (4.0.4+)](../meta-cache/unified-meta-cache.md). +For the unified property semantics, see: [Unified External Meta Cache (4.1.x+)](../meta-cache/unified-meta-cache.md). -### Cache Modules {#meta-cache-404-modules} +### Cache Modules {#meta-cache-unified-modules} | Module | Property key prefix | Cached content (typical) | |---|---|---| @@ -99,7 +99,7 @@ ALTER CATALOG hive_ctl SET PROPERTIES ( ); ``` -### Observability {#meta-cache-404-observability} +### Observability {#meta-cache-unified-observability} Hive cache metrics are available in `information_schema.catalog_meta_cache_statistics`. For the table definition and metric meanings, see: [catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md). diff --git a/docs/lakehouse/catalogs/hudi-catalog.md b/docs/lakehouse/catalogs/hudi-catalog.md index 1dcd347cc566a..1cf6645663971 100644 --- a/docs/lakehouse/catalogs/hudi-catalog.md +++ b/docs/lakehouse/catalogs/hudi-catalog.md @@ -51,14 +51,14 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( | ------------------------------- | -------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | | `hudi.use_hive_sync_partition` | `use_hive_sync_partition` | Whether to use the partition information already synchronized by Hive Metastore. If true, partition information will be obtained directly from Hive Metastore. Otherwise, it will be obtained from the metadata file of the file system. 
Obtaining information from Hive Metastore is more efficient, but users need to ensure that the latest metadata has been synchronized to Hive Metastore. | false | -## Metadata Cache (4.0.4+) {#meta-cache-404} +## Metadata Cache (4.1.x+) {#meta-cache-unified} -Starting from Doris 4.0.4, Hudi-related metadata caches are configured with the unified `meta.cache.*` properties. +Starting from Doris 4.1.x, Hudi-related metadata caches are configured with the unified `meta.cache.*` properties. This section focuses on **how to use** and **how to observe** the Hudi cache modules. -For the unified property semantics, see: [Unified External Meta Cache (4.0.4+)](../meta-cache/unified-meta-cache.md). +For the unified property semantics, see: [Unified External Meta Cache (4.1.x+)](../meta-cache/unified-meta-cache.md). -### Cache Modules {#meta-cache-404-modules} +### Cache Modules {#meta-cache-unified-modules} | Module | Property key prefix | Cached content (typical) | |---|---|---| @@ -74,7 +74,7 @@ ALTER CATALOG hudi_ctl SET PROPERTIES ( ); ``` -### Observability {#meta-cache-404-observability} +### Observability {#meta-cache-unified-observability} Hudi cache metrics are available in `information_schema.catalog_meta_cache_statistics`. For the table definition and metric meanings, see: [catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md). diff --git a/docs/lakehouse/catalogs/iceberg-catalog.mdx b/docs/lakehouse/catalogs/iceberg-catalog.mdx index da090521381bf..f30c4a06899d7 100644 --- a/docs/lakehouse/catalogs/iceberg-catalog.mdx +++ b/docs/lakehouse/catalogs/iceberg-catalog.mdx @@ -85,14 +85,14 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( The CommonProperties section is for entering general properties. See the [Catalog Overview](../catalog-overview.md) for details on common properties. 
-## Metadata Cache (4.0.4+) {#meta-cache-404} +## Metadata Cache (4.1.x+) {#meta-cache-unified} -Starting from Doris 4.0.4, Iceberg Catalog metadata caches are configured with the unified `meta.cache.*` properties. +Starting from Doris 4.1.x, Iceberg Catalog metadata caches are configured with the unified `meta.cache.*` properties. This section focuses on **how to use** and **how to observe** the Iceberg-related cache modules. -For the unified property semantics, see: [Unified External Meta Cache (4.0.4+)](../meta-cache/unified-meta-cache.md). +For the unified property semantics, see: [Unified External Meta Cache (4.1.x+)](../meta-cache/unified-meta-cache.md). -### Cache Modules {#meta-cache-404-modules} +### Cache Modules {#meta-cache-unified-modules} | Module | Property key prefix | Cached content (typical) | |---|---|---| @@ -107,7 +107,7 @@ ALTER CATALOG iceberg_ctl SET PROPERTIES ( ); ``` -### Observability {#meta-cache-404-observability} +### Observability {#meta-cache-unified-observability} Iceberg cache metrics are available in `information_schema.catalog_meta_cache_statistics`. For the table definition and metric meanings, see: [catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md). diff --git a/docs/lakehouse/catalogs/maxcompute-catalog.md b/docs/lakehouse/catalogs/maxcompute-catalog.md index cf22afa108dbd..2af924d84f258 100644 --- a/docs/lakehouse/catalogs/maxcompute-catalog.md +++ b/docs/lakehouse/catalogs/maxcompute-catalog.md @@ -111,14 +111,14 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( The CommonProperties section is used to fill in common properties. Please refer to the "Common Properties" section in [Catalog Overview](../catalog-overview.md). -## Metadata Cache (4.0.4+) {#meta-cache-404} +## Metadata Cache (4.1.x+) {#meta-cache-unified} -Starting from Doris 4.0.4, MaxCompute Catalog metadata caches are configured with the unified `meta.cache.*` properties. 
+Starting from Doris 4.1.x, MaxCompute Catalog metadata caches are configured with the unified `meta.cache.*` properties. This section focuses on **how to use** and **how to observe** the MaxCompute-related cache module. -For the unified property semantics, see: [Unified External Meta Cache (4.0.4+)](../meta-cache/unified-meta-cache.md). +For the unified property semantics, see: [Unified External Meta Cache (4.1.x+)](../meta-cache/unified-meta-cache.md). -### Cache Modules {#meta-cache-404-modules} +### Cache Modules {#meta-cache-unified-modules} | Module | Property key prefix | Cached content (typical) | |---|---|---| @@ -133,7 +133,7 @@ ALTER CATALOG mc_ctl SET PROPERTIES ( ); ``` -### Observability {#meta-cache-404-observability} +### Observability {#meta-cache-unified-observability} MaxCompute cache metrics are available in `information_schema.catalog_meta_cache_statistics`. For the table definition and metric meanings, see: [catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md). diff --git a/docs/lakehouse/catalogs/paimon-catalog.mdx b/docs/lakehouse/catalogs/paimon-catalog.mdx index 0a6a183a40bdd..6dc6f4acca5a1 100644 --- a/docs/lakehouse/catalogs/paimon-catalog.mdx +++ b/docs/lakehouse/catalogs/paimon-catalog.mdx @@ -90,14 +90,14 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( The CommonProperties section is used to fill in common properties. Please refer to the [Catalog Overview](../catalog-overview.md) section on [Common Properties]. -## Metadata Cache (4.0.4+) {#meta-cache-404} +## Metadata Cache (4.1.x+) {#meta-cache-unified} -Starting from Doris 4.0.4, Paimon Catalog metadata caches are configured with the unified `meta.cache.*` properties. +Starting from Doris 4.1.x, Paimon Catalog metadata caches are configured with the unified `meta.cache.*` properties. This section focuses on **how to use** and **how to observe** the Paimon-related cache modules. 
-For the unified property semantics, see: [Unified External Meta Cache (4.0.4+)](../meta-cache/unified-meta-cache.md). +For the unified property semantics, see: [Unified External Meta Cache (4.1.x+)](../meta-cache/unified-meta-cache.md). -### Cache Modules {#meta-cache-404-modules} +### Cache Modules {#meta-cache-unified-modules} | Module | Property key prefix | Cached content (typical) | |---|---|---| @@ -111,7 +111,7 @@ ALTER CATALOG paimon_ctl SET PROPERTIES ( ); ``` -### Observability {#meta-cache-404-observability} +### Observability {#meta-cache-unified-observability} Paimon cache metrics are available in `information_schema.catalog_meta_cache_statistics`. For the table definition and metric meanings, see: [catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md). diff --git a/docs/lakehouse/meta-cache.md b/docs/lakehouse/meta-cache.md index def91a4f57826..a16c738974a5a 100644 --- a/docs/lakehouse/meta-cache.md +++ b/docs/lakehouse/meta-cache.md @@ -19,8 +19,8 @@ This document applies to versions after 2.1.6. ::: :::note -For Doris 4.0.4 and later, external meta cache has been refactored with unified configuration keys `meta.cache.*`. -See [Unified External Meta Cache (4.0.4+)](./meta-cache/unified-meta-cache.md). +For Doris 4.1.x and later, external meta cache has been refactored with unified configuration keys `meta.cache.*`. +See [Unified External Meta Cache (4.1.x+)](./meta-cache/unified-meta-cache.md). ::: ## Cache Strategies @@ -327,9 +327,9 @@ This section mainly introduces the cache behavior that users may be concerned ab For all types of External Catalogs, if you want to see the latest Table Schema in real time, you can disable the Schema Cache: :::note -Starting from Doris 4.0.4, the legacy catalog-level cache property `schema.cache.ttl-second` is deprecated. 
-For 4.0.4+, keep using the FE config method below, and refer to: -[Unified External Meta Cache (4.0.4+)](./meta-cache/unified-meta-cache.md). +Starting from Doris 4.1.x, the legacy catalog-level cache property `schema.cache.ttl-second` is deprecated. +For 4.1.x+, keep using the FE config method below, and refer to: +[Unified External Meta Cache (4.1.x+)](./meta-cache/unified-meta-cache.md). ::: - Disable globally @@ -353,10 +353,10 @@ After setting, Doris will see the latest Table Schema in real time. However, thi For Hive Catalog, if you want to disable the cache to query real-time updated data, you can configure the following parameters: :::note -Starting from Doris 4.0.4, the legacy catalog-level properties `file.meta.cache.ttl-second` and `partition.cache.ttl-second` +Starting from Doris 4.1.x, the legacy catalog-level properties `file.meta.cache.ttl-second` and `partition.cache.ttl-second` are deprecated. Use unified `meta.cache.hive.*` properties instead. See: -[Hive Catalog](./catalogs/hive-catalog.mdx#meta-cache-404) and -[Unified External Meta Cache (4.0.4+)](./meta-cache/unified-meta-cache.md). +[Hive Catalog](./catalogs/hive-catalog.mdx#meta-cache-unified) and +[Unified External Meta Cache (4.1.x+)](./meta-cache/unified-meta-cache.md). ::: - Disable globally diff --git a/docs/lakehouse/meta-cache/unified-meta-cache.md b/docs/lakehouse/meta-cache/unified-meta-cache.md index 32a7579f539f2..3bc23173c44fe 100644 --- a/docs/lakehouse/meta-cache/unified-meta-cache.md +++ b/docs/lakehouse/meta-cache/unified-meta-cache.md @@ -1,12 +1,12 @@ --- { - "title": "Unified External Meta Cache (4.0.4+)", + "title": "Unified External Meta Cache (4.1.x+)", "language": "en", "description": "User guide for unified external metadata cache: unified meta.cache.* properties, what is cached, and where to configure per catalog." } --- -Starting from **Doris 4.0.4**, external metadata caching is unified for major External Catalog engines. 
As a user, you only need to know: +Starting from **Doris 4.1.x**, external metadata caching is unified for major External Catalog engines. As a user, you only need to know: | You want to know | Where in docs | |---|---| @@ -15,7 +15,7 @@ Starting from **Doris 4.0.4**, external metadata caching is unified for major Ex | How to observe | `information_schema.catalog_meta_cache_statistics` (see the observability section below). | :::tip -Applies to Doris 4.0.4 and later. +Applies to Doris 4.1.x and later. ::: ## Unified Property Model @@ -55,11 +55,11 @@ For each catalog engine, the supported cache modules and the recommended propert | Catalog engine | Where to configure module caches | |---|---| -| Hive | [Hive Catalog](../catalogs/hive-catalog.mdx#meta-cache-404) | -| Iceberg | [Iceberg Catalog](../catalogs/iceberg-catalog.mdx#meta-cache-404) | -| Paimon | [Paimon Catalog](../catalogs/paimon-catalog.mdx#meta-cache-404) | -| Hudi | [Hudi Catalog](../catalogs/hudi-catalog.md#meta-cache-404) | -| MaxCompute | [MaxCompute Catalog](../catalogs/maxcompute-catalog.md#meta-cache-404) | +| Hive | [Hive Catalog](../catalogs/hive-catalog.mdx#meta-cache-unified) | +| Iceberg | [Iceberg Catalog](../catalogs/iceberg-catalog.mdx#meta-cache-unified) | +| Paimon | [Paimon Catalog](../catalogs/paimon-catalog.mdx#meta-cache-unified) | +| Hudi | [Hudi Catalog](../catalogs/hudi-catalog.md#meta-cache-unified) | +| MaxCompute | [MaxCompute Catalog](../catalogs/maxcompute-catalog.md#meta-cache-unified) | ## Observability @@ -81,4 +81,4 @@ Naming convention: ## Migration Note (Legacy Properties) -Starting from Doris 4.0.4, legacy catalog cache properties (for example, `schema.cache.ttl-second`, `file.meta.cache.ttl-second`) are deprecated. Use `meta.cache.*` properties instead and follow the catalog-specific pages above. +Starting from Doris 4.1.x, legacy catalog cache properties (for example, `schema.cache.ttl-second`, `file.meta.cache.ttl-second`) are deprecated. 
Use `meta.cache.*` properties instead and follow the catalog-specific pages above. diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/hive-catalog.mdx b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/hive-catalog.mdx index ce1e582e851c3..ab3cc43906673 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/hive-catalog.mdx +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/hive-catalog.mdx @@ -78,13 +78,13 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( CommonProperties 部分用于填写通用属性。请参阅[ 数据目录概述 ](../catalog-overview.md)中【通用属性】部分。 -## 元数据缓存(4.0.4+) {#meta-cache-404} +## 元数据缓存(4.1.x+) {#meta-cache-unified} -从 Doris 4.0.4 开始,Hive Catalog 的外表元数据缓存使用统一键 `meta.cache.*` 进行配置。本节只介绍**如何使用**与**如何观测**。 +从 Doris 4.1.x 开始,Hive Catalog 的外表元数据缓存使用统一键 `meta.cache.*` 进行配置。本节只介绍**如何使用**与**如何观测**。 -统一属性语义可参阅:[统一外表元数据缓存(4.0.4+)](../meta-cache/unified-meta-cache.md)。 +统一属性语义可参阅:[统一外表元数据缓存(4.1.x+)](../meta-cache/unified-meta-cache.md)。 -### 缓存模块 {#meta-cache-404-modules} +### 缓存模块 {#meta-cache-unified-modules} | 模块 | 属性键前缀 | 典型缓存内容 | |---|---|---| @@ -100,7 +100,7 @@ ALTER CATALOG hive_ctl SET PROPERTIES ( ); ``` -### 可观测性 {#meta-cache-404-observability} +### 可观测性 {#meta-cache-unified-observability} Hive 缓存指标可通过 `information_schema.catalog_meta_cache_statistics` 查询。 系统表字段与指标说明见:[catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md)。 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/hudi-catalog.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/hudi-catalog.md index 0e52dfcdca59b..51eddfb08dc86 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/hudi-catalog.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/hudi-catalog.md @@ -51,13 +51,13 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( | 
------------------------------- | -------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | ----- | | `hudi.use_hive_sync_partition` | `use_hive_sync_partition` | 是否使用 Hive Metastore 已同步的分区信息。如果为 true,则会直接从 Hive Metastore 中获取分区信息。否则,会从文件系统的元数据文件中获取分区信息。通过 Hive Metastore 获取信息性能更好,但需要用户保证最新的元数据已经同步到了 Hive Metastore。 | false | -## 元数据缓存(4.0.4+) {#meta-cache-404} +## 元数据缓存(4.1.x+) {#meta-cache-unified} -从 Doris 4.0.4 开始,Hudi 相关外表元数据缓存使用统一键 `meta.cache.*` 进行配置。本节只介绍**如何使用**与**如何观测**。 +从 Doris 4.1.x 开始,Hudi 相关外表元数据缓存使用统一键 `meta.cache.*` 进行配置。本节只介绍**如何使用**与**如何观测**。 -统一属性语义可参阅:[统一外表元数据缓存(4.0.4+)](../meta-cache/unified-meta-cache.md)。 +统一属性语义可参阅:[统一外表元数据缓存(4.1.x+)](../meta-cache/unified-meta-cache.md)。 -### 缓存模块 {#meta-cache-404-modules} +### 缓存模块 {#meta-cache-unified-modules} | 模块 | 属性键前缀 | 典型缓存内容 | |---|---|---| @@ -73,7 +73,7 @@ ALTER CATALOG hudi_ctl SET PROPERTIES ( ); ``` -### 可观测性 {#meta-cache-404-observability} +### 可观测性 {#meta-cache-unified-observability} Hudi 缓存指标可通过 `information_schema.catalog_meta_cache_statistics` 查询。 系统表字段与指标说明见:[catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md)。 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/iceberg-catalog.mdx b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/iceberg-catalog.mdx index 743b88306ed2d..142e6e7104c10 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/iceberg-catalog.mdx +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/iceberg-catalog.mdx @@ -87,13 +87,13 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( CommonProperties 部分用于填写通用属性。请参阅[数据目录概述](../catalog-overview.md)中【通用属性】部分。 -## 元数据缓存(4.0.4+) {#meta-cache-404} +## 元数据缓存(4.1.x+) {#meta-cache-unified} -从 Doris 4.0.4 开始,Iceberg Catalog 的外表元数据缓存使用统一键 
`meta.cache.*` 进行配置。本节只介绍**如何使用**与**如何观测**。 +从 Doris 4.1.x 开始,Iceberg Catalog 的外表元数据缓存使用统一键 `meta.cache.*` 进行配置。本节只介绍**如何使用**与**如何观测**。 -统一属性语义可参阅:[统一外表元数据缓存(4.0.4+)](../meta-cache/unified-meta-cache.md)。 +统一属性语义可参阅:[统一外表元数据缓存(4.1.x+)](../meta-cache/unified-meta-cache.md)。 -### 缓存模块 {#meta-cache-404-modules} +### 缓存模块 {#meta-cache-unified-modules} | 模块 | 属性键前缀 | 典型缓存内容 | |---|---|---| @@ -108,7 +108,7 @@ ALTER CATALOG iceberg_ctl SET PROPERTIES ( ); ``` -### 可观测性 {#meta-cache-404-observability} +### 可观测性 {#meta-cache-unified-observability} Iceberg 缓存指标可通过 `information_schema.catalog_meta_cache_statistics` 查询。 系统表字段与指标说明见:[catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md)。 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/maxcompute-catalog.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/maxcompute-catalog.md index e9f1436005945..418825065f6e7 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/maxcompute-catalog.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/maxcompute-catalog.md @@ -111,13 +111,13 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( CommonProperties 部分用于填写通用属性。请参阅[数据目录概述](../catalog-overview.md)中「通用属性」部分。 -## 元数据缓存(4.0.4+) {#meta-cache-404} +## 元数据缓存(4.1.x+) {#meta-cache-unified} -从 Doris 4.0.4 开始,MaxCompute Catalog 的外表元数据缓存使用统一键 `meta.cache.*` 进行配置。本节只介绍**如何使用**与**如何观测**。 +从 Doris 4.1.x 开始,MaxCompute Catalog 的外表元数据缓存使用统一键 `meta.cache.*` 进行配置。本节只介绍**如何使用**与**如何观测**。 -统一属性语义可参阅:[统一外表元数据缓存(4.0.4+)](../meta-cache/unified-meta-cache.md)。 +统一属性语义可参阅:[统一外表元数据缓存(4.1.x+)](../meta-cache/unified-meta-cache.md)。 -### 缓存模块 {#meta-cache-404-modules} +### 缓存模块 {#meta-cache-unified-modules} | 模块 | 属性键前缀 | 典型缓存内容 | |---|---|---| @@ -132,7 +132,7 @@ ALTER CATALOG mc_ctl SET PROPERTIES ( ); ``` -### 可观测性 {#meta-cache-404-observability} +### 可观测性 
{#meta-cache-unified-observability} MaxCompute 缓存指标可通过 `information_schema.catalog_meta_cache_statistics` 查询。 系统表字段与指标说明见:[catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md)。 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/paimon-catalog.mdx b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/paimon-catalog.mdx index 01ce87bf4f65b..b210df53b32c4 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/paimon-catalog.mdx +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/paimon-catalog.mdx @@ -90,13 +90,13 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( CommonProperties 部分用于填写通用属性。请参阅[数据目录概述](../catalog-overview.md)中【通用属性】部分。 -## 元数据缓存(4.0.4+) {#meta-cache-404} +## 元数据缓存(4.1.x+) {#meta-cache-unified} -从 Doris 4.0.4 开始,Paimon Catalog 的外表元数据缓存使用统一键 `meta.cache.*` 进行配置。本节只介绍**如何使用**与**如何观测**。 +从 Doris 4.1.x 开始,Paimon Catalog 的外表元数据缓存使用统一键 `meta.cache.*` 进行配置。本节只介绍**如何使用**与**如何观测**。 -统一属性语义可参阅:[统一外表元数据缓存(4.0.4+)](../meta-cache/unified-meta-cache.md)。 +统一属性语义可参阅:[统一外表元数据缓存(4.1.x+)](../meta-cache/unified-meta-cache.md)。 -### 缓存模块 {#meta-cache-404-modules} +### 缓存模块 {#meta-cache-unified-modules} | 模块 | 属性键前缀 | 典型缓存内容 | |---|---|---| @@ -110,7 +110,7 @@ ALTER CATALOG paimon_ctl SET PROPERTIES ( ); ``` -### 可观测性 {#meta-cache-404-observability} +### 可观测性 {#meta-cache-unified-observability} Paimon 缓存指标可通过 `information_schema.catalog_meta_cache_statistics` 查询。 系统表字段与指标说明见:[catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md)。 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/meta-cache.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/meta-cache.md index 0cb9d3664ce65..26c6cf5dcbed2 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/meta-cache.md +++ 
b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/meta-cache.md @@ -19,8 +19,8 @@ ::: :::note -对于 Doris 4.0.4 及之后版本,外表元数据缓存已重构并使用统一配置键 `meta.cache.*`。 -请参阅[统一外表元数据缓存(4.0.4+)](./meta-cache/unified-meta-cache.md)。 +对于 Doris 4.1.x 及之后版本,外表元数据缓存已重构并使用统一配置键 `meta.cache.*`。 +请参阅[统一外表元数据缓存(4.1.x+)](./meta-cache/unified-meta-cache.md)。 ::: ## 缓存策略 @@ -327,9 +327,9 @@ CREATE CATALOG hive PROPERTIES ( 对于所有类型的 External Catalog,如果希望实时可见最新的 Table Schema,可以关闭 Schema Cache: :::note -从 Doris 4.0.4 开始,旧的 catalog 级缓存参数 `schema.cache.ttl-second` 已不再推荐使用。 -对于 4.0.4+,仍可使用下面的 FE 配置方式进行全局控制,并参考: -[统一外表元数据缓存(4.0.4+)](./meta-cache/unified-meta-cache.md)。 +从 Doris 4.1.x 开始,旧的 catalog 级缓存参数 `schema.cache.ttl-second` 已不再推荐使用。 +对于 4.1.x+,仍可使用下面的 FE 配置方式进行全局控制,并参考: +[统一外表元数据缓存(4.1.x+)](./meta-cache/unified-meta-cache.md)。 ::: - 全局关闭 @@ -353,10 +353,10 @@ CREATE CATALOG hive PROPERTIES ( 针对 Hive Catalog,如果想关闭缓存来查询到实时更新的数据,可以配置以下参数: :::note -从 Doris 4.0.4 开始,旧的 catalog 级参数 `file.meta.cache.ttl-second` 和 `partition.cache.ttl-second` +从 Doris 4.1.x 开始,旧的 catalog 级参数 `file.meta.cache.ttl-second` 和 `partition.cache.ttl-second` 已不再推荐使用。请改用统一键 `meta.cache.hive.*`,并参考: -[Hive Catalog](./catalogs/hive-catalog.mdx#meta-cache-404) 与 -[统一外表元数据缓存(4.0.4+)](./meta-cache/unified-meta-cache.md)。 +[Hive Catalog](./catalogs/hive-catalog.mdx#meta-cache-unified) 与 +[统一外表元数据缓存(4.1.x+)](./meta-cache/unified-meta-cache.md)。 ::: - 全局关闭 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/meta-cache/unified-meta-cache.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/meta-cache/unified-meta-cache.md index e47a53f213732..94dfc574c287b 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/meta-cache/unified-meta-cache.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/meta-cache/unified-meta-cache.md @@ -1,12 +1,12 @@ --- { - "title": "统一外表元数据缓存(4.0.4+)", + "title": "统一外表元数据缓存(4.1.x+)", "language": "zh-CN", "description": 
"面向用户的统一外表元数据缓存使用说明:统一配置键 meta.cache.*、缓存覆盖范围、以及各类 Catalog 的配置入口。" } --- -从 **Doris 4.0.4** 开始,External Catalog 的外表元数据缓存能力进行了统一化重构。对用户来说,主要关注三件事: +从 **Doris 4.1.x** 开始,External Catalog 的外表元数据缓存能力进行了统一化重构。对用户来说,主要关注三件事: | 你需要关心的问题 | 对应入口 | |---|---| @@ -15,7 +15,7 @@ | 如何观测 | 通过 `information_schema.catalog_meta_cache_statistics` 查看指标(见本文观测章节)。 | :::tip -适用于 Doris 4.0.4 及之后版本。 +适用于 Doris 4.1.x 及之后版本。 ::: ## 统一属性模型 @@ -55,11 +55,11 @@ ALTER CATALOG hive_ctl SET PROPERTIES ( | Catalog 引擎 | module 缓存配置与可观测性 | |---|---| -| Hive | [Hive Catalog](../catalogs/hive-catalog.mdx#meta-cache-404) | -| Iceberg | [Iceberg Catalog](../catalogs/iceberg-catalog.mdx#meta-cache-404) | -| Paimon | [Paimon Catalog](../catalogs/paimon-catalog.mdx#meta-cache-404) | -| Hudi | [Hudi Catalog](../catalogs/hudi-catalog.md#meta-cache-404) | -| MaxCompute | [MaxCompute Catalog](../catalogs/maxcompute-catalog.md#meta-cache-404) | +| Hive | [Hive Catalog](../catalogs/hive-catalog.mdx#meta-cache-unified) | +| Iceberg | [Iceberg Catalog](../catalogs/iceberg-catalog.mdx#meta-cache-unified) | +| Paimon | [Paimon Catalog](../catalogs/paimon-catalog.mdx#meta-cache-unified) | +| Hudi | [Hudi Catalog](../catalogs/hudi-catalog.md#meta-cache-unified) | +| MaxCompute | [MaxCompute Catalog](../catalogs/maxcompute-catalog.md#meta-cache-unified) | ## 观测方式 @@ -82,4 +82,4 @@ ORDER BY catalog_name, cache_name, metric_name; ## 旧参数迁移说明 -从 Doris 4.0.4 开始,旧版 catalog cache 参数(例如 `schema.cache.ttl-second`、`file.meta.cache.ttl-second`)已不再推荐使用。请改用 `meta.cache.*` 统一键,并参考上文对应的 catalog 文档。 +从 Doris 4.1.x 开始,旧版 catalog cache 参数(例如 `schema.cache.ttl-second`、`file.meta.cache.ttl-second`)已不再推荐使用。请改用 `meta.cache.*` 统一键,并参考上文对应的 catalog 文档。 From f40930e336cff6c22342c514c47db04041e332d8 Mon Sep 17 00:00:00 2001 From: Socrates Date: Tue, 10 Mar 2026 15:53:18 +0800 Subject: [PATCH 3/8] docs: align unified external meta cache docs --- .../catalog_meta_cache_statistics.md | 72 ++++++++++--------- docs/lakehouse/catalogs/hive-catalog.mdx 
| 34 +++++---- docs/lakehouse/catalogs/hudi-catalog.md | 35 +++++---- docs/lakehouse/catalogs/iceberg-catalog.mdx | 32 ++++++--- docs/lakehouse/catalogs/maxcompute-catalog.md | 32 +++++---- docs/lakehouse/catalogs/paimon-catalog.mdx | 26 ++++--- docs/lakehouse/meta-cache.md | 7 ++ .../meta-cache/unified-meta-cache.md | 46 +++++++++--- .../catalog_meta_cache_statistics.md | 64 +++++++++-------- .../lakehouse/catalogs/hive-catalog.mdx | 35 +++++---- .../lakehouse/catalogs/hudi-catalog.md | 38 ++++++---- .../lakehouse/catalogs/iceberg-catalog.mdx | 35 ++++++--- .../lakehouse/catalogs/maxcompute-catalog.md | 35 ++++----- .../lakehouse/catalogs/paimon-catalog.mdx | 29 +++++--- .../current/lakehouse/meta-cache.md | 7 ++ .../meta-cache/unified-meta-cache.md | 47 +++++++++--- 16 files changed, 373 insertions(+), 201 deletions(-) diff --git a/docs/admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md b/docs/admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md index 53b2af25b4358..69d9388eb079b 100644 --- a/docs/admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md +++ b/docs/admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md @@ -18,46 +18,48 @@ View the metadata cache information of the External Catalog in the currently con ## Table Information -| Column Name | Type | Description | -| ------------ | ---- | ----------------------- | -| CATALOG_NAME | text | The name of the Catalog | -| CACHE_NAME | text | The name of the cache | -| METRIC_NAME | text | The name of the metric | -| METRIC_VALUE | text | The value of the metric | +One row represents one cache entry on one FE for one external catalog. 
+ +| Column Name | Type | Description | +| ------------ | ---- | ----------- | +| FE_HOST | text | FE host that reports the stats | +| CATALOG_NAME | text | Catalog name | +| ENGINE_NAME | text | Meta cache engine name, such as `hive`, `iceberg`, `paimon` | +| ENTRY_NAME | text | Cache entry name inside the engine, such as `schema`, `file`, `manifest` | +| EFFECTIVE_ENABLED | boolean | Whether the cache is effectively enabled after evaluating `enable` / `ttl-second` / `capacity` | +| CONFIG_ENABLED | boolean | Raw `enable` flag from the cache config | +| AUTO_REFRESH | boolean | Whether async refresh-after-write is enabled for this entry | +| TTL_SECOND | bigint | TTL in seconds. `0` means disabled; `-1` means no expiration | +| CAPACITY | bigint | Max entry count | +| ESTIMATED_SIZE | bigint | Estimated current cache size | +| REQUEST_COUNT | bigint | Total requests | +| HIT_COUNT | bigint | Cache hits | +| MISS_COUNT | bigint | Cache misses | +| HIT_RATE | double | Hit rate | +| LOAD_SUCCESS_COUNT | bigint | Successful loads | +| LOAD_FAILURE_COUNT | bigint | Failed loads | +| TOTAL_LOAD_TIME_MS | bigint | Total load time in milliseconds | +| AVG_LOAD_PENALTY_MS | double | Average load time in milliseconds | +| EVICTION_COUNT | bigint | Evicted entries | +| INVALIDATE_COUNT | bigint | Explicit invalidations | +| LAST_LOAD_SUCCESS_TIME | text | Last successful load time | +| LAST_LOAD_FAILURE_TIME | text | Last failed load time | +| LAST_ERROR | text | Latest load error message | ## Usage Example -```text -+----------------------+-----------------------------+----------------------+---------------------+ -| CATALOG_NAME | CACHE_NAME | METRIC_NAME | METRIC_VALUE | -+----------------------+-----------------------------+----------------------+---------------------+ -| hive_iceberg_minio | iceberg_table_cache | eviction_count | 0 | -| hive_iceberg_minio | iceberg_table_cache | hit_ratio | 0.8235294117647058 | -| hive_iceberg_minio | iceberg_table_cache | 
average_load_penalty | 5.480102048333334E8 | -| hive_iceberg_minio | iceberg_table_cache | estimated_size | 6 | -| hive_iceberg_minio | iceberg_table_cache | hit_count | 28 | -| hive_iceberg_minio | iceberg_table_cache | read_count | 34 | -| hive_iceberg_minio | iceberg_snapshot_list_cache | eviction_count | 0 | -| hive_iceberg_minio | iceberg_snapshot_list_cache | hit_ratio | 1.0 | -| hive_iceberg_minio | iceberg_snapshot_list_cache | average_load_penalty | 0.0 | -| hive_iceberg_minio | iceberg_snapshot_list_cache | estimated_size | 0 | -| hive_iceberg_minio | iceberg_snapshot_list_cache | hit_count | 0 | -| hive_iceberg_minio | iceberg_snapshot_list_cache | read_count | 0 | -| hive_iceberg_minio | iceberg_snapshot_cache | eviction_count | 0 | -| hive_iceberg_minio | iceberg_snapshot_cache | hit_ratio | 0.45454545454545453 | -| hive_iceberg_minio | iceberg_snapshot_cache | average_load_penalty | 5.604907246666666E8 | -| hive_iceberg_minio | iceberg_snapshot_cache | estimated_size | 6 | -| hive_iceberg_minio | iceberg_snapshot_cache | hit_count | 5 | -| hive_iceberg_minio | iceberg_snapshot_cache | read_count | 11 | +```sql +SELECT catalog_name, engine_name, entry_name, + effective_enabled, ttl_second, capacity, + estimated_size, hit_rate, last_error +FROM information_schema.catalog_meta_cache_statistics +ORDER BY catalog_name, engine_name, entry_name; ``` -The METRIC_NAME column contains the following Caffeine cache performance metrics: -- eviction_count: The number of entries that have been evicted from the cache -- hit_ratio: The ratio of cache requests which were hits (ranges from 0.0 to 1.0) -- average_load_penalty: The average time spent loading new values (in nanoseconds) -- estimated_size: The approximate number of entries in the cache -- hit_count: The number of times cache lookup methods have returned a cached value -- read_count: The total number of times cache lookup methods have been called +Typical usage: +- Use `ENGINE_NAME` + `ENTRY_NAME` to 
identify one logical cache entry. +- Use `EFFECTIVE_ENABLED`, `TTL_SECOND`, and `CAPACITY` to confirm the applied cache policy. +- Use `HIT_RATE`, `ESTIMATED_SIZE`, `LOAD_FAILURE_COUNT`, and `LAST_ERROR` to diagnose behavior. diff --git a/docs/lakehouse/catalogs/hive-catalog.mdx b/docs/lakehouse/catalogs/hive-catalog.mdx index f4224aa49a74b..3e17dde7901d0 100644 --- a/docs/lakehouse/catalogs/hive-catalog.mdx +++ b/docs/lakehouse/catalogs/hive-catalog.mdx @@ -79,7 +79,7 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( ## Metadata Cache (4.1.x+) {#meta-cache-unified} Starting from Doris 4.1.x, Hive Catalog metadata caches are configured with the unified `meta.cache.*` properties. -This section focuses on **how to use** and **how to observe** the Hive-related cache modules. +This section covers configuration and observability for Hive-related cache modules. For the unified property semantics, see: [Unified External Meta Cache (4.1.x+)](../meta-cache/unified-meta-cache.md). @@ -87,11 +87,18 @@ For the unified property semantics, see: [Unified External Meta Cache (4.1.x+)]( | Module | Property key prefix | Cached content (typical) | |---|---|---| -| `partition-values` | `meta.cache.hive.partition-values.` | Partition values/names list used by partition pruning and partition enumeration. | +| `schema` | `meta.cache.hive.schema.` | Schema cache entry for table schema loading. | +| `partition_values` | `meta.cache.hive.partition_values.` | Partition values/names list used by partition pruning and partition enumeration. | | `partition` | `meta.cache.hive.partition.` | Partition properties (location, input format, storage descriptor, etc.). | | `file` | `meta.cache.hive.file.` | File listing under partition/table paths (reduces remote LIST overhead). | -Example (disable file listing cache for freshness): +Notes: + +- Property keys use the module names shown above. The same names appear as `ENTRY_NAME` in `information_schema.catalog_meta_cache_statistics`. 
+- Changing legacy properties such as `file.meta.cache.ttl-second` and `partition.cache.ttl-second` can trigger Hive cache rebuild behavior. +- Changing unified `meta.cache.hive.*` properties on an already-initialized catalog does not fully hot-reload existing Hive cache entries in current releases. To guarantee that a new cache spec is applied, recreate the catalog or restart FE. + +Example: ```sql ALTER CATALOG hive_ctl SET PROPERTIES ( @@ -104,22 +111,25 @@ ALTER CATALOG hive_ctl SET PROPERTIES ( Hive cache metrics are available in `information_schema.catalog_meta_cache_statistics`. For the table definition and metric meanings, see: [catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md). -The `cache_name` values for Hive modules are: +Common Hive entries: -| Module | cache_name | +| Entry | Meaning | |---|---| -| `partition-values` | `hive_partition_values_cache` | -| `partition` | `hive_partition_cache` | -| `file` | `hive_file_cache` | +| `schema` | Schema cache entry | +| `partition_values` | Partition names / values cache entry | +| `partition` | Partition property cache entry | +| `file` | File listing cache entry | -Example query (filter one catalog and Hive caches): +Example query: ```sql -SELECT * +SELECT catalog_name, engine_name, entry_name, + effective_enabled, ttl_second, capacity, + estimated_size, hit_rate, load_failure_count, last_error FROM information_schema.catalog_meta_cache_statistics WHERE catalog_name = 'hive_ctl' - AND cache_name LIKE 'hive_%' -ORDER BY cache_name, metric_name; + AND engine_name = 'hive' +ORDER BY entry_name; ``` ### Supported Hive Versions diff --git a/docs/lakehouse/catalogs/hudi-catalog.md b/docs/lakehouse/catalogs/hudi-catalog.md index 1cf6645663971..eaf5046e2734f 100644 --- a/docs/lakehouse/catalogs/hudi-catalog.md +++ b/docs/lakehouse/catalogs/hudi-catalog.md @@ -54,7 +54,7 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( ## Metadata Cache 
(4.1.x+) {#meta-cache-unified} Starting from Doris 4.1.x, Hudi-related metadata caches are configured with the unified `meta.cache.*` properties. -This section focuses on **how to use** and **how to observe** the Hudi cache modules. +This section covers configuration and observability for Hudi-related cache modules. For the unified property semantics, see: [Unified External Meta Cache (4.1.x+)](../meta-cache/unified-meta-cache.md). @@ -62,15 +62,21 @@ For the unified property semantics, see: [Unified External Meta Cache (4.1.x+)]( | Module | Property key prefix | Cached content (typical) | |---|---|---| +| `schema` | `meta.cache.hudi.schema.` | Schema cache entry for table schema loading. | | `partition` | `meta.cache.hudi.partition.` | Hudi partition-related metadata (used by partition discovery/pruning). | -| `fs-view` | `meta.cache.hudi.fs-view.` | Hudi filesystem view related metadata. | -| `meta-client` | `meta.cache.hudi.meta-client.` | Hudi meta client related metadata. | +| `fs_view` | `meta.cache.hudi.fs_view.` | Hudi filesystem view related metadata. | +| `meta_client` | `meta.cache.hudi.meta_client.` | Hudi meta client related metadata. | -Example (reduce cache footprint by lowering capacity): +Notes: + +- Property keys use the module names shown above. The same names appear as `ENTRY_NAME` in `information_schema.catalog_meta_cache_statistics`. +- When Hudi tables are accessed through an HMS catalog, configure `meta.cache.hudi.*` on that HMS catalog. + +Example: ```sql ALTER CATALOG hudi_ctl SET PROPERTIES ( - "meta.cache.hudi.partition.capacity" = "2000" + "meta.cache.hudi.fs_view.capacity" = "2000" ); ``` @@ -79,22 +85,25 @@ ALTER CATALOG hudi_ctl SET PROPERTIES ( Hudi cache metrics are available in `information_schema.catalog_meta_cache_statistics`. For the table definition and metric meanings, see: [catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md). 
-The `cache_name` values for Hudi modules are: +Common Hudi entries: -| Module | cache_name | +| Entry | Meaning | |---|---| -| `partition` | `hudi_partition_cache` | -| `fs-view` | `hudi_fs_view_cache` | -| `meta-client` | `hudi_meta_client_cache` | +| `schema` | Schema cache entry | +| `partition` | Partition metadata cache entry | +| `fs_view` | File system view cache entry | +| `meta_client` | Meta client cache entry | Example query: ```sql -SELECT * +SELECT catalog_name, engine_name, entry_name, + effective_enabled, ttl_second, capacity, + estimated_size, hit_rate, load_failure_count, last_error FROM information_schema.catalog_meta_cache_statistics WHERE catalog_name = 'hudi_ctl' - AND cache_name LIKE 'hudi_%' -ORDER BY cache_name, metric_name; + AND engine_name = 'hudi' +ORDER BY entry_name; ``` ### Supported Hudi Versions diff --git a/docs/lakehouse/catalogs/iceberg-catalog.mdx b/docs/lakehouse/catalogs/iceberg-catalog.mdx index f30c4a06899d7..a54ec699b00b7 100644 --- a/docs/lakehouse/catalogs/iceberg-catalog.mdx +++ b/docs/lakehouse/catalogs/iceberg-catalog.mdx @@ -88,7 +88,7 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( ## Metadata Cache (4.1.x+) {#meta-cache-unified} Starting from Doris 4.1.x, Iceberg Catalog metadata caches are configured with the unified `meta.cache.*` properties. -This section focuses on **how to use** and **how to observe** the Iceberg-related cache modules. +This section covers configuration and observability for Iceberg-related cache modules. For the unified property semantics, see: [Unified External Meta Cache (4.1.x+)](../meta-cache/unified-meta-cache.md). @@ -96,13 +96,23 @@ For the unified property semantics, see: [Unified External Meta Cache (4.1.x+)]( | Module | Property key prefix | Cached content (typical) | |---|---|---| +| `schema` | `meta.cache.iceberg.schema.` | Schema cache entry for table schema loading. 
| | `table` | `meta.cache.iceberg.table.` | Iceberg table metadata object (reduces catalog/metastore round trips). | +| `view` | `meta.cache.iceberg.view.` | Iceberg view metadata object. | | `manifest` | `meta.cache.iceberg.manifest.` | Manifest-related metadata (reduces repeated manifest access overhead). | -Example (shorter TTL for manifest to prioritize freshness): +Notes: + +- Property keys use the module names shown above. The same names appear as `ENTRY_NAME` in `information_schema.catalog_meta_cache_statistics`. +- `manifest` is disabled by default in the current implementation. Enable it explicitly before tuning TTL/capacity. +- `view` entries are only populated when Doris accesses Iceberg views. +- `ALTER CATALOG ... SET PROPERTIES` updates are applied through the unified hot-reload path. + +Example: ```sql ALTER CATALOG iceberg_ctl SET PROPERTIES ( + "meta.cache.iceberg.manifest.enable" = "true", "meta.cache.iceberg.manifest.ttl-second" = "600" ); ``` @@ -112,21 +122,25 @@ ALTER CATALOG iceberg_ctl SET PROPERTIES ( Iceberg cache metrics are available in `information_schema.catalog_meta_cache_statistics`. For the table definition and metric meanings, see: [catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md). 
-The `cache_name` values for Iceberg modules are: +Common Iceberg entries: -| Module | cache_name | +| Entry | Meaning | |---|---| -| `table` | `iceberg_table_cache` | -| `manifest` | `iceberg_manifest_cache` | +| `schema` | Schema cache entry | +| `table` | Table metadata cache entry | +| `view` | View metadata cache entry | +| `manifest` | Manifest payload cache entry | Example query: ```sql -SELECT * +SELECT catalog_name, engine_name, entry_name, + effective_enabled, ttl_second, capacity, + estimated_size, hit_rate, load_failure_count, last_error FROM information_schema.catalog_meta_cache_statistics WHERE catalog_name = 'iceberg_ctl' - AND cache_name LIKE 'iceberg_%' -ORDER BY cache_name, metric_name; + AND engine_name = 'iceberg' +ORDER BY entry_name; ``` ### Supported Iceberg Versions diff --git a/docs/lakehouse/catalogs/maxcompute-catalog.md b/docs/lakehouse/catalogs/maxcompute-catalog.md index 2af924d84f258..186f2cc41e67f 100644 --- a/docs/lakehouse/catalogs/maxcompute-catalog.md +++ b/docs/lakehouse/catalogs/maxcompute-catalog.md @@ -114,7 +114,7 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( ## Metadata Cache (4.1.x+) {#meta-cache-unified} Starting from Doris 4.1.x, MaxCompute Catalog metadata caches are configured with the unified `meta.cache.*` properties. -This section focuses on **how to use** and **how to observe** the MaxCompute-related cache module. +This section covers configuration and observability for MaxCompute-related cache modules. For the unified property semantics, see: [Unified External Meta Cache (4.1.x+)](../meta-cache/unified-meta-cache.md). @@ -122,36 +122,38 @@ For the unified property semantics, see: [Unified External Meta Cache (4.1.x+)]( | Module | Property key prefix | Cached content (typical) | |---|---|---| -| `partition-values` | `meta.cache.maxcompute.partition-values.` | Partition values list (reduces repeated remote listing overhead). 
| +| `schema` | `meta.cache.maxcompute.schema.` | Schema cache entry for table schema loading. | +| `partition_values` | `meta.cache.maxcompute.partition_values.` | Partition values cache entry used for partition pruning and partition enumeration. | -Example: +Notes: -```sql -ALTER CATALOG mc_ctl SET PROPERTIES ( - "meta.cache.maxcompute.partition-values.ttl-second" = "3600", - "meta.cache.maxcompute.partition-values.capacity" = "5000" -); -``` +- Property keys use the module names shown above. The same names appear as `ENTRY_NAME` in `information_schema.catalog_meta_cache_statistics`. +- `partition_values` is configured through `meta.cache.maxcompute.partition_values.*`. +- The stats table exposes `partition_values` and `schema` as the two MaxCompute entries. +- There is no dedicated MaxCompute catalog-level hot-reload hook for `meta.cache.maxcompute.*`. ### Observability {#meta-cache-unified-observability} MaxCompute cache metrics are available in `information_schema.catalog_meta_cache_statistics`. For the table definition and metric meanings, see: [catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md). 
-The `cache_name` value for MaxCompute module is: +Common MaxCompute entries: -| Module | cache_name | +| Entry | Meaning | |---|---| -| `partition-values` | `maxcompute_partition_values_cache` | +| `schema` | Schema cache entry | +| `partition_values` | Partition values cache entry | Example query: ```sql -SELECT * +SELECT catalog_name, engine_name, entry_name, + effective_enabled, ttl_second, capacity, + estimated_size, hit_rate, load_failure_count, last_error FROM information_schema.catalog_meta_cache_statistics WHERE catalog_name = 'mc_ctl' - AND cache_name LIKE 'maxcompute_%' -ORDER BY cache_name, metric_name; + AND engine_name = 'maxcompute' +ORDER BY entry_name; ``` ### Supported MaxCompute Versions diff --git a/docs/lakehouse/catalogs/paimon-catalog.mdx b/docs/lakehouse/catalogs/paimon-catalog.mdx index 6dc6f4acca5a1..0f2e5d0d97d6a 100644 --- a/docs/lakehouse/catalogs/paimon-catalog.mdx +++ b/docs/lakehouse/catalogs/paimon-catalog.mdx @@ -93,7 +93,7 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( ## Metadata Cache (4.1.x+) {#meta-cache-unified} Starting from Doris 4.1.x, Paimon Catalog metadata caches are configured with the unified `meta.cache.*` properties. -This section focuses on **how to use** and **how to observe** the Paimon-related cache modules. +This section covers configuration and observability for Paimon-related cache modules. For the unified property semantics, see: [Unified External Meta Cache (4.1.x+)](../meta-cache/unified-meta-cache.md). @@ -101,9 +101,16 @@ For the unified property semantics, see: [Unified External Meta Cache (4.1.x+)]( | Module | Property key prefix | Cached content (typical) | |---|---|---| +| `schema` | `meta.cache.paimon.schema.` | Schema cache entry for table schema loading. | | `table` | `meta.cache.paimon.table.` | Paimon table metadata used for query planning (schema/snapshot/partition related metadata, depending on workload). 
| -Example (disable module cache and always load on demand): +Notes: + +- Property keys use the module names shown above. The same names appear as `ENTRY_NAME` in `information_schema.catalog_meta_cache_statistics`. +- `schema` and `table` are separate entries. `schema` uses FE defaults unless `meta.cache.paimon.schema.*` is configured on the catalog. +- `ALTER CATALOG ... SET PROPERTIES` updates are applied through the unified hot-reload path. + +Example: ```sql ALTER CATALOG paimon_ctl SET PROPERTIES ( @@ -116,20 +123,23 @@ ALTER CATALOG paimon_ctl SET PROPERTIES ( Paimon cache metrics are available in `information_schema.catalog_meta_cache_statistics`. For the table definition and metric meanings, see: [catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md). -The `cache_name` value for Paimon module is: +Common Paimon entries: -| Module | cache_name | +| Entry | Meaning | |---|---| -| `table` | `paimon_table_cache` | +| `schema` | Schema cache entry | +| `table` | Table metadata cache entry | Example query: ```sql -SELECT * +SELECT catalog_name, engine_name, entry_name, + effective_enabled, ttl_second, capacity, + estimated_size, hit_rate, load_failure_count, last_error FROM information_schema.catalog_meta_cache_statistics WHERE catalog_name = 'paimon_ctl' - AND cache_name LIKE 'paimon_%' -ORDER BY cache_name, metric_name; + AND engine_name = 'paimon' +ORDER BY entry_name; ``` ### Supported Paimon Versions diff --git a/docs/lakehouse/meta-cache.md b/docs/lakehouse/meta-cache.md index a16c738974a5a..9a11f4aa84135 100644 --- a/docs/lakehouse/meta-cache.md +++ b/docs/lakehouse/meta-cache.md @@ -21,6 +21,13 @@ This document applies to versions after 2.1.6. :::note For Doris 4.1.x and later, external meta cache has been refactored with unified configuration keys `meta.cache.*`. See [Unified External Meta Cache (4.1.x+)](./meta-cache/unified-meta-cache.md). 
+ +Starting from Doris 4.1.x, external metadata caching can be understood as two layers: + +- Generic catalog caches: database/table name lists and database/table objects. These are still controlled by FE configs such as `max_meta_object_cache_num`, `external_cache_refresh_time_minutes`, and `external_cache_expire_time_seconds_after_access`. +- Engine-specific entry caches: schema, partition metadata, manifests, file lists, and similar engine-dependent entries. These use unified per-catalog keys in the form `meta.cache.<engine>.<entry>.{enable,ttl-second,capacity}`. + +The unified document focuses on the second layer. ::: ## Cache Strategies diff --git a/docs/lakehouse/meta-cache/unified-meta-cache.md b/docs/lakehouse/meta-cache/unified-meta-cache.md index 3bc23173c44fe..770ec24fb4198 100644 --- a/docs/lakehouse/meta-cache/unified-meta-cache.md +++ b/docs/lakehouse/meta-cache/unified-meta-cache.md @@ -20,7 +20,7 @@ Applies to Doris 4.1.x and later. ## Unified Property Model -All engine cache modules share the same property key pattern: +All engine cache entries share the same property key pattern: `meta.cache.<engine>.<entry>.{enable,ttl-second,capacity}` @@ -32,7 +32,12 @@ The following table describes the property semantics: | `ttl-second` | `600`, `0`, `-1` | `0` disables the module; `-1` means no expiration; otherwise expire after access by TTL. | | `capacity` | `10000` | Max entry count (count-based). `0` disables the module. | -Example (edit catalog properties): +Notes: + +- `<entry>` uses the cache entry name shown in the catalog documentation and the stats table, for example `partition_values`, `fs_view`, `meta_client`. +- There is currently no per-entry refresh interval property. Async refresh behavior still uses the FE config `external_cache_refresh_time_minutes`. + +Example: ```sql ALTER CATALOG hive_ctl SET PROPERTIES ( @@ -42,12 +47,29 @@ ALTER CATALOG hive_ctl SET PROPERTIES ( ## What External Meta Cache Includes -External meta cache covers different kinds of metadata.
Some are configured by unified catalog properties, and some are controlled by FE configs: +There are two layers of metadata caching that are easy to confuse: + +- Catalog object/name caches: `SHOW DATABASES`, `SHOW TABLES`, database objects, table objects, and related generic caches described in [Metadata Cache](../meta-cache.md). +- Engine entry caches: engine-specific runtime metadata such as Hive partitions/files, Iceberg manifests, Paimon table handles, and schema entries. This page focuses on this layer. + +External meta cache entries cover different kinds of metadata. Some are configured by unified catalog properties, and some also inherit FE-level defaults: | Category | Examples | How to configure | |---|---|---| -| Engine module caches | Hive partitions/files, Iceberg manifests, Paimon table metadata, etc. | Catalog `PROPERTIES`: `meta.cache.<engine>.<module>.*` | -| Schema cache | Table schema, isolated by schema version token | FE configs (for example: `max_external_schema_cache_num`) | +| Engine entry caches | Hive `partition_values` / `partition` / `file`, Iceberg `manifest`, Paimon `table`, etc. | Catalog `PROPERTIES`: `meta.cache.<engine>.<entry>.*` | +| Schema cache | Per-engine `schema` entry, isolated by schema version token | FE configs provide defaults; catalog `meta.cache.<engine>.schema.*` can override them | + +## Support Matrix + +The following table summarizes the current implementation: + +| Engine | Entries you will see in stats | Property key prefix | `ALTER CATALOG ...
SET PROPERTIES` hot-reload | +|---|---|---|---| +| Hive | `schema`, `partition_values`, `partition`, `file` | `meta.cache.hive.<entry>.*` | Changes to `meta.cache.hive.*` are not applied through the unified hot-reload path; recreate the catalog or restart FE to apply new specs | +| Iceberg | `schema`, `table`, `view`, `manifest` | `meta.cache.iceberg.<entry>.*` | Supported | +| Paimon | `schema`, `table` | `meta.cache.paimon.<entry>.*` | Supported | +| Hudi | `schema`, `partition`, `fs_view`, `meta_client` | `meta.cache.hudi.<entry>.*` | Supported through HMS catalog property updates | +| MaxCompute | `schema`, `partition_values` | `meta.cache.maxcompute.<entry>.*` | No dedicated hot-reload hook | ## Catalog-Specific Configuration (Links) @@ -66,18 +88,24 @@ For each catalog engine, the supported cache modules and the recommended propert Use the system table to observe cache metrics: ```sql -SELECT * +SELECT catalog_name, engine_name, entry_name, + effective_enabled, ttl_second, capacity, + estimated_size, hit_rate, load_failure_count, last_error FROM information_schema.catalog_meta_cache_statistics -ORDER BY catalog_name, cache_name, metric_name; +ORDER BY catalog_name, engine_name, entry_name; ``` This table is documented at: [catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md). -Naming convention: +Read the table as follows: | Field | Convention | |---|---| -| `cache_name` | `<engine>_<module>_cache` (module `-` is converted to `_`) | +| `ENGINE_NAME` | Cache engine, such as `hive` or `iceberg` | +| `ENTRY_NAME` | Exact entry name used by that engine, such as `partition_values`, `fs_view`, `manifest` | +| `EFFECTIVE_ENABLED` | Final enable state after evaluating `enable`, `ttl-second`, and `capacity` | + +Common queries filter by `catalog_name` and `engine_name`. This table no longer uses the old `cache_name` / `metric_name` pivoted model.
## Migration Note (Legacy Properties) diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md index b2deb486b69d3..4aa3cff765b86 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md @@ -18,41 +18,47 @@ ## 表信息 -| 列名 | 类型 | 说明 | +该表中一行表示“某个 FE 上、某个 external catalog 的一个 cache entry”的统计快照。 + +| 列名 | 类型 | 说明 | | :----------- | :--- | :----------- | +| FE_HOST | text | 上报该统计的 FE 主机 | | CATALOG_NAME | text | Catalog 名字 | -| CACHE_NAME | text | 缓存名字 | -| METRIC_NAME | text | 指标名字 | -| METRIC_VALUE | text | 指标值 | +| ENGINE_NAME | text | Meta cache 引擎名,如 `hive`、`iceberg`、`paimon` | +| ENTRY_NAME | text | 引擎内部的 cache entry 名,如 `schema`、`file`、`manifest` | +| EFFECTIVE_ENABLED | boolean | 综合 `enable` / `ttl-second` / `capacity` 后,该缓存是否真正生效 | +| CONFIG_ENABLED | boolean | 配置中的原始 `enable` 值 | +| AUTO_REFRESH | boolean | 该 entry 是否启用异步 refresh-after-write | +| TTL_SECOND | bigint | TTL 秒数。`0` 表示关闭,`-1` 表示永不过期 | +| CAPACITY | bigint | 最大条目数 | +| ESTIMATED_SIZE | bigint | 当前缓存条目估计数 | +| REQUEST_COUNT | bigint | 总请求数 | +| HIT_COUNT | bigint | 命中次数 | +| MISS_COUNT | bigint | 未命中次数 | +| HIT_RATE | double | 命中率 | +| LOAD_SUCCESS_COUNT | bigint | 成功加载次数 | +| LOAD_FAILURE_COUNT | bigint | 失败加载次数 | +| TOTAL_LOAD_TIME_MS | bigint | 总加载耗时,单位毫秒 | +| AVG_LOAD_PENALTY_MS | double | 平均加载耗时,单位毫秒 | +| EVICTION_COUNT | bigint | 被驱逐条目数 | +| INVALIDATE_COUNT | bigint | 显式失效次数 | +| LAST_LOAD_SUCCESS_TIME | text | 最近一次成功加载时间 | +| LAST_LOAD_FAILURE_TIME | text | 最近一次失败加载时间 | +| LAST_ERROR | text | 最近一次加载失败错误信息 | ## 使用示例 -```text -mysql> select * from 
catalog_meta_cache_statistics; -+----------------------+-----------------------------+----------------------+----------------------+ -| CATALOG_NAME | CACHE_NAME | METRIC_NAME | METRIC_VALUE | -+----------------------+-----------------------------+----------------------+----------------------+ -| hive_iceberg_minio | iceberg_table_cache | eviction_count | 0 | -| hive_iceberg_minio | iceberg_table_cache | hit_ratio | 0.2413793103448276 | -| hive_iceberg_minio | iceberg_table_cache | average_load_penalty | 2.4654859845454547E8 | -| hive_iceberg_minio | iceberg_table_cache | estimated_size | 22 | -| hive_iceberg_minio | iceberg_table_cache | hit_count | 7 | -| hive_iceberg_minio | iceberg_table_cache | read_count | 29 | -| hive_iceberg_minio | iceberg_snapshot_cache | eviction_count | 0 | -| hive_iceberg_minio | iceberg_snapshot_cache | hit_ratio | 1.0 | -| hive_iceberg_minio | iceberg_snapshot_cache | average_load_penalty | 0.0 | -| hive_iceberg_minio | iceberg_snapshot_cache | estimated_size | 0 | -| hive_iceberg_minio | iceberg_snapshot_cache | hit_count | 0 | -| hive_iceberg_minio | iceberg_snapshot_cache | read_count | 0 | -+----------------------+-----------------------------+----------------------+----------------------+ +```sql +SELECT catalog_name, engine_name, entry_name, + effective_enabled, ttl_second, capacity, + estimated_size, hit_rate, last_error +FROM information_schema.catalog_meta_cache_statistics +ORDER BY catalog_name, engine_name, entry_name; ``` -METRIC_NAME 列包含以下 Caffeine 缓存性能指标: +常见用法: -- eviction_count:从缓存中驱逐的条目数量 -- hit_ratio:缓存命中率,范围从 0.0 到 1.0 -- average_load_penalty:加载新值的平均耗时(纳秒) -- estimated_size:缓存中条目的估计数量 -- hit_count:缓存查找方法返回缓存值的次数 -- read_count:缓存查找方法被调用的总次数 \ No newline at end of file +- 用 `ENGINE_NAME` + `ENTRY_NAME` 定位具体的逻辑缓存。 +- 用 `EFFECTIVE_ENABLED`、`TTL_SECOND`、`CAPACITY` 确认实际生效的缓存策略。 +- 用 `HIT_RATE`、`ESTIMATED_SIZE`、`LOAD_FAILURE_COUNT`、`LAST_ERROR` 排查缓存行为。 diff --git 
a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/hive-catalog.mdx b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/hive-catalog.mdx index ab3cc43906673..40d4a93b842c6 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/hive-catalog.mdx +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/hive-catalog.mdx @@ -80,7 +80,8 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( ## 元数据缓存(4.1.x+) {#meta-cache-unified} -从 Doris 4.1.x 开始,Hive Catalog 的外表元数据缓存使用统一键 `meta.cache.*` 进行配置。本节只介绍**如何使用**与**如何观测**。 +从 Doris 4.1.x 开始,Hive Catalog 的外表元数据缓存使用统一键 `meta.cache.*` 进行配置。 +本节说明 Hive 相关 cache 模块的配置与观测方式。 统一属性语义可参阅:[统一外表元数据缓存(4.1.x+)](../meta-cache/unified-meta-cache.md)。 @@ -88,11 +89,18 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( | 模块 | 属性键前缀 | 典型缓存内容 | |---|---|---| -| `partition-values` | `meta.cache.hive.partition-values.` | 分区值/分区名称列表(常用于分区剪枝与分区枚举)。 | +| `schema` | `meta.cache.hive.schema.` | 表 schema 加载对应的 schema cache entry。 | +| `partition_values` | `meta.cache.hive.partition_values.` | 分区值/分区名称列表(常用于分区剪枝与分区枚举)。 | | `partition` | `meta.cache.hive.partition.` | 分区属性(location、输入格式、存储描述等)。 | | `file` | `meta.cache.hive.file.` | 分区/表路径下的文件列表(减少远端 LIST 开销)。 | -示例(为保证新鲜度,关闭文件列表缓存): +说明: + +- 属性键使用上表中的模块名。这些名字也会出现在 `information_schema.catalog_meta_cache_statistics` 的 `ENTRY_NAME` 中。 +- 修改旧参数 `file.meta.cache.ttl-second`、`partition.cache.ttl-second` 时,可以触发 Hive cache 重建相关路径。 +- 对已经初始化的 catalog,修改统一键 `meta.cache.hive.*` 时,当前版本不会完整热更新已有 Hive cache entry。要确保新配置生效,需要重建 catalog 或重启 FE。 + +示例: ```sql ALTER CATALOG hive_ctl SET PROPERTIES ( @@ -105,22 +113,25 @@ ALTER CATALOG hive_ctl SET PROPERTIES ( Hive 缓存指标可通过 `information_schema.catalog_meta_cache_statistics` 查询。 系统表字段与指标说明见:[catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md)。 -Hive 各模块对应的 `cache_name` 如下: +Hive 常见 entry: -| 模块 | cache_name | +| 
Entry | 含义 | |---|---| -| `partition-values` | `hive_partition_values_cache` | -| `partition` | `hive_partition_cache` | -| `file` | `hive_file_cache` | +| `schema` | Schema cache entry | +| `partition_values` | 分区名称 / 分区值缓存 entry | +| `partition` | 分区属性缓存 entry | +| `file` | 文件列表缓存 entry | -示例(只看某个 catalog 的 Hive 缓存): +示例查询: ```sql -SELECT * +SELECT catalog_name, engine_name, entry_name, + effective_enabled, ttl_second, capacity, + estimated_size, hit_rate, load_failure_count, last_error FROM information_schema.catalog_meta_cache_statistics WHERE catalog_name = 'hive_ctl' - AND cache_name LIKE 'hive_%' -ORDER BY cache_name, metric_name; + AND engine_name = 'hive' +ORDER BY entry_name; ``` ### 支持的 Hive 版本 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/hudi-catalog.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/hudi-catalog.md index 51eddfb08dc86..98e151ed3646b 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/hudi-catalog.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/hudi-catalog.md @@ -53,7 +53,8 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( ## 元数据缓存(4.1.x+) {#meta-cache-unified} -从 Doris 4.1.x 开始,Hudi 相关外表元数据缓存使用统一键 `meta.cache.*` 进行配置。本节只介绍**如何使用**与**如何观测**。 +从 Doris 4.1.x 开始,Hudi 相关外表元数据缓存使用统一键 `meta.cache.*` 进行配置。 +本节说明 Hudi 相关 cache 模块的配置与观测方式。 统一属性语义可参阅:[统一外表元数据缓存(4.1.x+)](../meta-cache/unified-meta-cache.md)。 @@ -61,15 +62,21 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( | 模块 | 属性键前缀 | 典型缓存内容 | |---|---|---| +| `schema` | `meta.cache.hudi.schema.` | 表 schema 加载对应的 schema cache entry。 | | `partition` | `meta.cache.hudi.partition.` | Hudi 分区相关元数据(用于分区发现/剪枝等)。 | -| `fs-view` | `meta.cache.hudi.fs-view.` | Hudi FS View 相关元数据。 | -| `meta-client` | `meta.cache.hudi.meta-client.` | Hudi Meta Client 相关元数据。 | +| `fs_view` | `meta.cache.hudi.fs_view.` | Hudi FS View 相关元数据。 | +| `meta_client` | 
`meta.cache.hudi.meta_client.` | Hudi Meta Client 相关元数据。 | -示例(通过降低 capacity 控制缓存规模): +说明: + +- 属性键使用上表中的模块名。这些名字也会出现在 `information_schema.catalog_meta_cache_statistics` 的 `ENTRY_NAME` 中。 +- 如果 Hudi 表是通过 HMS catalog 提供访问的,`meta.cache.hudi.*` 也配置在该 HMS catalog 上。 + +示例: ```sql ALTER CATALOG hudi_ctl SET PROPERTIES ( - "meta.cache.hudi.partition.capacity" = "2000" + "meta.cache.hudi.fs_view.capacity" = "2000" ); ``` @@ -78,22 +85,25 @@ ALTER CATALOG hudi_ctl SET PROPERTIES ( Hudi 缓存指标可通过 `information_schema.catalog_meta_cache_statistics` 查询。 系统表字段与指标说明见:[catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md)。 -Hudi 各模块对应的 `cache_name` 如下: +Hudi 常见 entry: -| 模块 | cache_name | +| Entry | 含义 | |---|---| -| `partition` | `hudi_partition_cache` | -| `fs-view` | `hudi_fs_view_cache` | -| `meta-client` | `hudi_meta_client_cache` | +| `schema` | Schema cache entry | +| `partition` | 分区元数据缓存 entry | +| `fs_view` | FS View 缓存 entry | +| `meta_client` | Meta Client 缓存 entry | -示例: +示例查询: ```sql -SELECT * +SELECT catalog_name, engine_name, entry_name, + effective_enabled, ttl_second, capacity, + estimated_size, hit_rate, load_failure_count, last_error FROM information_schema.catalog_meta_cache_statistics WHERE catalog_name = 'hudi_ctl' - AND cache_name LIKE 'hudi_%' -ORDER BY cache_name, metric_name; + AND engine_name = 'hudi' +ORDER BY entry_name; ``` ### 支持的 Hudi 版本 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/iceberg-catalog.mdx b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/iceberg-catalog.mdx index 142e6e7104c10..d7b1c05c92956 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/iceberg-catalog.mdx +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/iceberg-catalog.mdx @@ -89,7 +89,8 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( ## 元数据缓存(4.1.x+) {#meta-cache-unified} -从 Doris 4.1.x 
开始,Iceberg Catalog 的外表元数据缓存使用统一键 `meta.cache.*` 进行配置。本节只介绍**如何使用**与**如何观测**。 +从 Doris 4.1.x 开始,Iceberg Catalog 的外表元数据缓存使用统一键 `meta.cache.*` 进行配置。 +本节说明 Iceberg 相关 cache 模块的配置与观测方式。 统一属性语义可参阅:[统一外表元数据缓存(4.1.x+)](../meta-cache/unified-meta-cache.md)。 @@ -97,13 +98,23 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( | 模块 | 属性键前缀 | 典型缓存内容 | |---|---|---| +| `schema` | `meta.cache.iceberg.schema.` | 表 schema 加载对应的 schema cache entry。 | | `table` | `meta.cache.iceberg.table.` | Iceberg 表元数据对象(减少 catalog/metastore 往返)。 | +| `view` | `meta.cache.iceberg.view.` | Iceberg View 元数据对象。 | | `manifest` | `meta.cache.iceberg.manifest.` | manifest 相关元数据(减少重复读取 manifest 的开销)。 | -示例(缩短 manifest TTL,优先新鲜度): +说明: + +- 属性键使用上表中的模块名。这些名字也会出现在 `information_schema.catalog_meta_cache_statistics` 的 `ENTRY_NAME` 中。 +- 当前实现中,`manifest` 默认是关闭的,调 TTL / capacity 之前要先显式打开。 +- `view` entry 只有在 Doris 访问 Iceberg View 时才会出现。 +- `ALTER CATALOG ... SET PROPERTIES` 的修改通过统一热生效路径应用。 + +示例: ```sql ALTER CATALOG iceberg_ctl SET PROPERTIES ( + "meta.cache.iceberg.manifest.enable" = "true", "meta.cache.iceberg.manifest.ttl-second" = "600" ); ``` @@ -113,21 +124,25 @@ ALTER CATALOG iceberg_ctl SET PROPERTIES ( Iceberg 缓存指标可通过 `information_schema.catalog_meta_cache_statistics` 查询。 系统表字段与指标说明见:[catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md)。 -Iceberg 各模块对应的 `cache_name` 如下: +Iceberg 常见 entry: -| 模块 | cache_name | +| Entry | 含义 | |---|---| -| `table` | `iceberg_table_cache` | -| `manifest` | `iceberg_manifest_cache` | +| `schema` | Schema cache entry | +| `table` | 表元数据缓存 entry | +| `view` | View 元数据缓存 entry | +| `manifest` | Manifest payload 缓存 entry | -示例: +示例查询: ```sql -SELECT * +SELECT catalog_name, engine_name, entry_name, + effective_enabled, ttl_second, capacity, + estimated_size, hit_rate, load_failure_count, last_error FROM information_schema.catalog_meta_cache_statistics WHERE catalog_name = 'iceberg_ctl' - AND cache_name 
LIKE 'iceberg_%' -ORDER BY cache_name, metric_name; + AND engine_name = 'iceberg' +ORDER BY entry_name; ``` ### 支持的 Iceberg 版本 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/maxcompute-catalog.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/maxcompute-catalog.md index 418825065f6e7..9fcbc9b98277a 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/maxcompute-catalog.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/maxcompute-catalog.md @@ -113,7 +113,8 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( ## 元数据缓存(4.1.x+) {#meta-cache-unified} -从 Doris 4.1.x 开始,MaxCompute Catalog 的外表元数据缓存使用统一键 `meta.cache.*` 进行配置。本节只介绍**如何使用**与**如何观测**。 +从 Doris 4.1.x 开始,MaxCompute Catalog 的外表元数据缓存使用统一键 `meta.cache.*` 进行配置。 +本节说明 MaxCompute 相关 cache 模块的配置与观测方式。 统一属性语义可参阅:[统一外表元数据缓存(4.1.x+)](../meta-cache/unified-meta-cache.md)。 @@ -121,36 +122,38 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( | 模块 | 属性键前缀 | 典型缓存内容 | |---|---|---| -| `partition-values` | `meta.cache.maxcompute.partition-values.` | 分区值列表(减少重复的远端枚举开销)。 | +| `schema` | `meta.cache.maxcompute.schema.` | 表 schema 加载对应的 schema cache entry。 | +| `partition_values` | `meta.cache.maxcompute.partition_values.` | 分区值缓存 entry,用于分区剪枝与分区枚举。 | -示例: +说明: -```sql -ALTER CATALOG mc_ctl SET PROPERTIES ( - "meta.cache.maxcompute.partition-values.ttl-second" = "3600", - "meta.cache.maxcompute.partition-values.capacity" = "5000" -); -``` +- 属性键使用上表中的模块名。这些名字也会出现在 `information_schema.catalog_meta_cache_statistics` 的 `ENTRY_NAME` 中。 +- `partition_values` 通过 `meta.cache.maxcompute.partition_values.*` 配置。 +- stats 表里能直接看到的 MaxCompute entry 只有 `partition_values` 和 `schema`。 +- `meta.cache.maxcompute.*` 目前没有专门的热生效 hook。 ### 可观测性 {#meta-cache-unified-observability} MaxCompute 缓存指标可通过 `information_schema.catalog_meta_cache_statistics` 查询。 
系统表字段与指标说明见:[catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md)。 -MaxCompute 模块对应的 `cache_name` 如下: +MaxCompute 常见 entry: -| 模块 | cache_name | +| Entry | 含义 | |---|---| -| `partition-values` | `maxcompute_partition_values_cache` | +| `schema` | Schema cache entry | +| `partition_values` | 分区值缓存 entry | -示例: +示例查询: ```sql -SELECT * +SELECT catalog_name, engine_name, entry_name, + effective_enabled, ttl_second, capacity, + estimated_size, hit_rate, load_failure_count, last_error FROM information_schema.catalog_meta_cache_statistics WHERE catalog_name = 'mc_ctl' - AND cache_name LIKE 'maxcompute_%' -ORDER BY cache_name, metric_name; + AND engine_name = 'maxcompute' +ORDER BY entry_name; ``` ### 支持的 MaxCompute 版本 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/paimon-catalog.mdx b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/paimon-catalog.mdx index b210df53b32c4..d277f5ac1de00 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/paimon-catalog.mdx +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/paimon-catalog.mdx @@ -92,7 +92,8 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( ## 元数据缓存(4.1.x+) {#meta-cache-unified} -从 Doris 4.1.x 开始,Paimon Catalog 的外表元数据缓存使用统一键 `meta.cache.*` 进行配置。本节只介绍**如何使用**与**如何观测**。 +从 Doris 4.1.x 开始,Paimon Catalog 的外表元数据缓存使用统一键 `meta.cache.*` 进行配置。 +本节说明 Paimon 相关 cache 模块的配置与观测方式。 统一属性语义可参阅:[统一外表元数据缓存(4.1.x+)](../meta-cache/unified-meta-cache.md)。 @@ -100,9 +101,16 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( | 模块 | 属性键前缀 | 典型缓存内容 | |---|---|---| +| `schema` | `meta.cache.paimon.schema.` | 表 schema 加载对应的 schema cache entry。 | | `table` | `meta.cache.paimon.table.` | Paimon 表元数据(用于查询规划,实际涉及 schema/snapshot/partition 等元数据加载)。 | -示例(关闭 module 缓存,按需实时加载): +说明: + +- 属性键使用上表中的模块名。这些名字也会出现在 `information_schema.catalog_meta_cache_statistics` 的 
`ENTRY_NAME` 中。 +- `schema` 和 `table` 是两个独立 entry。若 catalog 未设置 `meta.cache.paimon.schema.*`,`schema` 使用 FE 默认值。 +- `ALTER CATALOG ... SET PROPERTIES` 的修改通过统一热生效路径应用。 + +示例: ```sql ALTER CATALOG paimon_ctl SET PROPERTIES ( @@ -115,20 +123,23 @@ ALTER CATALOG paimon_ctl SET PROPERTIES ( Paimon 缓存指标可通过 `information_schema.catalog_meta_cache_statistics` 查询。 系统表字段与指标说明见:[catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md)。 -Paimon 模块对应的 `cache_name` 如下: +Paimon 常见 entry: -| 模块 | cache_name | +| Entry | 含义 | |---|---| -| `table` | `paimon_table_cache` | +| `schema` | Schema cache entry | +| `table` | 表元数据缓存 entry | -示例: +示例查询: ```sql -SELECT * +SELECT catalog_name, engine_name, entry_name, + effective_enabled, ttl_second, capacity, + estimated_size, hit_rate, load_failure_count, last_error FROM information_schema.catalog_meta_cache_statistics WHERE catalog_name = 'paimon_ctl' - AND cache_name LIKE 'paimon_%' -ORDER BY cache_name, metric_name; + AND engine_name = 'paimon' +ORDER BY entry_name; ``` ### 支持的 Paimon 版本 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/meta-cache.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/meta-cache.md index 26c6cf5dcbed2..51454fd769c6e 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/meta-cache.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/meta-cache.md @@ -21,6 +21,13 @@ :::note 对于 Doris 4.1.x 及之后版本,外表元数据缓存已重构并使用统一配置键 `meta.cache.*`。 请参阅[统一外表元数据缓存(4.1.x+)](./meta-cache/unified-meta-cache.md)。 + +从 Doris 4.1.x 开始,外表元数据缓存可以分成两层来理解: + +- 通用 catalog 缓存:库/表名称列表、库/表对象等,仍由 `max_meta_object_cache_num`、`external_cache_refresh_time_minutes`、`external_cache_expire_time_seconds_after_access` 等 FE 配置控制。 +- 引擎特定 entry 缓存:schema、分区元数据、manifest、文件列表等,这些按 catalog 使用统一键 `meta.cache...{enable,ttl-second,capacity}` 配置。 + +统一文档主要描述第二层。 ::: ## 缓存策略 diff --git 
a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/meta-cache/unified-meta-cache.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/meta-cache/unified-meta-cache.md index 94dfc574c287b..a7cf6c755cebf 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/meta-cache/unified-meta-cache.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/meta-cache/unified-meta-cache.md @@ -20,7 +20,7 @@ ## 统一属性模型 -各引擎缓存 module 使用统一的配置键格式: +各引擎 cache entry 使用统一的配置键格式: `meta.cache...{enable,ttl-second,capacity}` @@ -32,7 +32,12 @@ | `ttl-second` | `600`、`0`、`-1` | `0` 表示关闭;`-1` 表示永不过期;其他值表示按访问时间计算 TTL。 | | `capacity` | `10000` | 最大缓存条目数(按条目数量计)。`0` 表示关闭。 | -示例(修改 catalog properties): +说明: + +- `` 使用 catalog 文档和 stats 表中展示的 cache entry 名,例如 `partition_values`、`fs_view`、`meta_client`。 +- 当前没有 per-entry 的刷新周期参数。异步刷新周期仍由 FE 配置 `external_cache_refresh_time_minutes` 统一控制。 + +示例: ```sql ALTER CATALOG hive_ctl SET PROPERTIES ( @@ -42,12 +47,29 @@ ALTER CATALOG hive_ctl SET PROPERTIES ( ## 外表 Meta Cache 覆盖范围 -外表元数据缓存覆盖多种元数据类型。其中一部分由统一 `meta.cache.*` 键配置,另一部分由 FE 配置控制: +这里有两层缓存,比较容易混淆: + +- Catalog 对象/名称缓存:如 `SHOW DATABASES`、`SHOW TABLES`、库对象、表对象等,见 [元数据缓存](../meta-cache.md)。 +- 引擎 entry 缓存:如 Hive 分区/文件、Iceberg manifest、Paimon table handle、schema entry 等。本文主要讲这一层。 + +外表元数据 cache entry 覆盖多种元数据类型。其中一部分由统一 `meta.cache.*` 键配置,另一部分同时继承 FE 级默认值: | 类别 | 示例 | 配置方式 | |---|---|---| -| 引擎 module 缓存 | Hive 分区/文件、Iceberg manifest、Paimon 表元数据等 | Catalog `PROPERTIES`:`meta.cache...*` | -| Schema cache | 表 schema(按版本 token 隔离) | FE 配置(例如:`max_external_schema_cache_num`) | +| 引擎 entry 缓存 | Hive `partition_values` / `partition` / `file`、Iceberg `manifest`、Paimon `table` 等 | Catalog `PROPERTIES`:`meta.cache...*` | +| Schema cache | 各引擎自己的 `schema` entry,按 schema version token 隔离 | FE 配置提供默认值,Catalog `meta.cache..schema.*` 可覆盖 | + +## 支持矩阵 + +下面的表总结了当前实现状态: + +| 引擎 | 在 stats 表里能看到的 entry | 属性键前缀 | `ALTER CATALOG ... 
SET PROPERTIES` 热生效 | +|---|---|---|---| +| Hive | `schema`、`partition_values`、`partition`、`file` | `meta.cache.hive..*` | `meta.cache.hive.*` 的变更不会通过统一热生效路径应用;需重建 catalog 或重启 FE 后生效 | +| Iceberg | `schema`、`table`、`view`、`manifest` | `meta.cache.iceberg..*` | 支持 | +| Paimon | `schema`、`table` | `meta.cache.paimon..*` | 支持 | +| Hudi | `schema`、`partition`、`fs_view`、`meta_client` | `meta.cache.hudi..*` | 支持,通过 HMS catalog 属性更新路径生效 | +| MaxCompute | `schema`、`partition_values` | `meta.cache.maxcompute..*` | 没有专门的热生效 hook | ## 各类 Catalog 的配置入口(链接) @@ -66,19 +88,24 @@ ALTER CATALOG hive_ctl SET PROPERTIES ( 通过系统表统一观测缓存指标: ```sql -SELECT * +SELECT catalog_name, engine_name, entry_name, + effective_enabled, ttl_second, capacity, + estimated_size, hit_rate, load_failure_count, last_error FROM information_schema.catalog_meta_cache_statistics -ORDER BY catalog_name, cache_name, metric_name; +ORDER BY catalog_name, engine_name, entry_name; ``` 该系统表文档见:[catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md)。 -约定与常见指标: +可以这样理解这些字段: | 内容 | 说明 | |---|---| -| `cache_name` | `__cache`(module 中的 `-` 会被替换为 `_`) | -| 常见指标 | `hit_ratio`、`hit_count`、`read_count`、`eviction_count`、`average_load_penalty`、`estimated_size` | +| `ENGINE_NAME` | 缓存引擎,如 `hive`、`iceberg` | +| `ENTRY_NAME` | 该引擎下的精确 entry 名,如 `partition_values`、`fs_view`、`manifest` | +| `EFFECTIVE_ENABLED` | 综合 `enable`、`ttl-second`、`capacity` 后最终是否生效 | + +常见查询方式是按 `catalog_name` 和 `engine_name` 过滤。该系统表不再使用旧的 `cache_name` / `metric_name` 透视模型。 ## 旧参数迁移说明 From c8557f38735eb70ef8700b3c7f64b7b353444ff7 Mon Sep 17 00:00:00 2001 From: Socrates Date: Tue, 10 Mar 2026 16:37:11 +0800 Subject: [PATCH 4/8] docs: refine meta cache and refresh docs --- docs/lakehouse/meta-cache.md | 108 ++++++++++------- .../meta-cache/unified-meta-cache.md | 88 +++++++------- .../sql-statements/catalog/REFRESH.md | 21 +++- .../current/lakehouse/meta-cache.md | 112 ++++++++++-------- 
.../meta-cache/unified-meta-cache.md | 84 +++++++------ .../sql-statements/catalog/REFRESH.md | 21 +++- 6 files changed, 244 insertions(+), 190 deletions(-) diff --git a/docs/lakehouse/meta-cache.md b/docs/lakehouse/meta-cache.md index 9a11f4aa84135..a1dbca3ab662f 100644 --- a/docs/lakehouse/meta-cache.md +++ b/docs/lakehouse/meta-cache.md @@ -30,6 +30,9 @@ Starting from Doris 4.1.x, external metadata caching can be understood as two la The unified document focuses on the second layer. ::: +This page mainly records FE-level defaults and legacy catalog properties used by the 2.1.x / 3.x cache model. +For the current engine-specific cache entry matrix in Doris 4.1.x+, use the unified page and the catalog-specific pages. + ## Cache Strategies Most caches have the following three strategy indicators: @@ -64,6 +67,19 @@ Most caches have the following three strategy indicators: ## Cache Types +The following sections describe representative FE-level defaults and legacy cache controls. +They should not be read as the complete cache entry matrix for Doris 4.1.x+. 
+ +| Category | Scope | Main FE defaults | Notes | +|---|---|---|---| +| Database / table name lists | Per catalog / per database | `external_cache_expire_time_seconds_after_access`, `external_cache_refresh_time_minutes` | Used by `SHOW DATABASES` / `SHOW TABLES` | +| Database / table objects | Per catalog / per database | `max_meta_object_cache_num`, `external_cache_expire_time_seconds_after_access`, `external_cache_refresh_time_minutes` | Object cache can diverge temporarily from name-list cache | +| Table schema | Per catalog | `max_external_schema_cache_num`, `external_cache_expire_time_seconds_after_access`, `external_cache_refresh_time_minutes` | Legacy per-catalog override: `schema.cache.ttl-second` | +| Hive partition values | Per Hive catalog | `max_hive_partition_table_cache_num`, `external_cache_expire_time_seconds_after_access`, `external_cache_refresh_time_minutes` | Legacy per-catalog override: `partition.cache.ttl-second` | +| Hive partition properties | Per Hive catalog | `max_hive_partition_cache_num`, `external_cache_expire_time_seconds_after_access` | No legacy per-catalog TTL override | +| Hive file lists | Per Hive catalog | `max_external_file_cache_num`, `external_cache_expire_time_seconds_after_access`, `external_cache_refresh_time_minutes` | Legacy per-catalog override: `file.meta.cache.ttl-second` | +| Hudi / Iceberg / Paimon legacy table-level metadata | Per catalog | `max_external_table_cache_num`, `external_cache_expire_time_seconds_after_access`, `external_cache_refresh_time_minutes` | For Doris 4.1.x+, use the catalog pages for current cache entries such as `fs_view`, `meta_client`, `view`, and `manifest` | + ### Database and Table Name Lists The database name list refers to the list of all database names under a Catalog. @@ -96,7 +112,7 @@ Note that the list of objects in this cache may be inconsistent with the **datab For example, through the `SHOW TABLES` command, you get tables `A`, `B`, and `C` from the name list cache. 
Suppose table `D` is added to the external data source at this time, then `SELECT * FROM D` can access table `D`, and the [table object] cache will add the table `D` object, but the [table name list] cache may still be `A`, `B`, `C`. Only when the [table name list] cache is refreshed will it become `A`, `B`, `C`, `D`. -Each Catalog has a database name list cache. Each database has a table name list cache. +Each Catalog has a database object cache. Each database has a table object cache. - Maximum cache count @@ -116,7 +132,7 @@ Each Catalog has a database name list cache. Each database has a table name list Caches the schema information of tables, such as column names. This cache is mainly used to load the schema of accessed tables on demand, to prevent synchronizing a large number of unnecessary table schemas and occupying FE memory. -This cache is shared by all Catalogs and is globally unique. +This cache is managed per catalog. - Maximum cache count @@ -184,7 +200,7 @@ Used to cache the file list information under a single partition of a Hive table - Maximum cache count - Controlled by the FE configuration item `max_external_file_cache_num`, default is 100000. + Controlled by the FE configuration item `max_external_file_cache_num`, default is 10000. You can adjust this parameter appropriately according to the number of files to be accessed. @@ -198,13 +214,14 @@ Used to cache the file list information under a single partition of a Hive table - Minimum refresh time - Controlled by the FE configuration item `external_cache_expire_time_minutes_after_access`, in minutes. Default is 10 minutes. Reducing this time allows you to see the latest partition properties in Doris more in real time, but increases the frequency of accessing external data sources. + Controlled by the FE configuration item `external_cache_expire_time_minutes_after_access`, in minutes. Default is 10 minutes. 
Reducing this time allows you to see the latest file list in Doris more in real time, but increases the frequency of accessing external data sources. After version 3.0.7, the configuration item name is changed to `external_cache_refresh_time_minutes`. The default value remains unchanged. ### Hudi Table Partitions -Used to cache partition information of Hudi tables. +Legacy summary of Hudi partition metadata caching. +Current Hudi cache entries in Doris 4.1.x+ also include `fs_view` and `meta_client`; see [Hudi Catalog](./catalogs/hudi-catalog.md#meta-cache-unified). This cache, each Hudi Catalog has one. @@ -226,7 +243,8 @@ This cache, each Hudi Catalog has one. ### Iceberg Table Information -Used to cache Iceberg table objects. The object is loaded and constructed through the Iceberg API. +Legacy summary of Iceberg table metadata caching. The table object is loaded and constructed through the Iceberg API. +For Doris 4.1.x+, the current observable cache entries are documented in [Iceberg Catalog](./catalogs/iceberg-catalog.mdx#meta-cache-unified). This cache, each Iceberg Catalog has one. @@ -246,10 +264,10 @@ This cache, each Iceberg Catalog has one. After version 3.0.7, the configuration item name is changed to `external_cache_refresh_time_minutes`. The default value remains unchanged. -### Iceberg Table Snapshot +### Iceberg Snapshot-Related Metadata -Used to cache the snapshot list of Iceberg tables. The object is loaded and constructed through the Iceberg API. -This cache, each Iceberg Catalog has one. +Legacy summary of snapshot-related metadata derived from Iceberg table metadata. +In current implementations, this should not be read as a separate 4.1.x cache entry alongside `table`, `view`, or `manifest`. - Maximum cache count @@ -269,37 +287,19 @@ This cache, each Iceberg Catalog has one. ## Cache Refresh -In addition to the refresh and eviction strategies of each cache above, users can also directly refresh the metadata cache manually or on a schedule. 
+In addition to the refresh and eviction strategies above, users can also refresh metadata manually or on a schedule. ### Manual Refresh -Users can manually refresh metadata using the `REFRESH` command. - -1. REFRESH CATALOG - - Refresh the specified Catalog. - - `REFRESH CATALOG ctl1 PROPERTIES("invalid_cache" = "true");` - - - This command refreshes the database list, table column names, and all cache information of the specified Catalog. - - `invalid_cache` indicates whether to refresh caches such as partitions and file lists. The default is true. If false, only the database and table lists of the Catalog will be refreshed, but not caches such as partitions and file lists. This parameter is suitable for cases where the user only wants to synchronize newly added or deleted databases and tables, and can be set to false. - -2. REFRESH DATABASE - - Refresh the specified Database. - - `REFRESH DATABASE [ctl.]db1 PROPERTIES("invalid_cache" = "true");` - - - This command refreshes the table column names and all cache information under the specified Database. - - The meaning of the `invalid_cache` property is the same as above. The default is true. If false, only the table list of the Database will be refreshed, but not the cache information. This parameter is suitable for cases where the user only wants to synchronize newly added or deleted tables. +Use the `REFRESH` statement to invalidate catalog, database, or table metadata. +For current syntax, privileges, and examples, see [REFRESH](../sql-manual/sql-statements/catalog/REFRESH.md). -3. REFRESH TABLE +Behavior summary: - Refresh the specified Table. - - `REFRESH TABLE [ctl.][db.]tbl1;` - - - This command refreshes all cache information under the specified Table. +- `REFRESH CATALOG` invalidates catalog-level object caches and, by default, lower-level metadata caches. +- `REFRESH DATABASE` invalidates metadata under one database. +- `REFRESH TABLE` invalidates metadata for one table. 
+- For `REFRESH CATALOG`, `invalid_cache = false` keeps lower-level caches and refreshes only object/name lists. ### Scheduled Refresh @@ -315,7 +315,7 @@ CREATE CATALOG hive PROPERTIES ( In the above example, `metadata_refresh_interval_sec` means the Catalog is refreshed every 3600 seconds. This is equivalent to automatically executing once every 3600 seconds: -`REFRESH CATALOG ctl1 PROPERTIES("invalid_cache" = "true");` +`REFRESH CATALOG ctl1;` ## Best Practices @@ -334,9 +334,8 @@ This section mainly introduces the cache behavior that users may be concerned ab For all types of External Catalogs, if you want to see the latest Table Schema in real time, you can disable the Schema Cache: :::note -Starting from Doris 4.1.x, the legacy catalog-level cache property `schema.cache.ttl-second` is deprecated. -For 4.1.x+, keep using the FE config method below, and refer to: -[Unified External Meta Cache (4.1.x+)](./meta-cache/unified-meta-cache.md). +For Doris 4.1.x+, prefer the unified per-catalog property `meta.cache.<engine>.schema.ttl-second = "0"`. +See [Unified External Meta Cache (4.1.x+)](./meta-cache/unified-meta-cache.md). ::: - Disable globally @@ -346,11 +345,18 @@ For 4.1.x+, keep using the FE config method below, and refer to: max_external_schema_cache_num=0 // Disable Schema cache. ``` -- Disable at Catalog level +- Disable at Catalog level in Doris 4.1.x+ + + ```text + -- Catalog property + "meta.cache.<engine>.schema.ttl-second" = "0" + ``` + +- Legacy catalog-level property ```text -- Catalog property - "schema.cache.ttl-second" = "0" // For a specific Catalog, disable Schema cache (supported in 2.1.11, 3.0.6) + "schema.cache.ttl-second" = "0" // Legacy property, supported in 2.1.11 / 3.0.6 ``` After setting, Doris will see the latest Table Schema in real time. 
However, thi For Hive Catalog, if you want to disable the cache to query real-time updated data, you can configure the following parameters: :::note -Starting from Doris 4.1.x, the legacy catalog-level properties `file.meta.cache.ttl-second` and `partition.cache.ttl-second` -are deprecated. Use unified `meta.cache.hive.*` properties instead. See: +For Doris 4.1.x+, prefer unified `meta.cache.hive.*` properties. See: [Hive Catalog](./catalogs/hive-catalog.mdx#meta-cache-unified) and [Unified External Meta Cache (4.1.x+)](./meta-cache/unified-meta-cache.md). ::: @@ -372,19 +377,30 @@ are deprecated. Use unified `meta.cache.hive.*` properties instead. See: -- fe.conf max_external_file_cache_num=0 // Disable file list cache max_hive_partition_table_cache_num=0 // Disable partition list cache + max_hive_partition_cache_num=0 // Disable partition property cache + ``` + +- Disable at Catalog level in Doris 4.1.x+ + + ```text + -- Catalog property + "meta.cache.hive.partition_values.ttl-second" = "0" // Disable partition list cache + "meta.cache.hive.partition.ttl-second" = "0" // Disable partition property cache + "meta.cache.hive.file.ttl-second" = "0" // Disable file list cache ``` -- Disable at Catalog level +- Legacy catalog-level properties ```text -- Catalog property - "file.meta.cache.ttl-second" = "0" // For a specific Catalog, disable file list cache - "partition.cache.ttl-second" = "0" // For a specific Catalog, disable partition list cache (supported in 2.1.11, 3.0.6) + "file.meta.cache.ttl-second" = "0" // Disable file list cache + "partition.cache.ttl-second" = "0" // Disable partition list cache (supported in 2.1.11 / 3.0.6) ``` After setting the above parameters: - New partitions in the external data source can be queried in real time. - Changes in partition data files can be queried in real time. +- Changes in partition properties require disabling the partition property cache as well. 
But this will increase the access pressure on external data sources (such as Hive Metastore and HDFS), which may cause unstable metadata access latency and other phenomena. diff --git a/docs/lakehouse/meta-cache/unified-meta-cache.md b/docs/lakehouse/meta-cache/unified-meta-cache.md index 770ec24fb4198..97a9e7c4ee536 100644 --- a/docs/lakehouse/meta-cache/unified-meta-cache.md +++ b/docs/lakehouse/meta-cache/unified-meta-cache.md @@ -6,82 +6,78 @@ } --- -Starting from **Doris 4.1.x**, external metadata caching is unified for major External Catalog engines. As a user, you only need to know: +Starting from **Doris 4.1.x**, external metadata caching is unified for major External Catalog engines. The unified cache standardizes configuration models and monitoring metrics across different data lake engines (like Hive, Iceberg, etc.), reducing the configuration threshold and troubleshooting difficulty for multi-source data management. -| You want to know | Where in docs | -|---|---| -| Where to configure | Catalog `PROPERTIES` with `meta.cache.*` keys (see the catalog pages linked below). | -| What it affects | Depends on catalog engine (partitions, file listing, table metadata, manifests, etc.). | -| How to observe | `information_schema.catalog_meta_cache_statistics` (see the observability section below). | +As a user, you mainly need to care about three things: + +- **What it affects:** Depends on the catalog engine (partitions, file listing, table metadata, manifests, etc.). +- **Where to configure:** Catalog `PROPERTIES` with unified `meta.cache.*` keys (see the catalog pages linked below). +- **How to observe:** `information_schema.catalog_meta_cache_statistics` system table (see the observability section below). :::tip Applies to Doris 4.1.x and later. ::: +## What External Meta Cache Includes + +Before configuring, it's important to understand what is actually being cached. 
There are two layers of metadata caching that are easy to confuse: + +- **Catalog object/name caches:** `SHOW DATABASES`, `SHOW TABLES`, database objects, table objects, and related generic caches described in [Metadata Cache](../meta-cache.md). +- **Engine entry caches:** Engine-specific runtime metadata such as Hive partitions/files, Iceberg manifests, Paimon table handles, and schema entries. This page focuses on this layer. + +External meta cache entries cover different kinds of metadata. Some are configured by unified catalog properties, and some also inherit FE-level defaults: + +| Category | Examples | How to configure | +|---|---|---| +| Engine entry caches | Hive `partition_values` / `partition` / `file`, Iceberg `manifest`, Paimon `table`, etc. | Catalog `PROPERTIES`: `meta.cache.<engine>.<entry>.*` | +| Schema cache | Per-engine `schema` entry, isolated by schema version token | FE configs provide defaults; catalog `meta.cache.<engine>.schema.*` can override them | + ## Unified Property Model All engine cache entries share the same property key pattern: -`meta.cache...{enable,ttl-second,capacity}` +`meta.cache.<engine>.<entry>.{enable,ttl-second,capacity}` The following table describes the property semantics: | Property | Example | Meaning | |---|---|---| -| `enable` | `true/false` | Whether this cache module is enabled. | -| `ttl-second` | `600`, `0`, `-1` | `0` disables the module; `-1` means no expiration; otherwise expire after access by TTL. | -| `capacity` | `10000` | Max entry count (count-based). `0` disables the module. | +| `enable` | `true/false` | Whether this cache entry is enabled. | +| `ttl-second` | `600`, `0`, `-1` | `0` disables the entry; `-1` means no expiration; any other value is the TTL in seconds, counted from the last access. | +| `capacity` | `10000` | Max entry count (count-based). `0` disables the entry. 
| +**Note on Effective State:** +A cache entry is effective only when all three conditions hold: `enable=true`, `ttl-second` is greater than `0` or equal to `-1`, and `capacity > 0`. This state is reported as `EFFECTIVE_ENABLED = true` in the observability table. Notes: -- `` uses the cache entry name shown in the catalog documentation and the stats table, for example `partition_values`, `fs_view`, `meta_client`. +- `<entry>` uses the cache entry name shown in the catalog documentation and the stats table, for example `partition_values`, `fs_view`, `meta_client`. - There is currently no per-entry refresh interval property. Async refresh behavior still uses the FE config `external_cache_refresh_time_minutes`. Example: ```sql ALTER CATALOG hive_ctl SET PROPERTIES ( + -- Set the TTL of the Hive file cache to 0 to disable this cache entry (for Hive catalogs, the change takes effect after the catalog is recreated or FE is restarted) "meta.cache.hive.file.ttl-second" = "0" ); ``` -## What External Meta Cache Includes - -There are two layers of metadata caching that are easy to confuse: - -- Catalog object/name caches: `SHOW DATABASES`, `SHOW TABLES`, database objects, table objects, and related generic caches described in [Metadata Cache](../meta-cache.md). -- Engine entry caches: engine-specific runtime metadata such as Hive partitions/files, Iceberg manifests, Paimon table handles, and schema entries. This page focuses on this layer. - -External meta cache entries cover different kinds of metadata. Some are configured by unified catalog properties, and some also inherit FE-level defaults: - -| Category | Examples | How to configure | -|---|---|---| -| Engine entry caches | Hive `partition_values` / `partition` / `file`, Iceberg `manifest`, Paimon `table`, etc. 
| Catalog `PROPERTIES`: `meta.cache...*` | -| Schema cache | Per-engine `schema` entry, isolated by schema version token | FE configs provide defaults; catalog `meta.cache..schema.*` can override them | - -## Support Matrix +## Supported Engines and Configurations -The following table summarizes the current implementation: +The following table summarizes the current implementation and links to catalog-specific configuration pages: -| Engine | Entries you will see in stats | Property key prefix | `ALTER CATALOG ... SET PROPERTIES` hot-reload | +| Catalog Engine | Entries you will see in stats (``) | `ALTER CATALOG ... SET PROPERTIES` hot-reload | Detailed Configuration Guide | |---|---|---|---| -| Hive | `schema`, `partition_values`, `partition`, `file` | `meta.cache.hive..*` | Changes to `meta.cache.hive.*` are not applied through the unified hot-reload path; recreate the catalog or restart FE to apply new specs | -| Iceberg | `schema`, `table`, `view`, `manifest` | `meta.cache.iceberg..*` | Supported | -| Paimon | `schema`, `table` | `meta.cache.paimon..*` | Supported | -| Hudi | `schema`, `partition`, `fs_view`, `meta_client` | `meta.cache.hudi..*` | Supported through HMS catalog property updates | -| MaxCompute | `schema`, `partition_values` | `meta.cache.maxcompute..*` | No dedicated hot-reload hook | - -## Catalog-Specific Configuration (Links) - -For each catalog engine, the supported cache modules and the recommended properties are documented in its catalog page: - -| Catalog engine | Where to configure module caches | -|---|---| -| Hive | [Hive Catalog](../catalogs/hive-catalog.mdx#meta-cache-unified) | -| Iceberg | [Iceberg Catalog](../catalogs/iceberg-catalog.mdx#meta-cache-unified) | -| Paimon | [Paimon Catalog](../catalogs/paimon-catalog.mdx#meta-cache-unified) | -| Hudi | [Hudi Catalog](../catalogs/hudi-catalog.md#meta-cache-unified) | -| MaxCompute | [MaxCompute Catalog](../catalogs/maxcompute-catalog.md#meta-cache-unified) | +| Hive | `schema`, 
`partition_values`, `partition`, `file` | Changes to `meta.cache.hive.*` are not applied through the unified hot-reload path; recreate the catalog or restart FE to apply new specs | [Hive Catalog](../catalogs/hive-catalog.mdx#meta-cache-unified) | +| Iceberg | `schema`, `table`, `view`, `manifest` | Supported | [Iceberg Catalog](../catalogs/iceberg-catalog.mdx#meta-cache-unified) | +| Paimon | `schema`, `table` | Supported | [Paimon Catalog](../catalogs/paimon-catalog.mdx#meta-cache-unified) | +| Hudi | `schema`, `partition`, `fs_view`, `meta_client` | Supported through HMS catalog property updates | [Hudi Catalog](../catalogs/hudi-catalog.md#meta-cache-unified) | +| MaxCompute | `schema`, `partition_values` | No dedicated hot-reload hook | [MaxCompute Catalog](../catalogs/maxcompute-catalog.md#meta-cache-unified) | + +:::caution +For **Hive Catalogs**, changes to `meta.cache.hive.*` properties via `ALTER CATALOG` do **not** take effect dynamically. You must recreate the catalog or restart the Frontend (FE) node to apply the new configurations. +::: ## Observability @@ -104,6 +100,8 @@ Read the table as follows: | `ENGINE_NAME` | Cache engine, such as `hive` or `iceberg` | | `ENTRY_NAME` | Exact entry name used by that engine, such as `partition_values`, `fs_view`, `manifest` | | `EFFECTIVE_ENABLED` | Final enable state after evaluating `enable`, `ttl-second`, and `capacity` | +| `LOAD_FAILURE_COUNT` | Number of failed loads from external systems. Useful for identifying upstream metadata service issues. | +| `LAST_ERROR` | The exception message of the last failed load. Highly valuable for troubleshooting timeout or connection errors with HMS, S3, etc. | Common queries filter by `catalog_name` and `engine_name`. This table no longer uses the old `cache_name` / `metric_name` pivoted model. 
diff --git a/docs/sql-manual/sql-statements/catalog/REFRESH.md b/docs/sql-manual/sql-statements/catalog/REFRESH.md index 7c7b520211363..fcf2c613560bc 100644 --- a/docs/sql-manual/sql-statements/catalog/REFRESH.md +++ b/docs/sql-manual/sql-statements/catalog/REFRESH.md @@ -13,7 +13,8 @@ This statement refreshes the metadata of the specified Catalog/Database/Table. ## Syntax ```sql -REFRESH CATALOG <catalog_name>; +REFRESH CATALOG <catalog_name> + [PROPERTIES ("invalid_cache" = "true" | "false")]; REFRESH DATABASE [<catalog_name>.]<database_name>; REFRESH TABLE [[<catalog_name>.]<database_name>.]<table_name>; ``` @@ -41,6 +42,13 @@ The name of the table within the catalog that needs to be refreshed. ## Usage Notes When the Catalog is refreshed, the object-related Cache is forced to be invalidated. Including Partition Cache, Schema Cache, File Cache, etc. +`invalid_cache` controls whether lower-level metadata caches are invalidated during `REFRESH CATALOG`: + +- `true`: invalidate catalog object caches and lower-level caches such as partition, schema, and file caches. This is the default behavior. +- `false`: refresh only catalog-level object/name metadata and keep lower-level caches. + +`invalid_cache` currently applies to `REFRESH CATALOG`. + ## Example 1. Refresh hive catalog @@ -49,14 +57,20 @@ When the Catalog is refreshed, the object-related Cache is forced to be invalida REFRESH CATALOG hive; ``` -2. Refresh database1 +2. Refresh hive catalog without invalidating lower-level caches + + ```sql + REFRESH CATALOG hive PROPERTIES("invalid_cache" = "false"); + ``` + +3. Refresh database1 ```sql REFRESH DATABASE ctl.database1; REFRESH DATABASE database1; ``` -3. Refresh table1 +4. 
Refresh table1 ```sql REFRESH TABLE ctl.db.table1; @@ -64,4 +78,3 @@ When the Catalog is refreshed, the object-related Cache is forced to be invalida REFRESH TABLE table1; ``` - diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/meta-cache.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/meta-cache.md index 51454fd769c6e..506f5f3031833 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/meta-cache.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/meta-cache.md @@ -30,6 +30,9 @@ 统一文档主要描述第二层。 ::: +本文主体主要记录 2.1.x / 3.x 旧缓存模型中的 FE 默认值与兼容参数。 +对于 Doris 4.1.x+ 的当前引擎级 cache entry,请直接阅读统一页和各 Catalog 文档。 + ## 缓存策略 大多数缓存都有如下三个策略指标: @@ -64,15 +67,27 @@ ## 缓存类型 +下面的内容主要描述代表性的 FE 默认值与旧模型兼容参数,不应理解为 Doris 4.1.x+ 的完整 cache entry 列表。 + +| 类别 | 作用域 | 主要 FE 默认值 | 说明 | +|---|---|---|---| +| 库 / 表名称列表 | 每个 catalog / 每个 database | `external_cache_expire_time_seconds_after_access`、`external_cache_refresh_time_minutes` | 用于 `SHOW DATABASES` / `SHOW TABLES` | +| 库 / 表对象 | 每个 catalog / 每个 database | `max_meta_object_cache_num`、`external_cache_expire_time_seconds_after_access`、`external_cache_refresh_time_minutes` | 对象缓存与名称列表缓存可能短暂不一致 | +| 表 schema | 每个 catalog | `max_external_schema_cache_num`、`external_cache_expire_time_seconds_after_access`、`external_cache_refresh_time_minutes` | 旧的 catalog 级兼容参数:`schema.cache.ttl-second` | +| Hive 分区值 | 每个 Hive catalog | `max_hive_partition_table_cache_num`、`external_cache_expire_time_seconds_after_access`、`external_cache_refresh_time_minutes` | 旧的 catalog 级兼容参数:`partition.cache.ttl-second` | +| Hive 分区属性 | 每个 Hive catalog | `max_hive_partition_cache_num`、`external_cache_expire_time_seconds_after_access` | 没有旧的 catalog 级 TTL 覆盖参数 | +| Hive 文件列表 | 每个 Hive catalog | `max_external_file_cache_num`、`external_cache_expire_time_seconds_after_access`、`external_cache_refresh_time_minutes` | 旧的 catalog 级兼容参数:`file.meta.cache.ttl-second` | +| Hudi / Iceberg / Paimon 旧表级元数据 | 每个 catalog | 
`max_external_table_cache_num`、`external_cache_expire_time_seconds_after_access`、`external_cache_refresh_time_minutes` | Doris 4.1.x+ 下的 `fs_view`、`meta_client`、`view`、`manifest` 等请看对应 Catalog 页 | + ### 库、表名称列表 库名称列表(Database name list)指的是一个 Catalog 下所有库的名称的列表。 表名称列表(Table name list)指的是一个库下所有表的名称列表。 -名称列表仅用于需要列举名称得操作,如 `SHOW TABLES` 或 `SHOW DATABASES` 语句。 +名称列表仅用于需要列举名称的操作,如 `SHOW TABLES` 或 `SHOW DATABASES` 语句。 -每个 Catalog 下都一个库名称列表缓存。每个库下都有一个表名称列表缓存。 +每个 Catalog 下都有一个库名称列表缓存。每个库下都有一个表名称列表缓存。 - 最大缓存数量 @@ -96,7 +111,7 @@ 比如通过 `SHOW TABLES` 命令,从名称列表缓存中获取到 `A`、`B`、`C` 三个表。假设此时外部数据源增加了表 `D`,那么 `SELECT * FROM D` 可以访问到表 `D`,同时【表对象】缓存里会增加表 `D` 对象,但【表名称列表】缓存中可能依然是 `A`、`B`、`C`。只有当【表名称列表】缓存刷新后,才会变成 `A`、`B`、`C`、`D`。 -每个 Catalog 下都一个库名称列表缓存。每个库下都有一个表名称列表缓存。 +每个 Catalog 下都有一个库对象缓存。每个库下都有一个表对象缓存。 - 最大缓存数量 @@ -108,7 +123,7 @@ - 最短刷新时间 - 由 FE 配置项 `external_cache_expire_time_minutes_after_access` 控制。单位为分钟。默认 10 分钟。减少该时间,可以更实时的在 Doris 中到最新的库或表,但会增加访问外部数据源的频率。 + 由 FE 配置项 `external_cache_expire_time_minutes_after_access` 控制。单位为分钟。默认 10 分钟。减少该时间,可以更实时地在 Doris 中看到最新的库或表,但会增加访问外部数据源的频率。 3.0.7 版本后,配置项名称修改为 `external_cache_refresh_time_minutes`。默认值不变。 @@ -116,7 +131,7 @@ 缓存表的 Schema 信息,如列名等。该缓存主要用于按需加载被访问到的表的 Schema,以防止同步大量不需要被访问的表的 Schema 而占用 FE 的内存。 -该缓存由所有 Catalog 共享,全局唯一。 +该缓存按 catalog 维度管理。 - 最大缓存数量 @@ -184,7 +199,7 @@ - 最大缓存数量 - 由 FE 配置项 `max_external_file_cache_num` 控制,默认为 100000。 + 由 FE 配置项 `max_external_file_cache_num` 控制,默认为 10000。 可以根据所需要访问的文件数量,适当调整这个参数。 @@ -198,13 +213,14 @@ - 最短刷新时间 - 由 FE 配置项 `external_cache_expire_time_minutes_after_access` 控制。单位为分钟。默认 10 分钟。减少该时间,可以更实时的在 Doris 中访问到最新的分区属性,但会增加访问外部数据源的频率。 + 由 FE 配置项 `external_cache_expire_time_minutes_after_access` 控制。单位为分钟。默认 10 分钟。减少该时间,可以更实时地在 Doris 中访问到最新的文件列表,但会增加访问外部数据源的频率。 3.0.7 版本后,配置项名称修改为 `external_cache_refresh_time_minutes`。默认值不变。 ### Hudi 表分区 -用于缓存 Hudi 表的分区信息。 +这里描述的是 Hudi 分区元数据缓存的旧模型摘要。 +对于 Doris 4.1.x+ 的当前 Hudi cache entry(如 `fs_view`、`meta_client`),请参阅 [Hudi 
Catalog](./catalogs/hudi-catalog.md#meta-cache-unified)。 该缓存,每个 Hudi Catalog 有一个。 @@ -226,7 +242,8 @@ ### Iceberg 表信息 -用于缓存 Iceberg 表对象。该对象通过 Iceberg API 加载并构建。 +这里描述的是 Iceberg 表元数据缓存的旧模型摘要。表对象通过 Iceberg API 加载并构建。 +对于 Doris 4.1.x+ 的当前可观测 cache entry,请参阅 [Iceberg Catalog](./catalogs/iceberg-catalog.mdx#meta-cache-unified)。 该缓存,每个 Iceberg Catalog 有一个。 @@ -246,10 +263,10 @@ 3.0.7 版本后,配置项名称修改为 `external_cache_refresh_time_minutes`。默认值不变。 -### Iceberg 表 Snapshot +### Iceberg Snapshot 相关元数据 -用于缓存 Iceberg 表的 Snapshot 列表。该对象通过 Iceberg API 加载并构建。 -该缓存,每个 Iceberg Catalog 有一个。 +这里描述的是从 Iceberg 表元数据派生出的 snapshot 相关缓存行为。 +在当前实现里,不应将它理解为 Doris 4.1.x 下和 `table`、`view`、`manifest` 并列的独立 cache entry。 - 最大缓存数量 @@ -269,37 +286,19 @@ ## 缓存刷新 -除了上述每个缓存各自的刷新和淘汰策略外,用户也可以通过手动或定时的方式直接刷新元数据缓存。 +除了上述刷新和淘汰策略外,用户也可以通过手动或定时方式刷新元数据。 ### 手动刷新 -用户可以通过 `REFRESH` 命令手动刷新元数据。 - -1. REFRESH CATALOG - - 刷新指定 Catalog。 - - `REFRESH CATALOG ctl1 PROPERTIES("invalid_cache" = "true");` - - - 该命令会刷新指定 Catalog 的库列表,表列名以及所有缓存信息等。 - - `invalid_cache` 表示是否要刷新分区和文件列表等缓存。默认为 true。如果为 false,则只会刷新 Catalog 的库、表列表,而不会刷新分区和文件列表等缓存信息。该参数适用于,用户只想同步新增删的库表信息时,可以设置为 false。 - -2. REFRESH DATABASE - - 刷新指定 Database。 - - `REFRESH DATABASE [ctl.]db1 PROPERTIES("invalid_cache" = "true");` - - - 该命令会刷新指定 Database 的表列名以及 Database 下的所有缓存信息等。 - - `invalid_cache` 属性含义同上。默认为 true。如果为 false,则只会刷新 Database 的表列表,而不会刷新缓存信息。该参数适用于,用户只想同步新增删的表信息时。 +使用 `REFRESH` 语句可以失效 catalog、database 或 table 级元数据。 +当前语法、权限要求与示例请参阅 [REFRESH](../sql-manual/sql-statements/catalog/REFRESH.md)。 -3. 
REFRESH TABLE +行为摘要: - 刷新指定 Table。 - - `REFRESH TABLE [ctl.][db.]tbl1;` - - - 该命令会刷新指定 Table 下的所有缓存信息等。 +- `REFRESH CATALOG` 会刷新 catalog 级对象缓存,并默认继续失效更细粒度的元数据缓存。 +- `REFRESH DATABASE` 会刷新一个 database 下的元数据。 +- `REFRESH TABLE` 会刷新单表元数据。 +- 对 `REFRESH CATALOG`,若设置 `invalid_cache = false`,则只刷新对象/名称列表,不继续失效更细粒度缓存。 ### 定时刷新 @@ -315,7 +314,7 @@ CREATE CATALOG hive PROPERTIES ( 在上例中,`metadata_refresh_interval_sec` 表示每 3600 秒刷新一次 Catalog。相当于每隔 3600 秒,自动执行一次: -`REFRESH CATALOG ctl1 PROPERTIES("invalid_cache" = "true");` +`REFRESH CATALOG ctl1;` ## 最佳实践 @@ -334,8 +333,8 @@ CREATE CATALOG hive PROPERTIES ( 对于所有类型的 External Catalog,如果希望实时可见最新的 Table Schema,可以关闭 Schema Cache: :::note -从 Doris 4.1.x 开始,旧的 catalog 级缓存参数 `schema.cache.ttl-second` 已不再推荐使用。 -对于 4.1.x+,仍可使用下面的 FE 配置方式进行全局控制,并参考: +对于 Doris 4.1.x+,推荐使用统一键 `meta.cache..schema.ttl-second = "0"`。 +详细说明请参阅: [统一外表元数据缓存(4.1.x+)](./meta-cache/unified-meta-cache.md)。 ::: @@ -346,11 +345,18 @@ CREATE CATALOG hive PROPERTIES ( max_external_schema_cache_num=0 // 关闭 Schema 缓存。 ``` -- Catalog 级别关闭 +- Doris 4.1.x+ 的 catalog 级关闭方式 + + ```text + -- Catalog property + "meta.cache..schema.ttl-second" = "0" + ``` + +- 旧的 catalog 级兼容参数 ```text -- Catalog property - "schema.cache.ttl-second" = "0" // 针对某个 Catalog,关闭 Schema 缓存(2.1.11, 3.0.6 支持) + "schema.cache.ttl-second" = "0" // 旧参数,2.1.11 / 3.0.6 支持 ``` 设置完成后,Doris 会实时可见最新的 Table Schema。但此设置可能会增加元数据服务的压力。 @@ -360,8 +366,7 @@ CREATE CATALOG hive PROPERTIES ( 针对 Hive Catalog,如果想关闭缓存来查询到实时更新的数据,可以配置以下参数: :::note -从 Doris 4.1.x 开始,旧的 catalog 级参数 `file.meta.cache.ttl-second` 和 `partition.cache.ttl-second` -已不再推荐使用。请改用统一键 `meta.cache.hive.*`,并参考: +对于 Doris 4.1.x+,推荐优先使用统一键 `meta.cache.hive.*`,并参考: [Hive Catalog](./catalogs/hive-catalog.mdx#meta-cache-unified) 与 [统一外表元数据缓存(4.1.x+)](./meta-cache/unified-meta-cache.md)。 ::: @@ -372,19 +377,30 @@ CREATE CATALOG hive PROPERTIES ( -- fe.conf max_external_file_cache_num=0 // 关闭文件列表缓存 max_hive_partition_table_cache_num=0 // 关闭分区列表缓存 + 
max_hive_partition_cache_num=0 // 关闭分区属性缓存 + ``` + +- Doris 4.1.x+ 的 catalog 级关闭方式 + + ```text + -- Catalog property + "meta.cache.hive.partition_values.ttl-second" = "0" // 关闭分区列表缓存 + "meta.cache.hive.partition.ttl-second" = "0" // 关闭分区属性缓存 + "meta.cache.hive.file.ttl-second" = "0" // 关闭文件列表缓存 ``` -- Catalog 级别关闭 +- 旧的 catalog 级兼容参数 ```text -- Catalog property - "file.meta.cache.ttl-second" = "0" // 针对某个 Catalog,关闭文件列表缓存 - "partition.cache.ttl-second" = "0" // 针对某个 Catalog,关闭分区列表缓存(2.1.11, 3.0.6 支持) + "file.meta.cache.ttl-second" = "0" // 关闭文件列表缓存 + "partition.cache.ttl-second" = "0" // 关闭分区列表缓存(2.1.11 / 3.0.6 支持) ``` 设置以上参数后: - 外部数据源新增分区可以实时查询到。 - 分区数据文件变动可以实时查询到。 +- 如果希望实时看到分区属性变化,也需要同时关闭分区属性缓存。 但会增加外部源数据(如 Hive Metastore 和 HDFS)的访问压力,可能导致元数据访问延迟不稳定等现象。 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/meta-cache/unified-meta-cache.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/meta-cache/unified-meta-cache.md index a7cf6c755cebf..a377924c48143 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/meta-cache/unified-meta-cache.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/meta-cache/unified-meta-cache.md @@ -6,82 +6,78 @@ } --- -从 **Doris 4.1.x** 开始,External Catalog 的外表元数据缓存能力进行了统一化重构。对用户来说,主要关注三件事: +从 **Doris 4.1.x** 开始,External Catalog 的外表元数据缓存能力进行了统一化重构。统一外表元数据缓存标准化了不同数据湖引擎(如 Hive, Iceberg 等)的缓存配置模型与监控体系,降低了多数据源管理的配置门槛和排障难度。 -| 你需要关心的问题 | 对应入口 | -|---|---| -| 在哪里配置 | 在 Catalog `PROPERTIES` 里使用统一键 `meta.cache.*`(具体 module 见下方各 catalog 文档)。 | -| 影响哪些内容 | 取决于不同 catalog 引擎(分区信息、文件列表、表元数据、manifest 等)。 | -| 如何观测 | 通过 `information_schema.catalog_meta_cache_statistics` 查看指标(见本文观测章节)。 | +对用户来说,主要关注三件事: + +- **影响哪些内容**:取决于不同 catalog 引擎(分区信息、文件列表、表元数据、manifest 等)。 +- **在哪里配置**:在 Catalog `PROPERTIES` 里使用统一键 `meta.cache.*`(具体 entry 见下方各 catalog 文档)。 +- **如何观测**:通过 `information_schema.catalog_meta_cache_statistics` 系统表查看指标(见本文观测章节)。 :::tip 适用于 Doris 4.1.x 及之后版本。 ::: +## 外表 Meta Cache 覆盖范围 + 
+在学习如何配置之前,首先需要明确该缓存覆盖的范围。这里有两层缓存比较容易混淆: + +- **Catalog 对象/名称缓存**:如 `SHOW DATABASES`、`SHOW TABLES`、库对象、表对象等,这部分属于通用缓存,详情见 [元数据缓存](../meta-cache.md)。 +- **引擎 entry 缓存**:各引擎特有的运行时元数据,如 Hive 分区/文件、Iceberg manifest、Paimon table handle、schema entry 等。**本文主要针对这一层进行说明。** + +外表元数据 cache entry 覆盖多种元数据类型。其中一部分由统一 `meta.cache.*` 键配置,另一部分同时继承 FE 级默认值: + +| 类别 | 示例 | 配置方式 | +|---|---|---| +| 引擎 entry 缓存 | Hive `partition_values` / `partition` / `file`、Iceberg `manifest`、Paimon `table` 等 | Catalog `PROPERTIES`:`meta.cache...*` | +| Schema cache | 各引擎自己的 `schema` entry,按 schema version token 隔离 | FE 配置提供默认值,Catalog `meta.cache..schema.*` 可覆盖 | + ## 统一属性模型 各引擎 cache entry 使用统一的配置键格式: -`meta.cache...{enable,ttl-second,capacity}` +`meta.cache...{enable,ttl-second,capacity}` 下表说明属性语义: | 属性 | 示例 | 含义 | |---|---|---| -| `enable` | `true/false` | 是否启用该缓存 module。 | +| `enable` | `true/false` | 是否启用该缓存 entry。 | | `ttl-second` | `600`、`0`、`-1` | `0` 表示关闭;`-1` 表示永不过期;其他值表示按访问时间计算 TTL。 | | `capacity` | `10000` | 最大缓存条目数(按条目数量计)。`0` 表示关闭。 | +**生效逻辑说明:** +只有当 `enable=true` 且 `ttl-second > 0`(或为 -1)且 `capacity > 0` 时,该模块缓存才会真正生效(对应观测表中的 `EFFECTIVE_ENABLED = true`)。 + 说明: -- `` 使用 catalog 文档和 stats 表中展示的 cache entry 名,例如 `partition_values`、`fs_view`、`meta_client`。 +- `` 使用 catalog 文档和 stats 表中展示的 cache entry 名,例如 `partition_values`、`fs_view`、`meta_client`。 - 当前没有 per-entry 的刷新周期参数。异步刷新周期仍由 FE 配置 `external_cache_refresh_time_minutes` 统一控制。 示例: ```sql ALTER CATALOG hive_ctl SET PROPERTIES ( + -- 将 Hive 的文件列表缓存 TTL 设置为 0,即刻关闭该缓存 "meta.cache.hive.file.ttl-second" = "0" ); ``` -## 外表 Meta Cache 覆盖范围 - -这里有两层缓存,比较容易混淆: - -- Catalog 对象/名称缓存:如 `SHOW DATABASES`、`SHOW TABLES`、库对象、表对象等,见 [元数据缓存](../meta-cache.md)。 -- 引擎 entry 缓存:如 Hive 分区/文件、Iceberg manifest、Paimon table handle、schema entry 等。本文主要讲这一层。 - -外表元数据 cache entry 覆盖多种元数据类型。其中一部分由统一 `meta.cache.*` 键配置,另一部分同时继承 FE 级默认值: - -| 类别 | 示例 | 配置方式 | -|---|---|---| -| 引擎 entry 缓存 | Hive `partition_values` / `partition` / `file`、Iceberg `manifest`、Paimon 
`table` 等 | Catalog `PROPERTIES`:`meta.cache...*` | -| Schema cache | 各引擎自己的 `schema` entry,按 schema version token 隔离 | FE 配置提供默认值,Catalog `meta.cache..schema.*` 可覆盖 | - -## 支持矩阵 +## 支持矩阵与配置入口 -下面的表总结了当前实现状态: +下面的表总结了各引擎支持的缓存项,以及是否支持热生效,并提供了具体配置说明的链接: -| 引擎 | 在 stats 表里能看到的 entry | 属性键前缀 | `ALTER CATALOG ... SET PROPERTIES` 热生效 | +| Catalog 引擎 | 在 stats 表里能看到的 entry (``) | `ALTER CATALOG ... SET PROPERTIES` 热生效 | 详细配置说明链接 | |---|---|---|---| -| Hive | `schema`、`partition_values`、`partition`、`file` | `meta.cache.hive..*` | `meta.cache.hive.*` 的变更不会通过统一热生效路径应用;需重建 catalog 或重启 FE 后生效 | -| Iceberg | `schema`、`table`、`view`、`manifest` | `meta.cache.iceberg..*` | 支持 | -| Paimon | `schema`、`table` | `meta.cache.paimon..*` | 支持 | -| Hudi | `schema`、`partition`、`fs_view`、`meta_client` | `meta.cache.hudi..*` | 支持,通过 HMS catalog 属性更新路径生效 | -| MaxCompute | `schema`、`partition_values` | `meta.cache.maxcompute..*` | 没有专门的热生效 hook | - -## 各类 Catalog 的配置入口(链接) - -不同 Catalog 引擎支持的缓存 module 不同,具体 module、推荐配置与可观测性请参考对应 Catalog 文档: - -| Catalog 引擎 | module 缓存配置与可观测性 | -|---|---| -| Hive | [Hive Catalog](../catalogs/hive-catalog.mdx#meta-cache-unified) | -| Iceberg | [Iceberg Catalog](../catalogs/iceberg-catalog.mdx#meta-cache-unified) | -| Paimon | [Paimon Catalog](../catalogs/paimon-catalog.mdx#meta-cache-unified) | -| Hudi | [Hudi Catalog](../catalogs/hudi-catalog.md#meta-cache-unified) | -| MaxCompute | [MaxCompute Catalog](../catalogs/maxcompute-catalog.md#meta-cache-unified) | +| Hive | `schema`、`partition_values`、`partition`、`file` | `meta.cache.hive.*` 的变更不会通过统一热生效路径应用;需重建 catalog 或重启 FE 后生效 | [Hive Catalog](../catalogs/hive-catalog.mdx#meta-cache-unified) | +| Iceberg | `schema`、`table`、`view`、`manifest` | 支持 | [Iceberg Catalog](../catalogs/iceberg-catalog.mdx#meta-cache-unified) | +| Paimon | `schema`、`table` | 支持 | [Paimon Catalog](../catalogs/paimon-catalog.mdx#meta-cache-unified) | +| Hudi | `schema`、`partition`、`fs_view`、`meta_client` | 支持,通过 HMS catalog 属性更新路径生效 | [Hudi 
Catalog](../catalogs/hudi-catalog.md#meta-cache-unified) | +| MaxCompute | `schema`、`partition_values` | 没有专门的热生效 hook | [MaxCompute Catalog](../catalogs/maxcompute-catalog.md#meta-cache-unified) | + +:::caution +**Hive Catalog 注意事项**:Hive 的 `meta.cache.hive.*` 属性修改**不支持热生效**。修改配置后,必须重建 Catalog 或重启 FE 节点才能应用新的缓存配置。 +::: ## 观测方式 @@ -104,6 +100,8 @@ ORDER BY catalog_name, engine_name, entry_name; | `ENGINE_NAME` | 缓存引擎,如 `hive`、`iceberg` | | `ENTRY_NAME` | 该引擎下的精确 entry 名,如 `partition_values`、`fs_view`、`manifest` | | `EFFECTIVE_ENABLED` | 综合 `enable`、`ttl-second`、`capacity` 后最终是否生效 | +| `LOAD_FAILURE_COUNT` | 从外部系统加载数据的失败次数。当查询变慢或报错时,可优先查看此字段排查上游系统异常。 | +| `LAST_ERROR` | 最后一次加载失败的错误信息。对排查 HMS、S3 等超时或连接异常极其有用。 | 常见查询方式是按 `catalog_name` 和 `engine_name` 过滤。该系统表不再使用旧的 `cache_name` / `metric_name` 透视模型。 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/sql-manual/sql-statements/catalog/REFRESH.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/sql-manual/sql-statements/catalog/REFRESH.md index 47e18785778bb..dfd23011ff3b1 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/sql-manual/sql-statements/catalog/REFRESH.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/sql-manual/sql-statements/catalog/REFRESH.md @@ -13,7 +13,8 @@ ## 语法 ```sql -REFRESH CATALOG ; +REFRESH CATALOG + [PROPERTIES ("invalid_cache" = "true" | "false")]; REFRESH DATABASE [.]; REFRESH TABLE [[.].]; ``` @@ -42,6 +43,13 @@ REFRESH TABLE [[.].]; ## 注意事项 刷新 Catalog 的同时,会强制使对象相关的 Cache 失效。包括 Partition Cache、Schema Cache、File Cache 等。 +`invalid_cache` 用于控制 `REFRESH CATALOG` 时是否继续失效更细粒度的元数据缓存: + +- `true`:失效 catalog 对象缓存,以及分区、schema、文件列表等更细粒度缓存。这也是默认行为。 +- `false`:只刷新 catalog 级对象/名称元数据,保留更细粒度缓存。 + +`invalid_cache` 当前用于 `REFRESH CATALOG`。 + ## 示例 1. 刷新 hive catalog @@ -50,14 +58,20 @@ REFRESH TABLE [[.].]; REFRESH CATALOG hive; ``` -2. 刷新 database1 +2. 刷新 hive catalog,但不失效更细粒度缓存 + + ```sql + REFRESH CATALOG hive PROPERTIES("invalid_cache" = "false"); + ``` + +3. 
刷新 database1 ```sql REFRESH DATABASE ctl.database1; REFRESH DATABASE database1; ``` -3. 刷新 table1 +4. 刷新 table1 ```sql REFRESH TABLE ctl.db.table1; @@ -65,4 +79,3 @@ REFRESH TABLE [[.].]; REFRESH TABLE table1; ``` - From 31d32cfc3613460c087a0c5b2de0d11262481127 Mon Sep 17 00:00:00 2001 From: Socrates Date: Thu, 12 Mar 2026 10:40:15 +0800 Subject: [PATCH 5/8] Refine lakehouse meta cache docs --- docs/lakehouse/catalogs/hive-catalog.mdx | 82 +++++++------ docs/lakehouse/catalogs/hudi-catalog.md | 73 +++++++----- docs/lakehouse/catalogs/iceberg-catalog.mdx | 84 +++++++------ docs/lakehouse/catalogs/maxcompute-catalog.md | 63 ++++++---- docs/lakehouse/catalogs/paimon-catalog.mdx | 68 ++++++----- docs/lakehouse/meta-cache.md | 53 +-------- .../meta-cache/unified-meta-cache.md | 110 ------------------ .../current/lakehouse/catalog-overview.md | 3 +- .../lakehouse/catalogs/hive-catalog.mdx | 82 ++++++++----- .../lakehouse/catalogs/hudi-catalog.md | 75 +++++++----- .../lakehouse/catalogs/iceberg-catalog.mdx | 86 ++++++++------ .../lakehouse/catalogs/maxcompute-catalog.md | 64 ++++++---- .../lakehouse/catalogs/paimon-catalog.mdx | 69 ++++++----- .../current/lakehouse/meta-cache.md | 59 ++-------- .../meta-cache/unified-meta-cache.md | 110 ------------------ 15 files changed, 465 insertions(+), 616 deletions(-) delete mode 100644 docs/lakehouse/meta-cache/unified-meta-cache.md delete mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/meta-cache/unified-meta-cache.md diff --git a/docs/lakehouse/catalogs/hive-catalog.mdx b/docs/lakehouse/catalogs/hive-catalog.mdx index 3e17dde7901d0..a7bf9c4829d1a 100644 --- a/docs/lakehouse/catalogs/hive-catalog.mdx +++ b/docs/lakehouse/catalogs/hive-catalog.mdx @@ -76,62 +76,78 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( The CommonProperties section is for entering common attributes. Please see the "Common Properties" section in the [Catalog Overview](../catalog-overview.md). 
-## Metadata Cache (4.1.x+) {#meta-cache-unified} +## Metadata Cache {#meta-cache} -Starting from Doris 4.1.x, Hive Catalog metadata caches are configured with the unified `meta.cache.*` properties. -This section covers configuration and observability for Hive-related cache modules. +To improve the performance of accessing external data sources, Apache Doris caches Hive metadata. Metadata includes table structure (Schema), partition lists, partition properties, and file lists. -For the unified property semantics, see: [Unified External Meta Cache (4.1.x+)](../meta-cache/unified-meta-cache.md). +:::tip +For versions before Doris 4.1.x, metadata caching is mainly controlled globally by FE configuration items. For details, see [Metadata Cache](../meta-cache.md). +Starting from Doris 4.1.x, Hive Catalog's external metadata cache is configured using the unified `meta.cache.*` keys. +::: -### Cache Modules {#meta-cache-unified-modules} +### Unified Property Model (4.1.x+) {#meta-cache-unified-model} -| Module | Property key prefix | Cached content (typical) | +Each engine's cache entry uses a unified configuration key format: `meta.cache.<engine>.<entry>.{enable,ttl-second,capacity}`. + +| Property | Example | Meaning | |---|---|---| -| `schema` | `meta.cache.hive.schema.` | Schema cache entry for table schema loading. | -| `partition_values` | `meta.cache.hive.partition_values.` | Partition values/names list used by partition pruning and partition enumeration. | -| `partition` | `meta.cache.hive.partition.` | Partition properties (location, input format, storage descriptor, etc.). | -| `file` | `meta.cache.hive.file.` | File listing under partition/table paths (reduces remote LIST overhead). | +| `enable` | `true/false` | Whether to enable this cache module. | +| `ttl-second` | `600`, `0`, `-1` | `0` means disable cache (takes effect immediately, can be used to see the latest data); `-1` means never expire; other positive integers mean TTL in seconds based on access time. 
| +| `capacity` | `10000` | Maximum number of cache entries (by count). `0` means disable. | -Notes: +**Effective Logic:** The module cache only takes effect when `enable=true`, `ttl-second != 0`, and `capacity > 0`. -- Property keys use the module names shown above. The same names appear as `ENTRY_NAME` in `information_schema.catalog_meta_cache_statistics`. -- Changing legacy properties such as `file.meta.cache.ttl-second` and `partition.cache.ttl-second` can trigger Hive cache rebuild behavior. -- Changing unified `meta.cache.hive.*` properties on an already-initialized catalog does not fully hot-reload existing Hive cache entries in current releases. To guarantee that a new cache spec is applied, recreate the catalog or restart FE. +### Cache Modules {#meta-cache-unified-modules} -Example: +Hive Catalog includes the following cache modules: -```sql -ALTER CATALOG hive_ctl SET PROPERTIES ( - "meta.cache.hive.file.ttl-second" = "0" -); -``` +| Module (`<entry>`) | Property Key Prefix | Cached Content and Impact | +|---|---|---| +| `schema` | `meta.cache.hive.schema.` | Caches table structure. Impact: Visibility of table column information. If disabled, the latest Schema is pulled for each query. | +| `partition_values` | `meta.cache.hive.partition_values.` | Caches partition values/names list. Impact: Partition pruning and enumeration. If disabled, new external partitions can be seen in real-time. | +| `partition` | `meta.cache.hive.partition.` | Caches partition properties (Location, input format, etc.). Impact: Specific metadata of partitions. | +| `file` | `meta.cache.hive.file.` | Caches file lists. Impact: Reduces remote LIST operation overhead. If disabled, file changes can be seen in real-time. | -### Observability {#meta-cache-unified-observability} +### Legacy Parameter Mapping and Conversion {#meta-cache-mapping} -Hive cache metrics are available in `information_schema.catalog_meta_cache_statistics`. 
-For the table definition and metric meanings, see: [catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md). +In version 4.1.x and later, unified keys are recommended. The following is the mapping between legacy Catalog properties and 4.1.x+ unified keys: -Common Hive entries: +| Legacy Property Key | 4.1.x+ Unified Key | Description | +|---|---|---| +| `schema.cache.ttl-second` | `meta.cache.hive.schema.ttl-second` | Expiration time of table structure cache | +| `partition.cache.ttl-second` | `meta.cache.hive.partition_values.ttl-second` | Expiration time of partition value cache | +| `file.meta.cache.ttl-second` | `meta.cache.hive.file.ttl-second` | Expiration time of file list cache | + +### Best Practices {#meta-cache-best-practices} + +* **Real-time access to the latest data**: If you want each query to see the latest partition or file changes in the external data source, you can set the corresponding `ttl-second` to `0`. + ```sql + -- Disable file list cache to see file changes in real-time + ALTER CATALOG hive_ctl SET PROPERTIES ("meta.cache.hive.file.ttl-second" = "0"); + -- Disable partition value cache to see new partitions in real-time + ALTER CATALOG hive_ctl SET PROPERTIES ("meta.cache.hive.partition_values.ttl-second" = "0"); + ``` +* **Performance optimization**: For scenarios where metadata changes are infrequent, it is recommended to appropriately increase `capacity` and `ttl-second` to reduce access pressure on Hive Metastore and file systems. -| Entry | Meaning | -|---|---| -| `schema` | Schema cache entry | -| `partition_values` | Partition names / values cache entry | -| `partition` | Partition property cache entry | -| `file` | File listing cache entry | +:::caution +**Hive Catalog Note**: Changes to `meta.cache.hive.*` properties **do not support hot-reload**. To ensure new configurations take effect, you must recreate the catalog or restart the FE node. 
+::: -Example query: +### Observability {#meta-cache-unified-observability} + +Cache metrics can be observed through the `information_schema.catalog_meta_cache_statistics` system table: ```sql SELECT catalog_name, engine_name, entry_name, effective_enabled, ttl_second, capacity, estimated_size, hit_rate, load_failure_count, last_error FROM information_schema.catalog_meta_cache_statistics -WHERE catalog_name = 'hive_ctl' - AND engine_name = 'hive' +WHERE catalog_name = 'hive_ctl' AND engine_name = 'hive' ORDER BY entry_name; ``` +See the documentation for this system table: [catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md). + ### Supported Hive Versions Supports Hive 1.x, 2.x, 3.x, and 4.x. diff --git a/docs/lakehouse/catalogs/hudi-catalog.md b/docs/lakehouse/catalogs/hudi-catalog.md index eaf5046e2734f..7ac444f9b5f1f 100644 --- a/docs/lakehouse/catalogs/hudi-catalog.md +++ b/docs/lakehouse/catalogs/hudi-catalog.md @@ -51,61 +51,70 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( | ------------------------------- | -------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | | `hudi.use_hive_sync_partition` | `use_hive_sync_partition` | Whether to use the partition information already synchronized by Hive Metastore. If true, partition information will be obtained directly from Hive Metastore. Otherwise, it will be obtained from the metadata file of the file system. Obtaining information from Hive Metastore is more efficient, but users need to ensure that the latest metadata has been synchronized to Hive Metastore. | false | -## Metadata Cache (4.1.x+) {#meta-cache-unified} +## Metadata Cache {#meta-cache} -Starting from Doris 4.1.x, Hudi-related metadata caches are configured with the unified `meta.cache.*` properties. 
-This section covers configuration and observability for Hudi-related cache modules. +To improve the performance of accessing external data sources, Apache Doris caches Hudi metadata. Metadata includes table structure (Schema), partition information, FS View, and Meta Client objects. -For the unified property semantics, see: [Unified External Meta Cache (4.1.x+)](../meta-cache/unified-meta-cache.md). +:::tip +For versions before Doris 4.1.x, metadata caching is mainly controlled globally by FE configuration items. For details, see [Metadata Cache](../meta-cache.md). +Starting from Doris 4.1.x, Hudi-related external metadata cache is configured using the unified `meta.cache.*` keys. +::: -### Cache Modules {#meta-cache-unified-modules} +### Unified Property Model (4.1.x+) {#meta-cache-unified-model} + +Each engine's cache entry uses a unified configuration key format: `meta.cache.<engine>.<entry>.{enable,ttl-second,capacity}`. -| Module | Property key prefix | Cached content (typical) | +| Property | Example | Meaning | |---|---|---| -| `schema` | `meta.cache.hudi.schema.` | Schema cache entry for table schema loading. | -| `partition` | `meta.cache.hudi.partition.` | Hudi partition-related metadata (used by partition discovery/pruning). | -| `fs_view` | `meta.cache.hudi.fs_view.` | Hudi filesystem view related metadata. | -| `meta_client` | `meta.cache.hudi.meta_client.` | Hudi meta client related metadata. | +| `enable` | `true/false` | Whether to enable this cache module. | +| `ttl-second` | `600`, `0`, `-1` | `0` means disable cache (takes effect immediately, can be used to see the latest data); `-1` means never expire; other positive integers mean TTL in seconds based on access time. | +| `capacity` | `10000` | Maximum number of cache entries (by count). `0` means disable. | -Notes: +**Effective Logic:** The module cache only takes effect when `enable=true`, `ttl-second != 0`, and `capacity > 0`. -- Property keys use the module names shown above. 
The same names appear as `ENTRY_NAME` in `information_schema.catalog_meta_cache_statistics`. -- When Hudi tables are accessed through an HMS catalog, configure `meta.cache.hudi.*` on that HMS catalog. +### Cache Modules {#meta-cache-unified-modules} -Example: +Hudi Catalog includes the following cache modules: -```sql -ALTER CATALOG hudi_ctl SET PROPERTIES ( - "meta.cache.hudi.fs_view.capacity" = "2000" -); -``` +| Module (`<entry>`) | Property Key Prefix | Cached Content and Impact | +|---|---|---| +| `schema` | `meta.cache.hudi.schema.` | Caches table structure. Impact: Visibility of table column information. If disabled, the latest Schema is pulled for each query. | +| `partition` | `meta.cache.hudi.partition.` | Caches Hudi partition-related metadata. Impact: Used for partition discovery and pruning. | +| `fs_view` | `meta.cache.hudi.fs_view.` | Caches Hudi filesystem view related metadata. | +| `meta_client` | `meta.cache.hudi.meta_client.` | Caches Hudi Meta Client objects. Impact: Reduces redundant loading of Hudi metadata. | -### Observability {#meta-cache-unified-observability} +### Legacy Parameter Mapping and Conversion {#meta-cache-mapping} + +In version 4.1.x and later, unified keys are recommended. The following is the mapping between legacy Catalog properties and 4.1.x+ unified keys: + +| Legacy Property Key | 4.1.x+ Unified Key | Description | +|---|---|---| +| `schema.cache.ttl-second` | `meta.cache.hudi.schema.ttl-second` | Expiration time of table structure cache | -Hudi cache metrics are available in `information_schema.catalog_meta_cache_statistics`. -For the table definition and metric meanings, see: [catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md). 
+### Best Practices {#meta-cache-best-practices} -Common Hudi entries: +* **Real-time access to the latest data**: If you want each query to see the latest data changes or schema changes for Hudi tables, you can set the `ttl-second` for `schema` or `partition` to `0`. + ```sql + -- Disable partition metadata cache to detect the latest partition changes in Hudi tables + ALTER CATALOG hudi_ctl SET PROPERTIES ("meta.cache.hudi.partition.ttl-second" = "0"); + ``` +* **Hot-reload**: Changes made via `ALTER CATALOG ... SET PROPERTIES` are hot-reloaded for Hudi (they are applied through the HMS catalog property update path), so the catalog does not need to be recreated. -| Entry | Meaning | -|---|---| -| `schema` | Schema cache entry | -| `partition` | Partition metadata cache entry | -| `fs_view` | File system view cache entry | -| `meta_client` | Meta client cache entry | +### Observability {#meta-cache-unified-observability} -Example query: +Cache metrics can be observed through the `information_schema.catalog_meta_cache_statistics` system table: ```sql SELECT catalog_name, engine_name, entry_name, effective_enabled, ttl_second, capacity, estimated_size, hit_rate, load_failure_count, last_error FROM information_schema.catalog_meta_cache_statistics -WHERE catalog_name = 'hudi_ctl' - AND engine_name = 'hudi' +WHERE catalog_name = 'hudi_ctl' AND engine_name = 'hudi' ORDER BY entry_name; ``` +See the documentation for this system table: [catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md). + ### Supported Hudi Versions The current dependent Hudi version is 0.15. It is recommended to access Hudi data version 0.14 and above. 
diff --git a/docs/lakehouse/catalogs/iceberg-catalog.mdx b/docs/lakehouse/catalogs/iceberg-catalog.mdx index a54ec699b00b7..13a04898323a3 100644 --- a/docs/lakehouse/catalogs/iceberg-catalog.mdx +++ b/docs/lakehouse/catalogs/iceberg-catalog.mdx @@ -85,64 +85,78 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( The CommonProperties section is for entering general properties. See the [Catalog Overview](../catalog-overview.md) for details on common properties. -## Metadata Cache (4.1.x+) {#meta-cache-unified} +## Metadata Cache {#meta-cache} -Starting from Doris 4.1.x, Iceberg Catalog metadata caches are configured with the unified `meta.cache.*` properties. -This section covers configuration and observability for Iceberg-related cache modules. +To improve the performance of accessing external data sources, Apache Doris caches Iceberg metadata. Metadata includes table structure (Schema), table objects, view objects, and manifest details. -For the unified property semantics, see: [Unified External Meta Cache (4.1.x+)](../meta-cache/unified-meta-cache.md). +:::tip +For versions before Doris 4.1.x, metadata caching is mainly controlled globally by FE configuration items. For details, see [Metadata Cache](../meta-cache.md). +Starting from Doris 4.1.x, Iceberg Catalog's external metadata cache is configured using the unified `meta.cache.*` keys. +::: -### Cache Modules {#meta-cache-unified-modules} +### Unified Property Model (4.1.x+) {#meta-cache-unified-model} -| Module | Property key prefix | Cached content (typical) | +Each engine's cache entry uses a unified configuration key format: `meta.cache.<engine>.<entry>.{enable,ttl-second,capacity}`. + +| Property | Example | Meaning | |---|---|---| -| `schema` | `meta.cache.iceberg.schema.` | Schema cache entry for table schema loading. | -| `table` | `meta.cache.iceberg.table.` | Iceberg table metadata object (reduces catalog/metastore round trips). | -| `view` | `meta.cache.iceberg.view.` | Iceberg view metadata object. 
| -| `manifest` | `meta.cache.iceberg.manifest.` | Manifest-related metadata (reduces repeated manifest access overhead). | +| `enable` | `true/false` | Whether to enable this cache module. | +| `ttl-second` | `600`, `0`, `-1` | `0` means disable cache (takes effect immediately, can be used to see the latest data); `-1` means never expire; other positive integers mean TTL in seconds based on access time. | +| `capacity` | `10000` | Maximum number of cache entries (by count). `0` means disable. | -Notes: +**Effective Logic:** The module cache only takes effect when `enable=true`, `ttl-second != 0`, and `capacity > 0`. -- Property keys use the module names shown above. The same names appear as `ENTRY_NAME` in `information_schema.catalog_meta_cache_statistics`. -- `manifest` is disabled by default in the current implementation. Enable it explicitly before tuning TTL/capacity. -- `view` entries are only populated when Doris accesses Iceberg views. -- `ALTER CATALOG ... SET PROPERTIES` updates are applied through the unified hot-reload path. +### Cache Modules {#meta-cache-unified-modules} -Example: +Iceberg Catalog includes the following cache modules: -```sql -ALTER CATALOG iceberg_ctl SET PROPERTIES ( - "meta.cache.iceberg.manifest.enable" = "true", - "meta.cache.iceberg.manifest.ttl-second" = "600" -); -``` +| Module (`<entry>`) | Property Key Prefix | Cached Content and Impact | +|---|---|---| +| `schema` | `meta.cache.iceberg.schema.` | Caches table structure. Impact: Visibility of table column information. If disabled, the latest Schema is pulled for each query. | +| `table` | `meta.cache.iceberg.table.` | Caches Iceberg table metadata objects. Impact: Reduces Catalog/Metastore round-trips. | +| `view` | `meta.cache.iceberg.view.` | Caches Iceberg View metadata objects. | +| `manifest` | `meta.cache.iceberg.manifest.` | Caches manifest details. Impact: Reduces repeated manifest access overhead. Note: This module is disabled by default and must be enabled manually. 
| -### Observability {#meta-cache-unified-observability} +### Legacy Parameter Mapping and Conversion {#meta-cache-mapping} + +In version 4.1.x and later, unified keys are recommended. The following is the mapping between legacy Catalog properties and 4.1.x+ unified keys: + +| Legacy Property Key | 4.1.x+ Unified Key | Description | +|---|---|---| +| `schema.cache.ttl-second` | `meta.cache.iceberg.schema.ttl-second` | Expiration time of table structure cache | -Iceberg cache metrics are available in `information_schema.catalog_meta_cache_statistics`. -For the table definition and metric meanings, see: [catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md). +### Best Practices {#meta-cache-best-practices} -Common Iceberg entries: +* **Real-time access to the latest data**: If you want each query to see the latest snapshots or schema changes for Iceberg tables, you can set the `ttl-second` for `schema` or `table` to `0`. + ```sql + -- Disable table object cache to detect snapshot changes + ALTER CATALOG iceberg_ctl SET PROPERTIES ("meta.cache.iceberg.table.ttl-second" = "0"); + ``` +* **Performance optimization**: + * Enabling manifest cache can significantly speed up query planning for large tables: + ```sql + ALTER CATALOG iceberg_ctl SET PROPERTIES ( + "meta.cache.iceberg.manifest.enable" = "true", + "meta.cache.iceberg.manifest.ttl-second" = "600" + ); + ``` + * Changes via `ALTER CATALOG ... SET PROPERTIES` support hot-reload in Iceberg Catalog. 
-| Entry | Meaning | -|---|---| -| `schema` | Schema cache entry | -| `table` | Table metadata cache entry | -| `view` | View metadata cache entry | -| `manifest` | Manifest payload cache entry | +### Observability {#meta-cache-unified-observability} -Example query: +Cache metrics can be observed through the `information_schema.catalog_meta_cache_statistics` system table: ```sql SELECT catalog_name, engine_name, entry_name, effective_enabled, ttl_second, capacity, estimated_size, hit_rate, load_failure_count, last_error FROM information_schema.catalog_meta_cache_statistics -WHERE catalog_name = 'iceberg_ctl' - AND engine_name = 'iceberg' +WHERE catalog_name = 'iceberg_ctl' AND engine_name = 'iceberg' ORDER BY entry_name; ``` +See the documentation for this system table: [catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md). + ### Supported Iceberg Versions | Doris Version | Iceberg SDK Version | diff --git a/docs/lakehouse/catalogs/maxcompute-catalog.md b/docs/lakehouse/catalogs/maxcompute-catalog.md index 186f2cc41e67f..4e772ed0186e8 100644 --- a/docs/lakehouse/catalogs/maxcompute-catalog.md +++ b/docs/lakehouse/catalogs/maxcompute-catalog.md @@ -111,51 +111,68 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( The CommonProperties section is used to fill in common properties. Please refer to the "Common Properties" section in [Catalog Overview](../catalog-overview.md). -## Metadata Cache (4.1.x+) {#meta-cache-unified} +## Metadata Cache {#meta-cache} -Starting from Doris 4.1.x, MaxCompute Catalog metadata caches are configured with the unified `meta.cache.*` properties. -This section covers configuration and observability for MaxCompute-related cache modules. +To improve the performance of accessing external data sources, Apache Doris caches MaxCompute metadata. Metadata includes table structure (Schema) and partition lists. 
-For the unified property semantics, see: [Unified External Meta Cache (4.1.x+)](../meta-cache/unified-meta-cache.md). +:::tip +For versions before Doris 4.1.x, metadata caching is mainly controlled globally by FE configuration items. For details, see [Metadata Cache](../meta-cache.md). +Starting from Doris 4.1.x, MaxCompute Catalog's external metadata cache is configured using the unified `meta.cache.*` keys. +::: + +### Unified Property Model (4.1.x+) {#meta-cache-unified-model} + +Each engine's cache entry uses a unified configuration key format: `meta.cache.<engine>.<entry>.{enable,ttl-second,capacity}`. + +| Property | Example | Meaning | +|---|---|---| +| `enable` | `true/false` | Whether to enable this cache module. | +| `ttl-second` | `600`, `0`, `-1` | `0` means disable cache (takes effect immediately, can be used to see the latest data); `-1` means never expire; other positive integers mean TTL in seconds based on access time. | +| `capacity` | `10000` | Maximum number of cache entries (by count). `0` means disable. | + +**Effective Logic:** The module cache only takes effect when `enable=true`, `ttl-second != 0`, and `capacity > 0`. ### Cache Modules {#meta-cache-unified-modules} -| Module | Property key prefix | Cached content (typical) | +MaxCompute Catalog includes the following cache modules: + +| Module (`<entry>`) | Property Key Prefix | Cached Content and Impact | |---|---|---| -| `schema` | `meta.cache.maxcompute.schema.` | Schema cache entry for table schema loading. | -| `partition_values` | `meta.cache.maxcompute.partition_values.` | Partition values cache entry used for partition pruning and partition enumeration. | +| `schema` | `meta.cache.maxcompute.schema.` | Caches table structure. Impact: Visibility of table column information. If disabled, the latest Schema is pulled for each query. | +| `partition_values` | `meta.cache.maxcompute.partition_values.` | Caches partition value lists. Impact: Partition pruning and enumeration. 
If disabled, new external partitions can be seen in real-time. | -Notes: +### Legacy Parameter Mapping and Conversion {#meta-cache-mapping} -- Property keys use the module names shown above. The same names appear as `ENTRY_NAME` in `information_schema.catalog_meta_cache_statistics`. -- `partition_values` is configured through `meta.cache.maxcompute.partition_values.*`. -- The stats table exposes `partition_values` and `schema` as the two MaxCompute entries. -- There is no dedicated MaxCompute catalog-level hot-reload hook for `meta.cache.maxcompute.*`. +In version 4.1.x and later, unified keys are recommended. The following is the mapping between legacy Catalog properties and 4.1.x+ unified keys: -### Observability {#meta-cache-unified-observability} +| Legacy Property Key | 4.1.x+ Unified Key | Description | +|---|---|---| +| `schema.cache.ttl-second` | `meta.cache.maxcompute.schema.ttl-second` | Expiration time of table structure cache | -MaxCompute cache metrics are available in `information_schema.catalog_meta_cache_statistics`. -For the table definition and metric meanings, see: [catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md). +### Best Practices {#meta-cache-best-practices} -Common MaxCompute entries: +* **Real-time access to the latest data**: If you want each query to see the latest partition or schema changes for MaxCompute tables, you can set the `ttl-second` for `schema` or `partition_values` to `0`. + ```sql + -- Disable partition value cache to detect the latest partitions in MaxCompute tables + ALTER CATALOG mc_ctl SET PROPERTIES ("meta.cache.maxcompute.partition_values.ttl-second" = "0"); + ``` +* **Note**: `meta.cache.maxcompute.*` currently does not have a dedicated hot-reload hook. After changing the configuration, it is recommended to recreate the Catalog or restart FE to ensure it takes effect. 
-| Entry | Meaning | -|---|---| -| `schema` | Schema cache entry | -| `partition_values` | Partition values cache entry | +### Observability {#meta-cache-unified-observability} -Example query: +Cache metrics can be observed through the `information_schema.catalog_meta_cache_statistics` system table: ```sql SELECT catalog_name, engine_name, entry_name, effective_enabled, ttl_second, capacity, estimated_size, hit_rate, load_failure_count, last_error FROM information_schema.catalog_meta_cache_statistics -WHERE catalog_name = 'mc_ctl' - AND engine_name = 'maxcompute' +WHERE catalog_name = 'mc_ctl' AND engine_name = 'maxcompute' ORDER BY entry_name; ``` +See the documentation for this system table: [catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md). + ### Supported MaxCompute Versions Only the public cloud version of MaxCompute is supported. For private cloud version support, please contact Doris community support. diff --git a/docs/lakehouse/catalogs/paimon-catalog.mdx b/docs/lakehouse/catalogs/paimon-catalog.mdx index 0f2e5d0d97d6a..5f95428628772 100644 --- a/docs/lakehouse/catalogs/paimon-catalog.mdx +++ b/docs/lakehouse/catalogs/paimon-catalog.mdx @@ -90,58 +90,68 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( The CommonProperties section is used to fill in common properties. Please refer to the [Catalog Overview](../catalog-overview.md) section on [Common Properties]. -## Metadata Cache (4.1.x+) {#meta-cache-unified} +## Metadata Cache {#meta-cache} -Starting from Doris 4.1.x, Paimon Catalog metadata caches are configured with the unified `meta.cache.*` properties. -This section covers configuration and observability for Paimon-related cache modules. +To improve the performance of accessing external data sources, Apache Doris caches Paimon metadata. Metadata includes table structure (Schema) and table objects. 
-For the unified property semantics, see: [Unified External Meta Cache (4.1.x+)](../meta-cache/unified-meta-cache.md). +:::tip +For versions before Doris 4.1.x, metadata caching is mainly controlled globally by FE configuration items. For details, see [Metadata Cache](../meta-cache.md). +Starting from Doris 4.1.x, Paimon Catalog's external metadata cache is configured using the unified `meta.cache.*` keys. +::: -### Cache Modules {#meta-cache-unified-modules} +### Unified Property Model (4.1.x+) {#meta-cache-unified-model} -| Module | Property key prefix | Cached content (typical) | +Each engine's cache entry uses a unified configuration key format: `meta.cache.<engine>.<entry>.{enable,ttl-second,capacity}`. + +| Property | Example | Meaning | |---|---|---| -| `schema` | `meta.cache.paimon.schema.` | Schema cache entry for table schema loading. | -| `table` | `meta.cache.paimon.table.` | Paimon table metadata used for query planning (schema/snapshot/partition related metadata, depending on workload). | +| `enable` | `true/false` | Whether to enable this cache module. | +| `ttl-second` | `600`, `0`, `-1` | `0` means disable cache (takes effect immediately, can be used to see the latest data); `-1` means never expire; other positive integers mean TTL in seconds based on access time. | +| `capacity` | `10000` | Maximum number of cache entries (by count). `0` means disable. | -Notes: +**Effective Logic:** The module cache only takes effect when `enable=true`, `ttl-second != 0`, and `capacity > 0`. -- Property keys use the module names shown above. The same names appear as `ENTRY_NAME` in `information_schema.catalog_meta_cache_statistics`. -- `schema` and `table` are separate entries. `schema` uses FE defaults unless `meta.cache.paimon.schema.*` is configured on the catalog. -- `ALTER CATALOG ... SET PROPERTIES` updates are applied through the unified hot-reload path. 
+### Cache Modules {#meta-cache-unified-modules} -Example: +Paimon Catalog includes the following cache modules: -```sql -ALTER CATALOG paimon_ctl SET PROPERTIES ( - "meta.cache.paimon.table.ttl-second" = "0" -); -``` +| Module (``) | Property Key Prefix | Cached Content and Impact | +|---|---|---| +| `schema` | `meta.cache.paimon.schema.` | Caches table structure. Impact: Visibility of table column information. If disabled, the latest Schema is pulled for each query. | +| `table` | `meta.cache.paimon.table.` | Caches Paimon table metadata objects. Impact: Reduces metadata loading overhead during query planning. | -### Observability {#meta-cache-unified-observability} +### Legacy Parameter Mapping and Conversion {#meta-cache-mapping} -Paimon cache metrics are available in `information_schema.catalog_meta_cache_statistics`. -For the table definition and metric meanings, see: [catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md). +In version 4.1.x and later, unified keys are recommended. The following is the mapping between legacy Catalog properties and 4.1.x+ unified keys: -Common Paimon entries: +| Legacy Property Key | 4.1.x+ Unified Key | Description | +|---|---|---| +| `schema.cache.ttl-second` | `meta.cache.paimon.schema.ttl-second` | Expiration time of table structure cache | -| Entry | Meaning | -|---|---| -| `schema` | Schema cache entry | -| `table` | Table metadata cache entry | +### Best Practices {#meta-cache-best-practices} -Example query: +* **Real-time access to the latest data**: If you want each query to see the latest data changes or schema changes for Paimon tables, you can set the `ttl-second` for `schema` or `table` to `0`. + ```sql + -- Disable table object cache to detect the latest snapshots of Paimon tables + ALTER CATALOG paimon_ctl SET PROPERTIES ("meta.cache.paimon.table.ttl-second" = "0"); + ``` +* **Performance optimization**: Changes via `ALTER CATALOG ... 
SET PROPERTIES` support hot-reload in Paimon Catalog. + +### Observability {#meta-cache-unified-observability} + +Cache metrics can be observed through the `information_schema.catalog_meta_cache_statistics` system table: ```sql SELECT catalog_name, engine_name, entry_name, effective_enabled, ttl_second, capacity, estimated_size, hit_rate, load_failure_count, last_error FROM information_schema.catalog_meta_cache_statistics -WHERE catalog_name = 'paimon_ctl' - AND engine_name = 'paimon' +WHERE catalog_name = 'paimon_ctl' AND engine_name = 'paimon' ORDER BY entry_name; ``` +See the documentation for this system table: [catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md). + ### Supported Paimon Versions The currently dependent Paimon version is 1.0.0. diff --git a/docs/lakehouse/meta-cache.md b/docs/lakehouse/meta-cache.md index a1dbca3ab662f..5a720093b96bf 100644 --- a/docs/lakehouse/meta-cache.md +++ b/docs/lakehouse/meta-cache.md @@ -6,32 +6,18 @@ } --- -To improve the performance of accessing external data sources, Apache Doris caches the **metadata** of external data sources. - -Metadata includes information such as databases, tables, columns, partitions, snapshots, file lists, etc. - -This article details the types, strategies, and related parameter configurations of cached metadata. - -For **data cache**, refer to the [data cache documentation](./data-cache.md). - :::tip -This document applies to versions after 2.1.6. +This document applies to versions before 4.1.x. +For Doris 4.1.x and later, external meta cache has been refactored with unified configuration keys `meta.cache.*`. Please refer to the "Metadata Cache" section in each [Catalog](./catalog-overview.md) document. ::: -:::note -For Doris 4.1.x and later, external meta cache has been refactored with unified configuration keys `meta.cache.*`. -See [Unified External Meta Cache (4.1.x+)](./meta-cache/unified-meta-cache.md). 
- -Starting from Doris 4.1.x, external metadata caching can be understood as two layers: +To improve the performance of accessing external data sources, Apache Doris caches the **metadata** of external data sources. -- Generic catalog caches: database/table name lists and database/table objects. These are still controlled by FE configs such as `max_meta_object_cache_num`, `external_cache_refresh_time_minutes`, and `external_cache_expire_time_seconds_after_access`. -- Engine-specific entry caches: schema, partition metadata, manifests, file lists, and similar engine-dependent entries. These use unified per-catalog keys in the form `meta.cache...{enable,ttl-second,capacity}`. +Metadata includes information such as databases, tables, columns, partitions, snapshots, file lists, etc. -The unified document focuses on the second layer. -::: +This article details the types, strategies, and related parameter configurations of cached metadata for legacy versions (pre-4.1). -This page mainly records FE-level defaults and legacy catalog properties used by the 2.1.x / 3.x cache model. -For the current engine-specific cache entry matrix in Doris 4.1.x+, use the unified page and the catalog-specific pages. +For **data cache**, refer to the [data cache documentation](./data-cache.md). ## Cache Strategies @@ -333,11 +319,6 @@ This section mainly introduces the cache behavior that users may be concerned ab For all types of External Catalogs, if you want to see the latest Table Schema in real time, you can disable the Schema Cache: -:::note -For Doris 4.1.x+, prefer the unified per-catalog property `meta.cache..schema.ttl-second = "0"`. -See [Unified External Meta Cache (4.1.x+)](./meta-cache/unified-meta-cache.md). -::: - - Disable globally ```text @@ -345,13 +326,6 @@ See [Unified External Meta Cache (4.1.x+)](./meta-cache/unified-meta-cache.md). max_external_schema_cache_num=0 // Disable Schema cache. 
``` -- Disable at Catalog level in Doris 4.1.x+ - - ```text - -- Catalog property - "meta.cache..schema.ttl-second" = "0" - ``` - - Legacy catalog-level property ```text @@ -365,12 +339,6 @@ After setting, Doris will see the latest Table Schema in real time. However, thi For Hive Catalog, if you want to disable the cache to query real-time updated data, you can configure the following parameters: -:::note -For Doris 4.1.x+, prefer unified `meta.cache.hive.*` properties. See: -[Hive Catalog](./catalogs/hive-catalog.mdx#meta-cache-unified) and -[Unified External Meta Cache (4.1.x+)](./meta-cache/unified-meta-cache.md). -::: - - Disable globally ```text @@ -380,15 +348,6 @@ For Doris 4.1.x+, prefer unified `meta.cache.hive.*` properties. See: max_hive_partition_cache_num=0 // Disable partition property cache ``` -- Disable at Catalog level in Doris 4.1.x+ - - ```text - -- Catalog property - "meta.cache.hive.partition_values.ttl-second" = "0" // Disable partition list cache - "meta.cache.hive.partition.ttl-second" = "0" // Disable partition property cache - "meta.cache.hive.file.ttl-second" = "0" // Disable file list cache - ``` - - Legacy catalog-level properties ```text diff --git a/docs/lakehouse/meta-cache/unified-meta-cache.md b/docs/lakehouse/meta-cache/unified-meta-cache.md deleted file mode 100644 index 97a9e7c4ee536..0000000000000 --- a/docs/lakehouse/meta-cache/unified-meta-cache.md +++ /dev/null @@ -1,110 +0,0 @@ ---- -{ - "title": "Unified External Meta Cache (4.1.x+)", - "language": "en", - "description": "User guide for unified external metadata cache: unified meta.cache.* properties, what is cached, and where to configure per catalog." -} ---- - -Starting from **Doris 4.1.x**, external metadata caching is unified for major External Catalog engines. 
The unified cache standardizes configuration models and monitoring metrics across different data lake engines (like Hive, Iceberg, etc.), reducing the configuration threshold and troubleshooting difficulty for multi-source data management. - -As a user, you mainly need to care about three things: - -- **What it affects:** Depends on the catalog engine (partitions, file listing, table metadata, manifests, etc.). -- **Where to configure:** Catalog `PROPERTIES` with unified `meta.cache.*` keys (see the catalog pages linked below). -- **How to observe:** `information_schema.catalog_meta_cache_statistics` system table (see the observability section below). - -:::tip -Applies to Doris 4.1.x and later. -::: - -## What External Meta Cache Includes - -Before configuring, it's important to understand what is actually being cached. There are two layers of metadata caching that are easy to confuse: - -- **Catalog object/name caches:** `SHOW DATABASES`, `SHOW TABLES`, database objects, table objects, and related generic caches described in [Metadata Cache](../meta-cache.md). -- **Engine entry caches:** Engine-specific runtime metadata such as Hive partitions/files, Iceberg manifests, Paimon table handles, and schema entries. This page focuses on this layer. - -External meta cache entries cover different kinds of metadata. Some are configured by unified catalog properties, and some also inherit FE-level defaults: - -| Category | Examples | How to configure | -|---|---|---| -| Engine entry caches | Hive `partition_values` / `partition` / `file`, Iceberg `manifest`, Paimon `table`, etc. 
| Catalog `PROPERTIES`: `meta.cache...*` | -| Schema cache | Per-engine `schema` entry, isolated by schema version token | FE configs provide defaults; catalog `meta.cache..schema.*` can override them | - -## Unified Property Model - -All engine cache entries share the same property key pattern: - -`meta.cache...{enable,ttl-second,capacity}` - -The following table describes the property semantics: - -| Property | Example | Meaning | -|---|---|---| -| `enable` | `true/false` | Whether this cache entry is enabled. | -| `ttl-second` | `600`, `0`, `-1` | `0` disables the entry; `-1` means no expiration; otherwise expire after access by TTL. | -| `capacity` | `10000` | Max entry count (count-based). `0` disables the entry. | - -**Note on Effective State:** -Only when `enable=true` AND `ttl-second > 0` (or `-1`) AND `capacity > 0`, the cache entry will be truly effective (corresponding to `EFFECTIVE_ENABLED = true` in the observability table). - -Notes: - -- `` uses the cache entry name shown in the catalog documentation and the stats table, for example `partition_values`, `fs_view`, `meta_client`. -- There is currently no per-entry refresh interval property. Async refresh behavior still uses the FE config `external_cache_refresh_time_minutes`. - -Example: - -```sql -ALTER CATALOG hive_ctl SET PROPERTIES ( - -- Set the TTL of Hive file cache to 0, which immediately disables this cache entry - "meta.cache.hive.file.ttl-second" = "0" -); -``` - -## Supported Engines and Configurations - -The following table summarizes the current implementation and links to catalog-specific configuration pages: - -| Catalog Engine | Entries you will see in stats (``) | `ALTER CATALOG ... 
SET PROPERTIES` hot-reload | Detailed Configuration Guide | -|---|---|---|---| -| Hive | `schema`, `partition_values`, `partition`, `file` | Changes to `meta.cache.hive.*` are not applied through the unified hot-reload path; recreate the catalog or restart FE to apply new specs | [Hive Catalog](../catalogs/hive-catalog.mdx#meta-cache-unified) | -| Iceberg | `schema`, `table`, `view`, `manifest` | Supported | [Iceberg Catalog](../catalogs/iceberg-catalog.mdx#meta-cache-unified) | -| Paimon | `schema`, `table` | Supported | [Paimon Catalog](../catalogs/paimon-catalog.mdx#meta-cache-unified) | -| Hudi | `schema`, `partition`, `fs_view`, `meta_client` | Supported through HMS catalog property updates | [Hudi Catalog](../catalogs/hudi-catalog.md#meta-cache-unified) | -| MaxCompute | `schema`, `partition_values` | No dedicated hot-reload hook | [MaxCompute Catalog](../catalogs/maxcompute-catalog.md#meta-cache-unified) | - -:::caution -For **Hive Catalogs**, changes to `meta.cache.hive.*` properties via `ALTER CATALOG` do **not** take effect dynamically. You must recreate the catalog or restart the Frontend (FE) node to apply the new configurations. -::: - -## Observability - -Use the system table to observe cache metrics: - -```sql -SELECT catalog_name, engine_name, entry_name, - effective_enabled, ttl_second, capacity, - estimated_size, hit_rate, load_failure_count, last_error -FROM information_schema.catalog_meta_cache_statistics -ORDER BY catalog_name, engine_name, entry_name; -``` - -This table is documented at: [catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md). 
- -Read the table as follows: - -| Field | Convention | -|---|---| -| `ENGINE_NAME` | Cache engine, such as `hive` or `iceberg` | -| `ENTRY_NAME` | Exact entry name used by that engine, such as `partition_values`, `fs_view`, `manifest` | -| `EFFECTIVE_ENABLED` | Final enable state after evaluating `enable`, `ttl-second`, and `capacity` | -| `LOAD_FAILURE_COUNT` | Number of failed loads from external systems. Useful for identifying upstream metadata service issues. | -| `LAST_ERROR` | The exception message of the last failed load. Highly valuable for troubleshooting timeout or connection errors with HMS, S3, etc. | - -Common queries filter by `catalog_name` and `engine_name`. This table no longer uses the old `cache_name` / `metric_name` pivoted model. - -## Migration Note (Legacy Properties) - -Starting from Doris 4.1.x, legacy catalog cache properties (for example, `schema.cache.ttl-second`, `file.meta.cache.ttl-second`) are deprecated. Use `meta.cache.*` properties instead and follow the catalog-specific pages above. 
diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalog-overview.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalog-overview.md index 5b096d18f0096..a071dbd83d2c8 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalog-overview.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalog-overview.md @@ -324,7 +324,8 @@ REFRESH TABLE catalog_name.db_name.table_name; Doris 也支持关闭元数据缓存,以便能够实时访问到最新的元数据。 -关于元数据缓存的详细介绍和配置,请参阅:[元数据缓存](./meta-cache.md) +- Doris 4.1.x 之前:请参阅[元数据缓存](./meta-cache.md)。 +- Doris 4.1.x 及之后:请参阅各 Catalog 文档中的“元数据缓存”章节,例如 [Hive Catalog](./catalogs/hive-catalog.md#meta-cache)、[Iceberg Catalog](./catalogs/iceberg-catalog.mdx#meta-cache)、[Hudi Catalog](./catalogs/hudi-catalog.md#meta-cache)、[Paimon Catalog](./catalogs/paimon-catalog.mdx#meta-cache)、[MaxCompute Catalog](./catalogs/maxcompute-catalog.md#meta-cache)。 ## 修改数据目录 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/hive-catalog.mdx b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/hive-catalog.mdx index 40d4a93b842c6..71f0e90a0f389 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/hive-catalog.mdx +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/hive-catalog.mdx @@ -78,62 +78,82 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( CommonProperties 部分用于填写通用属性。请参阅[ 数据目录概述 ](../catalog-overview.md)中【通用属性】部分。 -## 元数据缓存(4.1.x+) {#meta-cache-unified} +## 元数据缓存 {#meta-cache} +为了提升访问外部数据源的性能,Apache Doris 会对 Hive 的元数据进行缓存。元数据包括表结构(Schema)、分区列表、分区属性和文件列表等。 + +:::tip +对于 Doris 4.1.x 之前的版本,元数据缓存主要由 FE 配置项全局控制,详情请参阅[元数据缓存](../meta-cache.md)。 从 Doris 4.1.x 开始,Hive Catalog 的外表元数据缓存使用统一键 `meta.cache.*` 进行配置。 -本节说明 Hive 相关 cache 模块的配置与观测方式。 +::: + +### 统一属性模型(4.1.x+) {#meta-cache-unified-model} + +各引擎 cache entry 使用统一的配置键格式:`meta.cache...{enable,ttl-second,capacity}`。 
-统一属性语义可参阅:[统一外表元数据缓存(4.1.x+)](../meta-cache/unified-meta-cache.md)。 +| 属性 | 示例 | 含义 | +|---|---|---| +| `enable` | `true/false` | 是否启用该缓存模块。 | +| `ttl-second` | `600`、`0`、`-1` | `0` 表示关闭缓存(即刻生效,可用于查看最新数据);`-1` 表示永不过期;其他正整数表示按访问时间计算 TTL(秒)。 | +| `capacity` | `10000` | 最大缓存条目数(按条目数量计)。`0` 表示关闭。 | + +**生效逻辑说明:** 只有当 `enable=true` 且 `ttl-second != 0` 且 `capacity > 0` 时,该模块缓存才会生效。 ### 缓存模块 {#meta-cache-unified-modules} -| 模块 | 属性键前缀 | 典型缓存内容 | +Hive Catalog 包含以下缓存模块: + +| 模块 (``) | 属性键前缀 | 缓存内容与影响 | |---|---|---| -| `schema` | `meta.cache.hive.schema.` | 表 schema 加载对应的 schema cache entry。 | -| `partition_values` | `meta.cache.hive.partition_values.` | 分区值/分区名称列表(常用于分区剪枝与分区枚举)。 | -| `partition` | `meta.cache.hive.partition.` | 分区属性(location、输入格式、存储描述等)。 | -| `file` | `meta.cache.hive.file.` | 分区/表路径下的文件列表(减少远端 LIST 开销)。 | +| `schema` | `meta.cache.hive.schema.` | 缓存表结构。影响:列新增、删除、类型变更在 Doris 中的可见性。若关闭,每次查询都会拉取最新 Schema。 | +| `partition_values` | `meta.cache.hive.partition_values.` | 缓存分区值/分区名称列表。影响:`SHOW PARTITIONS`、分区枚举、分区裁剪,以及外部新增/删除分区何时在 Doris 中可见。若关闭,可实时查看到分区变动。 | +| `partition` | `meta.cache.hive.partition.` | 缓存分区属性(Location、InputFormat、Serde 等)。影响:单个分区位置、格式、Serde 等属性变更的可见性。 | +| `file` | `meta.cache.hive.file.` | 缓存文件列表。影响:新增/删除文件、文件大小变化被 Doris 感知的时效性,同时减少远端 LIST 操作开销。若关闭,每次查询都会重新加载文件列表。 | -说明: +### 旧参数映射与转换 {#meta-cache-mapping} -- 属性键使用上表中的模块名。这些名字也会出现在 `information_schema.catalog_meta_cache_statistics` 的 `ENTRY_NAME` 中。 -- 修改旧参数 `file.meta.cache.ttl-second`、`partition.cache.ttl-second` 时,可以触发 Hive cache 重建相关路径。 -- 对已经初始化的 catalog,修改统一键 `meta.cache.hive.*` 时,当前版本不会完整热更新已有 Hive cache entry。要确保新配置生效,需要重建 catalog 或重启 FE。 +在 4.1.x 之前,Hive 元数据缓存一部分通过 Catalog 兼容属性控制,一部分仍受 FE 全局缓存参数控制,详见[元数据缓存](../meta-cache.md)。升级到 4.1.x 后,建议显式改写为 `meta.cache.hive.*`,不要继续沿用旧键名。 -示例: +| 4.1 前属性键 | 适用范围 | 4.1.x+ 统一键 | 升级建议与影响 | +|---|---|---|---| +| `schema.cache.ttl-second` | 4.1 前 Hive Catalog 兼容属性 | `meta.cache.hive.schema.ttl-second` | 仅对应 Schema 新鲜度。若希望列变更每次查询立即可见,设置为 `0`;`enable` 
和 `capacity` 需按需单独配置。 | +| `partition.cache.ttl-second` | 4.1 前 Hive Catalog 兼容属性 | `meta.cache.hive.partition_values.ttl-second` | 仅对应分区列表新鲜度。若希望新增/删除分区每次查询立即可见,设置为 `0`。 | +| `file.meta.cache.ttl-second` | 4.1 前 Hive Catalog 兼容属性 | `meta.cache.hive.file.ttl-second` | 仅对应文件列表新鲜度。若希望新增/删除文件每次查询立即可见,设置为 `0`。 | -```sql -ALTER CATALOG hive_ctl SET PROPERTIES ( - "meta.cache.hive.file.ttl-second" = "0" -); -``` +`meta.cache.hive.partition.*` 是 4.1.x 中单独可调的新模块,4.1 前没有一一对应的 Catalog 级 TTL 键。若您关心分区 Location、Serde、InputFormat 等属性变更的可见性,需要在升级后单独设置它。 -### 可观测性 {#meta-cache-unified-observability} +4.1.x 的统一模型把每个缓存拆分为 `enable`、`ttl-second`、`capacity` 三个维度;旧键只表达 TTL,不表达是否启用和容量上限。升级后如果只完成键名替换而不评估 `enable/capacity`,则其余行为会落到 4.1.x 的默认值。 -Hive 缓存指标可通过 `information_schema.catalog_meta_cache_statistics` 查询。 -系统表字段与指标说明见:[catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md)。 +### 最佳实践 {#meta-cache-best-practices} -Hive 常见 entry: +* **实时查看最新数据**:如果您希望每次查询都能看到外部数据源的最新分区或文件变动,可以将对应的 `ttl-second` 设置为 `0`。 + ```sql + -- 关闭文件列表缓存,实时看到文件变动 + ALTER CATALOG hive_ctl SET PROPERTIES ("meta.cache.hive.file.ttl-second" = "0"); + -- 关闭分区值缓存,实时看到新增分区 + ALTER CATALOG hive_ctl SET PROPERTIES ("meta.cache.hive.partition_values.ttl-second" = "0"); + ``` +* **性能优化**:对于元数据变动不频繁的场景,建议适当增大 `capacity` 和 `ttl-second` 以减少对 Hive Metastore 和文件系统的访问压力。 -| Entry | 含义 | -|---|---| -| `schema` | Schema cache entry | -| `partition_values` | 分区名称 / 分区值缓存 entry | -| `partition` | 分区属性缓存 entry | -| `file` | 文件列表缓存 entry | +:::caution +**Hive Catalog 注意事项**:Hive 的 `meta.cache.hive.*` 属性修改**不支持热生效**。修改配置后,必须重建 Catalog 或重启 FE 节点才能应用新的缓存配置。 +::: + +### 可观测性 {#meta-cache-unified-observability} -示例查询: +可以通过 `information_schema.catalog_meta_cache_statistics` 系统表观测缓存指标: ```sql SELECT catalog_name, engine_name, entry_name, effective_enabled, ttl_second, capacity, estimated_size, hit_rate, load_failure_count, last_error FROM 
information_schema.catalog_meta_cache_statistics -WHERE catalog_name = 'hive_ctl' - AND engine_name = 'hive' +WHERE catalog_name = 'hive_ctl' AND engine_name = 'hive' ORDER BY entry_name; ``` +该系统表文档见:[catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md)。 + ### 支持的 Hive 版本 支持 Hive 1.x,2.x,3.x,4.x。 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/hudi-catalog.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/hudi-catalog.md index 98e151ed3646b..d70163d1eb720 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/hudi-catalog.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/hudi-catalog.md @@ -51,61 +51,74 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( | ------------------------------- | -------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | ----- | | `hudi.use_hive_sync_partition` | `use_hive_sync_partition` | 是否使用 Hive Metastore 已同步的分区信息。如果为 true,则会直接从 Hive Metastore 中获取分区信息。否则,会从文件系统的元数据文件中获取分区信息。通过 Hive Metastore 获取信息性能更好,但需要用户保证最新的元数据已经同步到了 Hive Metastore。 | false | -## 元数据缓存(4.1.x+) {#meta-cache-unified} +## 元数据缓存 {#meta-cache} +为了提升访问外部数据源的性能,Apache Doris 会对 Hudi 的元数据进行缓存。元数据包括表结构(Schema)、分区信息、FS View 和 Meta Client 对象等。 + +:::tip +对于 Doris 4.1.x 之前的版本,元数据缓存主要由 FE 配置项全局控制,详情请参阅[元数据缓存](../meta-cache.md)。 从 Doris 4.1.x 开始,Hudi 相关外表元数据缓存使用统一键 `meta.cache.*` 进行配置。 -本节说明 Hudi 相关 cache 模块的配置与观测方式。 +::: -统一属性语义可参阅:[统一外表元数据缓存(4.1.x+)](../meta-cache/unified-meta-cache.md)。 +### 统一属性模型(4.1.x+) {#meta-cache-unified-model} -### 缓存模块 {#meta-cache-unified-modules} +各引擎 cache entry 使用统一的配置键格式:`meta.cache...{enable,ttl-second,capacity}`。 -| 模块 | 属性键前缀 | 典型缓存内容 | +| 属性 | 示例 | 含义 | |---|---|---| -| `schema` | `meta.cache.hudi.schema.` | 表 schema 加载对应的 schema cache 
entry。 | -| `partition` | `meta.cache.hudi.partition.` | Hudi 分区相关元数据(用于分区发现/剪枝等)。 | -| `fs_view` | `meta.cache.hudi.fs_view.` | Hudi FS View 相关元数据。 | -| `meta_client` | `meta.cache.hudi.meta_client.` | Hudi Meta Client 相关元数据。 | +| `enable` | `true/false` | 是否启用该缓存模块。 | +| `ttl-second` | `600`、`0`、`-1` | `0` 表示关闭缓存(即刻生效,可用于查看最新数据);`-1` 表示永不过期;其他正整数表示按访问时间计算 TTL(秒)。 | +| `capacity` | `10000` | 最大缓存条目数(按条目数量计)。`0` 表示关闭。 | -说明: +**生效逻辑说明:** 只有当 `enable=true` 且 `ttl-second != 0` 且 `capacity > 0` 时,该模块缓存才会生效。 -- 属性键使用上表中的模块名。这些名字也会出现在 `information_schema.catalog_meta_cache_statistics` 的 `ENTRY_NAME` 中。 -- 如果 Hudi 表是通过 HMS catalog 提供访问的,`meta.cache.hudi.*` 也配置在该 HMS catalog 上。 +### 缓存模块 {#meta-cache-unified-modules} -示例: +Hudi Catalog 包含以下缓存模块: -```sql -ALTER CATALOG hudi_ctl SET PROPERTIES ( - "meta.cache.hudi.fs_view.capacity" = "2000" -); -``` +| 模块 (``) | 属性键前缀 | 缓存内容与影响 | +|---|---|---| +| `schema` | `meta.cache.hudi.schema.` | 缓存表结构。影响:列新增、删除、类型变更在 Doris 中的可见性。若关闭,每次查询都会拉取最新 Schema。 | +| `partition` | `meta.cache.hudi.partition.` | 缓存 Hudi 分区相关元数据。影响:分区发现、分区裁剪,以及新增/删除分区何时在 Doris 中可见。 | +| `fs_view` | `meta.cache.hudi.fs_view.` | 缓存 Hudi 文件系统视图相关元数据。影响:查询规划时选择到的最新 base file / log file 以及 file slice 视图的新鲜度。 | +| `meta_client` | `meta.cache.hudi.meta_client.` | 缓存 Hudi Meta Client 对象。影响:时间线(timeline)、表配置等底层元数据重新加载的频率,以及提交/表配置变更何时被感知。 | -### 可观测性 {#meta-cache-unified-observability} +### 旧参数映射与转换 {#meta-cache-mapping} + +在 4.1.x 之前,Hudi 的 Schema 缓存有 Catalog 兼容属性,分区与表级元数据则主要遵循旧的 FE 全局缓存模型,详见[元数据缓存](../meta-cache.md)。升级到 4.1.x 后,建议统一改写为 `meta.cache.hudi.*`,并分别配置分区、FS View 和 Meta Client。 + +| 4.1 前属性键/旧模型 | 适用范围 | 4.1.x+ 统一键 | 升级建议与影响 | +|---|---|---|---| +| `schema.cache.ttl-second` | 4.1 前 Hudi Catalog 兼容属性 | `meta.cache.hudi.schema.ttl-second` | 控制 Schema 新鲜度。若希望列变更每次查询立即可见,设置为 `0`。 | +| Hudi 分区旧模型 | 4.1 前 FE 全局缓存策略(见旧版元数据缓存文档) | `meta.cache.hudi.partition.ttl-second` | 控制分区发现与分区可见性。若希望新增/删除分区每次查询立即可见,设置为 `0`。 | +| 无一一对应的旧 Catalog 键 | 4.1 前未单独暴露 `fs_view` / 
`meta_client` TTL | `meta.cache.hudi.fs_view.*`、`meta.cache.hudi.meta_client.*` | 这是 4.1.x 中拆分出的新模块。若希望更快感知最新 file slice 或提交时间线,分别调低对应 `ttl-second`。 | -Hudi 缓存指标可通过 `information_schema.catalog_meta_cache_statistics` 查询。 -系统表字段与指标说明见:[catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md)。 +4.1.x 的统一模型把缓存拆分为 `enable`、`ttl-second`、`capacity` 三个维度;旧模型主要描述 TTL/全局缓存行为。升级后如果仍沿用旧理解,容易遗漏 `fs_view`、`meta_client` 这类新模块的单独配置。 -Hudi 常见 entry: +### 最佳实践 {#meta-cache-best-practices} -| Entry | 含义 | -|---|---| -| `schema` | Schema cache entry | -| `partition` | 分区元数据缓存 entry | -| `fs_view` | FS View 缓存 entry | -| `meta_client` | Meta Client 缓存 entry | +* **实时查看最新数据**:如果您希望每次查询都能看到 Hudi 表的最新数据变动或 Schema 变更,可以将 `schema` 或 `partition` 的 `ttl-second` 设置为 `0`。 + ```sql + -- 关闭分区元数据缓存,以感知 Hudi 表的最新分区变动 + ALTER CATALOG hudi_ctl SET PROPERTIES ("meta.cache.hudi.partition.ttl-second" = "0"); + ``` +* **性能优化**:`ALTER CATALOG ... SET PROPERTIES` 的修改在 Hudi 中支持热生效(通过 HMS catalog 属性更新路径)。 -示例查询: +### 可观测性 {#meta-cache-unified-observability} + +可以通过 `information_schema.catalog_meta_cache_statistics` 系统表观测缓存指标: ```sql SELECT catalog_name, engine_name, entry_name, effective_enabled, ttl_second, capacity, estimated_size, hit_rate, load_failure_count, last_error FROM information_schema.catalog_meta_cache_statistics -WHERE catalog_name = 'hudi_ctl' - AND engine_name = 'hudi' +WHERE catalog_name = 'hudi_ctl' AND engine_name = 'hudi' ORDER BY entry_name; ``` +该系统表文档见:[catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md)。 + ### 支持的 Hudi 版本 当前依赖的 Hudi 版本为 0.15。推荐访问 0.14 版本以上的 Hudi 数据。 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/iceberg-catalog.mdx b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/iceberg-catalog.mdx index d7b1c05c92956..2b7ee3b809cf3 100644 --- 
a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/iceberg-catalog.mdx +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/iceberg-catalog.mdx @@ -87,64 +87,82 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( CommonProperties 部分用于填写通用属性。请参阅[数据目录概述](../catalog-overview.md)中【通用属性】部分。 -## 元数据缓存(4.1.x+) {#meta-cache-unified} +## 元数据缓存 {#meta-cache} +为了提升访问外部数据源的性能,Apache Doris 会对 Iceberg 的元数据进行缓存。元数据包括表结构(Schema)、表对象、View 对象和 Manifest 详情等。 + +:::tip +对于 Doris 4.1.x 之前的版本,元数据缓存主要由 FE 配置项全局控制,详情请参阅[元数据缓存](../meta-cache.md)。 从 Doris 4.1.x 开始,Iceberg Catalog 的外表元数据缓存使用统一键 `meta.cache.*` 进行配置。 -本节说明 Iceberg 相关 cache 模块的配置与观测方式。 +::: -统一属性语义可参阅:[统一外表元数据缓存(4.1.x+)](../meta-cache/unified-meta-cache.md)。 +### 统一属性模型(4.1.x+) {#meta-cache-unified-model} -### 缓存模块 {#meta-cache-unified-modules} +各引擎 cache entry 使用统一的配置键格式:`meta.cache...{enable,ttl-second,capacity}`。 -| 模块 | 属性键前缀 | 典型缓存内容 | +| 属性 | 示例 | 含义 | |---|---|---| -| `schema` | `meta.cache.iceberg.schema.` | 表 schema 加载对应的 schema cache entry。 | -| `table` | `meta.cache.iceberg.table.` | Iceberg 表元数据对象(减少 catalog/metastore 往返)。 | -| `view` | `meta.cache.iceberg.view.` | Iceberg View 元数据对象。 | -| `manifest` | `meta.cache.iceberg.manifest.` | manifest 相关元数据(减少重复读取 manifest 的开销)。 | +| `enable` | `true/false` | 是否启用该缓存模块。 | +| `ttl-second` | `600`、`0`、`-1` | `0` 表示关闭缓存(即刻生效,可用于查看最新数据);`-1` 表示永不过期;其他正整数表示按访问时间计算 TTL(秒)。 | +| `capacity` | `10000` | 最大缓存条目数(按条目数量计)。`0` 表示关闭。 | -说明: +**生效逻辑说明:** 只有当 `enable=true` 且 `ttl-second != 0` 且 `capacity > 0` 时,该模块缓存才会生效。 -- 属性键使用上表中的模块名。这些名字也会出现在 `information_schema.catalog_meta_cache_statistics` 的 `ENTRY_NAME` 中。 -- 当前实现中,`manifest` 默认是关闭的,调 TTL / capacity 之前要先显式打开。 -- `view` entry 只有在 Doris 访问 Iceberg View 时才会出现。 -- `ALTER CATALOG ... 
SET PROPERTIES` 的修改通过统一热生效路径应用。 +### 缓存模块 {#meta-cache-unified-modules} -示例: +Iceberg Catalog 包含以下缓存模块: -```sql -ALTER CATALOG iceberg_ctl SET PROPERTIES ( - "meta.cache.iceberg.manifest.enable" = "true", - "meta.cache.iceberg.manifest.ttl-second" = "600" -); -``` +| 模块 (``) | 属性键前缀 | 缓存内容与影响 | +|---|---|---| +| `schema` | `meta.cache.iceberg.schema.` | 缓存表结构。影响:列新增、删除、类型变更在 Doris 中的可见性。若关闭,每次查询都会拉取最新 Schema。 | +| `table` | `meta.cache.iceberg.table.` | 缓存 Iceberg 表元数据对象。影响:最新 Snapshot、Partition Spec、Sort Order、表属性等表级元数据在 Doris 中的可见性;若关闭,每次规划都会重新加载表元数据。 | +| `view` | `meta.cache.iceberg.view.` | 缓存 Iceberg View 元数据对象。影响:View 定义、Schema、属性变更在 Doris 中的可见性。 | +| `manifest` | `meta.cache.iceberg.manifest.` | 缓存 Manifest 详情。主要影响查询规划时重复读取 Manifest 文件的开销,通常不直接决定表或 Snapshot 是否可见。注意:该模块默认关闭,需手动启用。 | -### 可观测性 {#meta-cache-unified-observability} +### 旧参数映射与转换 {#meta-cache-mapping} + +在 4.1.x 之前,Iceberg 表级元数据主要受 FE 全局缓存策略控制,`schema.cache.ttl-second` 是常见的 Catalog 兼容属性;详见[元数据缓存](../meta-cache.md)。升级到 4.1.x 后,建议改写为 `meta.cache.iceberg.*`,并按需要分别配置表、View 和 Manifest 缓存。 + +| 4.1 前属性键/旧模型 | 适用范围 | 4.1.x+ 统一键 | 升级建议与影响 | +|---|---|---|---| +| `schema.cache.ttl-second` | 4.1 前 Iceberg Catalog 兼容属性 | `meta.cache.iceberg.schema.ttl-second` | 控制 Schema 新鲜度。若希望列变更每次查询立即可见,设置为 `0`。 | +| Iceberg 表信息旧模型 | 4.1 前 FE 全局缓存策略(见旧版元数据缓存文档) | `meta.cache.iceberg.table.ttl-second` | 控制表级元数据新鲜度。若希望每次查询都读取最新 Snapshot/表属性,设置为 `0`。 | +| 无一一对应的旧 Catalog 键 | 4.1 前未单独暴露 View / Manifest TTL | `meta.cache.iceberg.view.*`、`meta.cache.iceberg.manifest.*` | 这是 4.1.x 中拆分出的新模块。升级后如需保证最新 View 定义,单独调低 `view.ttl-second`;`manifest` 主要用于性能优化。 | + +4.1.x 的统一模型把缓存拆分为 `enable`、`ttl-second`、`capacity` 三个维度;旧模型主要描述 TTL/全局缓存行为,不覆盖这些新模块的独立开关和容量上限。升级时建议一并评估是否需要补充 `enable/capacity`。 -Iceberg 缓存指标可通过 `information_schema.catalog_meta_cache_statistics` 查询。 -系统表字段与指标说明见:[catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md)。 +### 最佳实践 {#meta-cache-best-practices} 
-Iceberg 常见 entry: +* **实时查看最新数据**:如果您希望每次查询都能看到 Iceberg 表的最新快照或 Schema 变动,可以将 `schema` 或 `table` 的 `ttl-second` 设置为 `0`。 + ```sql + -- 关闭表对象缓存,以便感知快照变动 + ALTER CATALOG iceberg_ctl SET PROPERTIES ("meta.cache.iceberg.table.ttl-second" = "0"); + ``` +* **性能优化**: + * 启用 Manifest 缓存可以显著提升大表的查询规划速度: + ```sql + ALTER CATALOG iceberg_ctl SET PROPERTIES ( + "meta.cache.iceberg.manifest.enable" = "true", + "meta.cache.iceberg.manifest.ttl-second" = "600" + ); + ``` + * `ALTER CATALOG ... SET PROPERTIES` 的修改在 Iceberg Catalog 中支持热生效。 -| Entry | 含义 | -|---|---| -| `schema` | Schema cache entry | -| `table` | 表元数据缓存 entry | -| `view` | View 元数据缓存 entry | -| `manifest` | Manifest payload 缓存 entry | +### 可观测性 {#meta-cache-unified-observability} -示例查询: +可以通过 `information_schema.catalog_meta_cache_statistics` 系统表观测缓存指标: ```sql SELECT catalog_name, engine_name, entry_name, effective_enabled, ttl_second, capacity, estimated_size, hit_rate, load_failure_count, last_error FROM information_schema.catalog_meta_cache_statistics -WHERE catalog_name = 'iceberg_ctl' - AND engine_name = 'iceberg' +WHERE catalog_name = 'iceberg_ctl' AND engine_name = 'iceberg' ORDER BY entry_name; ``` +该系统表文档见:[catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md)。 + ### 支持的 Iceberg 版本 | Doris 版本 | Iceberg SDK 版本 | diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/maxcompute-catalog.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/maxcompute-catalog.md index 9fcbc9b98277a..9bf2c67449187 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/maxcompute-catalog.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/maxcompute-catalog.md @@ -111,51 +111,71 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( CommonProperties 部分用于填写通用属性。请参阅[数据目录概述](../catalog-overview.md)中「通用属性」部分。 -## 元数据缓存(4.1.x+) {#meta-cache-unified} +## 元数据缓存 
{#meta-cache} +为了提升访问外部数据源的性能,Apache Doris 会对 MaxCompute 的元数据进行缓存。元数据包括表结构(Schema)和分区列表等。 + +:::tip +对于 Doris 4.1.x 之前的版本,元数据缓存主要由 FE 配置项全局控制,详情请参阅[元数据缓存](../meta-cache.md)。 从 Doris 4.1.x 开始,MaxCompute Catalog 的外表元数据缓存使用统一键 `meta.cache.*` 进行配置。 -本节说明 MaxCompute 相关 cache 模块的配置与观测方式。 +::: -统一属性语义可参阅:[统一外表元数据缓存(4.1.x+)](../meta-cache/unified-meta-cache.md)。 +### 统一属性模型(4.1.x+) {#meta-cache-unified-model} + +各引擎 cache entry 使用统一的配置键格式:`meta.cache...{enable,ttl-second,capacity}`。 + +| 属性 | 示例 | 含义 | +|---|---|---| +| `enable` | `true/false` | 是否启用该缓存模块。 | +| `ttl-second` | `600`、`0`、`-1` | `0` 表示关闭缓存(即刻生效,可用于查看最新数据);`-1` 表示永不过期;其他正整数表示按访问时间计算 TTL(秒)。 | +| `capacity` | `10000` | 最大缓存条目数(按条目数量计)。`0` 表示关闭。 | + +**生效逻辑说明:** 只有当 `enable=true` 且 `ttl-second != 0` 且 `capacity > 0` 时,该模块缓存才会生效。 ### 缓存模块 {#meta-cache-unified-modules} -| 模块 | 属性键前缀 | 典型缓存内容 | +MaxCompute Catalog 包含以下缓存模块: + +| 模块 (``) | 属性键前缀 | 缓存内容与影响 | |---|---|---| -| `schema` | `meta.cache.maxcompute.schema.` | 表 schema 加载对应的 schema cache entry。 | -| `partition_values` | `meta.cache.maxcompute.partition_values.` | 分区值缓存 entry,用于分区剪枝与分区枚举。 | +| `schema` | `meta.cache.maxcompute.schema.` | 缓存表结构。影响:列新增、删除、类型变更在 Doris 中的可见性。若关闭,每次查询都会拉取最新 Schema。 | +| `partition_values` | `meta.cache.maxcompute.partition_values.` | 缓存分区值列表。影响:分区裁剪、分区枚举,以及新增/删除分区何时在 Doris 中可见。若关闭,可实时查看到分区变动。 | -说明: +### 旧参数映射与转换 {#meta-cache-mapping} -- 属性键使用上表中的模块名。这些名字也会出现在 `information_schema.catalog_meta_cache_statistics` 的 `ENTRY_NAME` 中。 -- `partition_values` 通过 `meta.cache.maxcompute.partition_values.*` 配置。 -- stats 表里能直接看到的 MaxCompute entry 只有 `partition_values` 和 `schema`。 -- `meta.cache.maxcompute.*` 目前没有专门的热生效 hook。 +在 4.1.x 之前,MaxCompute 的 Schema 和分区相关缓存主要通过 Catalog 兼容属性或 FE 全局缓存策略控制。升级到 4.1.x 后,建议统一改写为 `meta.cache.maxcompute.*`。 -### 可观测性 {#meta-cache-unified-observability} +| 4.1 前属性键/旧模型 | 适用范围 | 4.1.x+ 统一键 | 升级建议与影响 | +|---|---|---|---| +| `schema.cache.ttl-second` | 4.1 前 MaxCompute Catalog 兼容属性 | 
`meta.cache.maxcompute.schema.ttl-second` | 控制 Schema 新鲜度。若希望列变更每次查询立即可见,设置为 `0`。 | +| MaxCompute 分区值旧模型 | 4.1 前 FE 全局缓存策略 | `meta.cache.maxcompute.partition_values.ttl-second` | 控制分区枚举与分区可见性。若希望新增/删除分区每次查询立即可见,设置为 `0`。 | + +4.1.x 的统一模型把缓存拆分为 `enable`、`ttl-second`、`capacity` 三个维度;旧模型主要表达 TTL/全局缓存行为。升级后如果只迁移 TTL 而不评估 `enable/capacity`,其余行为会使用 4.1.x 的默认值。 -MaxCompute 缓存指标可通过 `information_schema.catalog_meta_cache_statistics` 查询。 -系统表字段与指标说明见:[catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md)。 +### 最佳实践 {#meta-cache-best-practices} -MaxCompute 常见 entry: +* **实时查看最新数据**:如果您希望每次查询都能看到 MaxCompute 表的最新分区变动或 Schema 变更,可以将 `schema` 或 `partition_values` 的 `ttl-second` 设置为 `0`。 + ```sql + -- 关闭分区值缓存,以感知 MaxCompute 表的最新分区 + ALTER CATALOG mc_ctl SET PROPERTIES ("meta.cache.maxcompute.partition_values.ttl-second" = "0"); + ``` +* **注意**:`meta.cache.maxcompute.*` 目前没有专门的热生效 hook。修改配置后,建议重建 Catalog 或重启 FE 以确保生效。 -| Entry | 含义 | -|---|---| -| `schema` | Schema cache entry | -| `partition_values` | 分区值缓存 entry | +### 可观测性 {#meta-cache-unified-observability} -示例查询: +可以通过 `information_schema.catalog_meta_cache_statistics` 系统表观测缓存指标: ```sql SELECT catalog_name, engine_name, entry_name, effective_enabled, ttl_second, capacity, estimated_size, hit_rate, load_failure_count, last_error FROM information_schema.catalog_meta_cache_statistics -WHERE catalog_name = 'mc_ctl' - AND engine_name = 'maxcompute' +WHERE catalog_name = 'mc_ctl' AND engine_name = 'maxcompute' ORDER BY entry_name; ``` +该系统表文档见:[catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md)。 + ### 支持的 MaxCompute 版本 仅支持公有云版本的 MaxCompute。私有云版本支持请联系 Doris 社区支持。 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/paimon-catalog.mdx b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/paimon-catalog.mdx index d277f5ac1de00..a77565176f6a0 100644 --- 
a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/paimon-catalog.mdx +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/paimon-catalog.mdx @@ -90,58 +90,71 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( CommonProperties 部分用于填写通用属性。请参阅[数据目录概述](../catalog-overview.md)中【通用属性】部分。 -## 元数据缓存(4.1.x+) {#meta-cache-unified} +## 元数据缓存 {#meta-cache} +为了提升访问外部数据源的性能,Apache Doris 会对 Paimon 的元数据进行缓存。元数据包括表结构(Schema)和表对象等。 + +:::tip +对于 Doris 4.1.x 之前的版本,元数据缓存主要由 FE 配置项全局控制,详情请参阅[元数据缓存](../meta-cache.md)。 从 Doris 4.1.x 开始,Paimon Catalog 的外表元数据缓存使用统一键 `meta.cache.*` 进行配置。 -本节说明 Paimon 相关 cache 模块的配置与观测方式。 +::: -统一属性语义可参阅:[统一外表元数据缓存(4.1.x+)](../meta-cache/unified-meta-cache.md)。 +### 统一属性模型(4.1.x+) {#meta-cache-unified-model} -### 缓存模块 {#meta-cache-unified-modules} +各引擎 cache entry 使用统一的配置键格式:`meta.cache...{enable,ttl-second,capacity}`。 -| 模块 | 属性键前缀 | 典型缓存内容 | +| 属性 | 示例 | 含义 | |---|---|---| -| `schema` | `meta.cache.paimon.schema.` | 表 schema 加载对应的 schema cache entry。 | -| `table` | `meta.cache.paimon.table.` | Paimon 表元数据(用于查询规划,实际涉及 schema/snapshot/partition 等元数据加载)。 | +| `enable` | `true/false` | 是否启用该缓存模块。 | +| `ttl-second` | `600`、`0`、`-1` | `0` 表示关闭缓存(即刻生效,可用于查看最新数据);`-1` 表示永不过期;其他正整数表示按访问时间计算 TTL(秒)。 | +| `capacity` | `10000` | 最大缓存条目数(按条目数量计)。`0` 表示关闭。 | -说明: +**生效逻辑说明:** 只有当 `enable=true` 且 `ttl-second != 0` 且 `capacity > 0` 时,该模块缓存才会生效。 -- 属性键使用上表中的模块名。这些名字也会出现在 `information_schema.catalog_meta_cache_statistics` 的 `ENTRY_NAME` 中。 -- `schema` 和 `table` 是两个独立 entry。若 catalog 未设置 `meta.cache.paimon.schema.*`,`schema` 使用 FE 默认值。 -- `ALTER CATALOG ... 
SET PROPERTIES` 的修改通过统一热生效路径应用。 +### 缓存模块 {#meta-cache-unified-modules} -示例: +Paimon Catalog 包含以下缓存模块: -```sql -ALTER CATALOG paimon_ctl SET PROPERTIES ( - "meta.cache.paimon.table.ttl-second" = "0" -); -``` +| 模块 (``) | 属性键前缀 | 缓存内容与影响 | +|---|---|---| +| `schema` | `meta.cache.paimon.schema.` | 缓存表结构。影响:列新增、删除、类型变更在 Doris 中的可见性。若关闭,每次查询都会拉取最新 Schema。 | +| `table` | `meta.cache.paimon.table.` | 缓存 Paimon 表元数据对象。影响:最新 Snapshot、Schema 演进、分支/标签引用等表级元数据在 Doris 中的可见性,同时减少查询规划时的元数据加载开销。 | -### 可观测性 {#meta-cache-unified-observability} +### 旧参数映射与转换 {#meta-cache-mapping} -Paimon 缓存指标可通过 `information_schema.catalog_meta_cache_statistics` 查询。 -系统表字段与指标说明见:[catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md)。 +在 4.1.x 之前,Paimon 的 Schema 与表级元数据主要遵循旧的 Catalog 兼容属性或 FE 全局缓存模型,详见[元数据缓存](../meta-cache.md)。升级到 4.1.x 后,建议改写为 `meta.cache.paimon.*`,并单独评估表级缓存是否需要更强的新鲜度。 -Paimon 常见 entry: +| 4.1 前属性键/旧模型 | 适用范围 | 4.1.x+ 统一键 | 升级建议与影响 | +|---|---|---|---| +| `schema.cache.ttl-second` | 4.1 前 Paimon Catalog 兼容属性 | `meta.cache.paimon.schema.ttl-second` | 控制 Schema 新鲜度。若希望列变更每次查询立即可见,设置为 `0`。 | +| Paimon 表级旧模型 | 4.1 前 FE 全局缓存策略(见旧版元数据缓存文档中的旧表级元数据模型说明) | `meta.cache.paimon.table.ttl-second` | 控制最新 Snapshot/表级元数据的可见性。若希望每次查询都读取最新表快照,设置为 `0`。 | -| Entry | 含义 | -|---|---| -| `schema` | Schema cache entry | -| `table` | 表元数据缓存 entry | +4.1.x 的统一模型把缓存拆分为 `enable`、`ttl-second`、`capacity` 三个维度;旧模型主要表达 TTL/全局缓存行为。升级后建议不要只替换键名,还要同时评估是否需要显式配置 `enable/capacity`。 -示例查询: +### 最佳实践 {#meta-cache-best-practices} + +* **实时查看最新数据**:如果您希望每次查询都能看到 Paimon 表的最新数据变动或 Schema 变更,可以将 `schema` 或 `table` 的 `ttl-second` 设置为 `0`。 + ```sql + -- 关闭表对象缓存,以感知 Paimon 表的最新快照 + ALTER CATALOG paimon_ctl SET PROPERTIES ("meta.cache.paimon.table.ttl-second" = "0"); + ``` +* **性能优化**:`ALTER CATALOG ... 
SET PROPERTIES` 的修改在 Paimon Catalog 中支持热生效。 + +### 可观测性 {#meta-cache-unified-observability} + +可以通过 `information_schema.catalog_meta_cache_statistics` 系统表观测缓存指标: ```sql SELECT catalog_name, engine_name, entry_name, effective_enabled, ttl_second, capacity, estimated_size, hit_rate, load_failure_count, last_error FROM information_schema.catalog_meta_cache_statistics -WHERE catalog_name = 'paimon_ctl' - AND engine_name = 'paimon' +WHERE catalog_name = 'paimon_ctl' AND engine_name = 'paimon' ORDER BY entry_name; ``` +该系统表文档见:[catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md)。 + ### 支持的 Paimon 版本 当前依赖的 Paimon 版本为 1.0.0。 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/meta-cache.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/meta-cache.md index 506f5f3031833..cf6145a5925a9 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/meta-cache.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/meta-cache.md @@ -6,32 +6,19 @@ } --- -为了提升访问外部数据源的性能,Apache Doris 会对外部数据源的**元数据**进行缓存。 - -元数据包括库、表、列信息、分区信息、快照信息、文件列表等。 - -本文详细介绍缓存的元数据的种类、策略和相关参数配置。 - -关于**数据缓存**,可参阅[数据缓存文档](./data-cache.md)。 - :::tip -该文档适用于 2.1.6 之后的版本。 +该文档主要适用于 Doris 4.1.x 之前的版本。 +对于 Doris 4.1.x 及之后版本,外表元数据缓存已重构并使用统一配置键 `meta.cache.*`,请直接参阅各 [Catalog](./catalog-overview.md) 文档中的“元数据缓存”章节。 +如果您正在从 4.1.x 之前的版本升级,请以各 Catalog 页中的“旧参数映射与转换”为准,将旧参数改写为 `meta.cache.*` 统一键。 ::: -:::note -对于 Doris 4.1.x 及之后版本,外表元数据缓存已重构并使用统一配置键 `meta.cache.*`。 -请参阅[统一外表元数据缓存(4.1.x+)](./meta-cache/unified-meta-cache.md)。 - -从 Doris 4.1.x 开始,外表元数据缓存可以分成两层来理解: +为了提升访问外部数据源的性能,Apache Doris 会对外部数据源的**元数据**进行缓存。 -- 通用 catalog 缓存:库/表名称列表、库/表对象等,仍由 `max_meta_object_cache_num`、`external_cache_refresh_time_minutes`、`external_cache_expire_time_seconds_after_access` 等 FE 配置控制。 -- 引擎特定 entry 缓存:schema、分区元数据、manifest、文件列表等,这些按 catalog 使用统一键 `meta.cache...{enable,ttl-second,capacity}` 配置。 
+元数据包括库、表、列信息、分区信息、快照信息、文件列表等。 -统一文档主要描述第二层。 -::: +本文详细介绍旧版本(pre-4.1)中缓存的元数据的种类、策略和相关参数配置。 -本文主体主要记录 2.1.x / 3.x 旧缓存模型中的 FE 默认值与兼容参数。 -对于 Doris 4.1.x+ 的当前引擎级 cache entry,请直接阅读统一页和各 Catalog 文档。 +关于**数据缓存**,可参阅[数据缓存文档](./data-cache.md)。 ## 缓存策略 @@ -220,7 +207,7 @@ ### Hudi 表分区 这里描述的是 Hudi 分区元数据缓存的旧模型摘要。 -对于 Doris 4.1.x+ 的当前 Hudi cache entry(如 `fs_view`、`meta_client`),请参阅 [Hudi Catalog](./catalogs/hudi-catalog.md#meta-cache-unified)。 +对于 Doris 4.1.x+ 的当前 Hudi cache entry(如 `fs_view`、`meta_client`),请参阅 [Hudi Catalog](./catalogs/hudi-catalog.md#meta-cache)。 该缓存,每个 Hudi Catalog 有一个。 @@ -243,7 +230,7 @@ ### Iceberg 表信息 这里描述的是 Iceberg 表元数据缓存的旧模型摘要。表对象通过 Iceberg API 加载并构建。 -对于 Doris 4.1.x+ 的当前可观测 cache entry,请参阅 [Iceberg Catalog](./catalogs/iceberg-catalog.mdx#meta-cache-unified)。 +对于 Doris 4.1.x+ 的当前可观测 cache entry,请参阅 [Iceberg Catalog](./catalogs/iceberg-catalog.mdx#meta-cache)。 该缓存,每个 Iceberg Catalog 有一个。 @@ -332,12 +319,6 @@ CREATE CATALOG hive PROPERTIES ( 对于所有类型的 External Catalog,如果希望实时可见最新的 Table Schema,可以关闭 Schema Cache: -:::note -对于 Doris 4.1.x+,推荐使用统一键 `meta.cache..schema.ttl-second = "0"`。 -详细说明请参阅: -[统一外表元数据缓存(4.1.x+)](./meta-cache/unified-meta-cache.md)。 -::: - - 全局关闭 ```text @@ -345,13 +326,6 @@ CREATE CATALOG hive PROPERTIES ( max_external_schema_cache_num=0 // 关闭 Schema 缓存。 ``` -- Doris 4.1.x+ 的 catalog 级关闭方式 - - ```text - -- Catalog property - "meta.cache..schema.ttl-second" = "0" - ``` - - 旧的 catalog 级兼容参数 ```text @@ -365,12 +339,6 @@ CREATE CATALOG hive PROPERTIES ( 针对 Hive Catalog,如果想关闭缓存来查询到实时更新的数据,可以配置以下参数: -:::note -对于 Doris 4.1.x+,推荐优先使用统一键 `meta.cache.hive.*`,并参考: -[Hive Catalog](./catalogs/hive-catalog.mdx#meta-cache-unified) 与 -[统一外表元数据缓存(4.1.x+)](./meta-cache/unified-meta-cache.md)。 -::: - - 全局关闭 ```text @@ -380,15 +348,6 @@ CREATE CATALOG hive PROPERTIES ( max_hive_partition_cache_num=0 // 关闭分区属性缓存 ``` -- Doris 4.1.x+ 的 catalog 级关闭方式 - - ```text - -- Catalog property - "meta.cache.hive.partition_values.ttl-second" = "0" // 关闭分区列表缓存 - 
"meta.cache.hive.partition.ttl-second" = "0" // 关闭分区属性缓存 - "meta.cache.hive.file.ttl-second" = "0" // 关闭文件列表缓存 - ``` - - 旧的 catalog 级兼容参数 ```text diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/meta-cache/unified-meta-cache.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/meta-cache/unified-meta-cache.md deleted file mode 100644 index a377924c48143..0000000000000 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/meta-cache/unified-meta-cache.md +++ /dev/null @@ -1,110 +0,0 @@ ---- -{ - "title": "统一外表元数据缓存(4.1.x+)", - "language": "zh-CN", - "description": "面向用户的统一外表元数据缓存使用说明:统一配置键 meta.cache.*、缓存覆盖范围、以及各类 Catalog 的配置入口。" -} ---- - -从 **Doris 4.1.x** 开始,External Catalog 的外表元数据缓存能力进行了统一化重构。统一外表元数据缓存标准化了不同数据湖引擎(如 Hive, Iceberg 等)的缓存配置模型与监控体系,降低了多数据源管理的配置门槛和排障难度。 - -对用户来说,主要关注三件事: - -- **影响哪些内容**:取决于不同 catalog 引擎(分区信息、文件列表、表元数据、manifest 等)。 -- **在哪里配置**:在 Catalog `PROPERTIES` 里使用统一键 `meta.cache.*`(具体 entry 见下方各 catalog 文档)。 -- **如何观测**:通过 `information_schema.catalog_meta_cache_statistics` 系统表查看指标(见本文观测章节)。 - -:::tip -适用于 Doris 4.1.x 及之后版本。 -::: - -## 外表 Meta Cache 覆盖范围 - -在学习如何配置之前,首先需要明确该缓存覆盖的范围。这里有两层缓存比较容易混淆: - -- **Catalog 对象/名称缓存**:如 `SHOW DATABASES`、`SHOW TABLES`、库对象、表对象等,这部分属于通用缓存,详情见 [元数据缓存](../meta-cache.md)。 -- **引擎 entry 缓存**:各引擎特有的运行时元数据,如 Hive 分区/文件、Iceberg manifest、Paimon table handle、schema entry 等。**本文主要针对这一层进行说明。** - -外表元数据 cache entry 覆盖多种元数据类型。其中一部分由统一 `meta.cache.*` 键配置,另一部分同时继承 FE 级默认值: - -| 类别 | 示例 | 配置方式 | -|---|---|---| -| 引擎 entry 缓存 | Hive `partition_values` / `partition` / `file`、Iceberg `manifest`、Paimon `table` 等 | Catalog `PROPERTIES`:`meta.cache...*` | -| Schema cache | 各引擎自己的 `schema` entry,按 schema version token 隔离 | FE 配置提供默认值,Catalog `meta.cache..schema.*` 可覆盖 | - -## 统一属性模型 - -各引擎 cache entry 使用统一的配置键格式: - -`meta.cache...{enable,ttl-second,capacity}` - -下表说明属性语义: - -| 属性 | 示例 | 含义 | -|---|---|---| -| `enable` | `true/false` | 是否启用该缓存 entry。 | -| `ttl-second` | `600`、`0`、`-1` | 
`0` 表示关闭;`-1` 表示永不过期;其他值表示按访问时间计算 TTL。 | -| `capacity` | `10000` | 最大缓存条目数(按条目数量计)。`0` 表示关闭。 | - -**生效逻辑说明:** -只有当 `enable=true` 且 `ttl-second > 0`(或为 -1)且 `capacity > 0` 时,该模块缓存才会真正生效(对应观测表中的 `EFFECTIVE_ENABLED = true`)。 - -说明: - -- `` 使用 catalog 文档和 stats 表中展示的 cache entry 名,例如 `partition_values`、`fs_view`、`meta_client`。 -- 当前没有 per-entry 的刷新周期参数。异步刷新周期仍由 FE 配置 `external_cache_refresh_time_minutes` 统一控制。 - -示例: - -```sql -ALTER CATALOG hive_ctl SET PROPERTIES ( - -- 将 Hive 的文件列表缓存 TTL 设置为 0,即刻关闭该缓存 - "meta.cache.hive.file.ttl-second" = "0" -); -``` - -## 支持矩阵与配置入口 - -下面的表总结了各引擎支持的缓存项,以及是否支持热生效,并提供了具体配置说明的链接: - -| Catalog 引擎 | 在 stats 表里能看到的 entry (``) | `ALTER CATALOG ... SET PROPERTIES` 热生效 | 详细配置说明链接 | -|---|---|---|---| -| Hive | `schema`、`partition_values`、`partition`、`file` | `meta.cache.hive.*` 的变更不会通过统一热生效路径应用;需重建 catalog 或重启 FE 后生效 | [Hive Catalog](../catalogs/hive-catalog.mdx#meta-cache-unified) | -| Iceberg | `schema`、`table`、`view`、`manifest` | 支持 | [Iceberg Catalog](../catalogs/iceberg-catalog.mdx#meta-cache-unified) | -| Paimon | `schema`、`table` | 支持 | [Paimon Catalog](../catalogs/paimon-catalog.mdx#meta-cache-unified) | -| Hudi | `schema`、`partition`、`fs_view`、`meta_client` | 支持,通过 HMS catalog 属性更新路径生效 | [Hudi Catalog](../catalogs/hudi-catalog.md#meta-cache-unified) | -| MaxCompute | `schema`、`partition_values` | 没有专门的热生效 hook | [MaxCompute Catalog](../catalogs/maxcompute-catalog.md#meta-cache-unified) | - -:::caution -**Hive Catalog 注意事项**:Hive 的 `meta.cache.hive.*` 属性修改**不支持热生效**。修改配置后,必须重建 Catalog 或重启 FE 节点才能应用新的缓存配置。 -::: - -## 观测方式 - -通过系统表统一观测缓存指标: - -```sql -SELECT catalog_name, engine_name, entry_name, - effective_enabled, ttl_second, capacity, - estimated_size, hit_rate, load_failure_count, last_error -FROM information_schema.catalog_meta_cache_statistics -ORDER BY catalog_name, engine_name, entry_name; -``` - -该系统表文档见:[catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md)。 - 
-可以这样理解这些字段: - -| 内容 | 说明 | -|---|---| -| `ENGINE_NAME` | 缓存引擎,如 `hive`、`iceberg` | -| `ENTRY_NAME` | 该引擎下的精确 entry 名,如 `partition_values`、`fs_view`、`manifest` | -| `EFFECTIVE_ENABLED` | 综合 `enable`、`ttl-second`、`capacity` 后最终是否生效 | -| `LOAD_FAILURE_COUNT` | 从外部系统加载数据的失败次数。当查询变慢或报错时,可优先查看此字段排查上游系统异常。 | -| `LAST_ERROR` | 最后一次加载失败的错误信息。对排查 HMS、S3 等超时或连接异常极其有用。 | - -常见查询方式是按 `catalog_name` 和 `engine_name` 过滤。该系统表不再使用旧的 `cache_name` / `metric_name` 透视模型。 - -## 旧参数迁移说明 - -从 Doris 4.1.x 开始,旧版 catalog cache 参数(例如 `schema.cache.ttl-second`、`file.meta.cache.ttl-second`)已不再推荐使用。请改用 `meta.cache.*` 统一键,并参考上文对应的 catalog 文档。 From afc9eed0f78f920f4e38c4ee49dc9792c10bfc83 Mon Sep 17 00:00:00 2001 From: "Mingyu Chen (Rayner)" Date: Mon, 30 Mar 2026 15:48:07 -0700 Subject: [PATCH 6/8] opt 2 --- docs/lakehouse/catalog-overview.md | 14 ++- docs/lakehouse/catalogs/hive-catalog.mdx | 2 +- docs/lakehouse/catalogs/hudi-catalog.md | 2 +- docs/lakehouse/catalogs/iceberg-catalog.mdx | 2 +- docs/lakehouse/catalogs/maxcompute-catalog.md | 2 +- docs/lakehouse/catalogs/paimon-catalog.mdx | 2 +- .../current/lakehouse/catalog-overview.md | 17 ++- .../lakehouse/catalogs/hive-catalog.mdx | 2 +- .../lakehouse/catalogs/hudi-catalog.md | 2 +- .../lakehouse/catalogs/iceberg-catalog.mdx | 2 +- .../lakehouse/catalogs/maxcompute-catalog.md | 2 +- .../lakehouse/catalogs/paimon-catalog.mdx | 2 +- .../lakehouse/catalogs/hive-catalog.mdx | 94 ++++++++++++++- .../lakehouse/catalogs/hudi-catalog.md | 69 ++++++++++- .../lakehouse/catalogs/iceberg-catalog.mdx | 76 ++++++++++++ .../lakehouse/catalogs/maxcompute-catalog.md | 65 +++++++++++ .../lakehouse/catalogs/paimon-catalog.mdx | 84 +++++++++++++ .../version-4.x/lakehouse/catalog-overview.md | 18 ++- .../lakehouse/catalogs/hive-catalog.mdx | 110 ++++++++++++++++-- .../lakehouse/catalogs/hudi-catalog.md | 69 ++++++++++- .../lakehouse/catalogs/iceberg-catalog.mdx | 76 ++++++++++++ .../lakehouse/catalogs/maxcompute-catalog.md | 65 +++++++++++ 
.../lakehouse/catalogs/paimon-catalog.mdx | 90 +++++++++++++- .../lakehouse/catalogs/hive-catalog.mdx | 88 ++++++++++++++ .../lakehouse/catalogs/hudi-catalog.md | 64 ++++++++++ .../lakehouse/catalogs/iceberg-catalog.mdx | 72 ++++++++++++ .../lakehouse/catalogs/maxcompute-catalog.md | 62 ++++++++++ .../lakehouse/catalogs/paimon-catalog.mdx | 81 +++++++++++++ .../version-4.x/lakehouse/catalog-overview.md | 14 ++- .../lakehouse/catalogs/hive-catalog.mdx | 88 ++++++++++++++ .../lakehouse/catalogs/hudi-catalog.md | 64 ++++++++++ .../lakehouse/catalogs/iceberg-catalog.mdx | 72 ++++++++++++ .../lakehouse/catalogs/maxcompute-catalog.md | 62 ++++++++++ .../lakehouse/catalogs/paimon-catalog.mdx | 81 +++++++++++++ 34 files changed, 1567 insertions(+), 48 deletions(-) diff --git a/docs/lakehouse/catalog-overview.md b/docs/lakehouse/catalog-overview.md index 2051135a0f283..0e798e72a00ee 100644 --- a/docs/lakehouse/catalog-overview.md +++ b/docs/lakehouse/catalog-overview.md @@ -83,7 +83,7 @@ Behavior after setting: This property can be used in combination with `include_database_list`. For example, first filter the required databases using `include_database_list`, then further specify the required tables using `include_table_list`. ::: -### Table Name Case Sensitivity +### Table Name Case Sensitivity {#table-name-case-sensitivity-lower_case_table_names} This feature is supported since version 4.1.0. @@ -109,7 +109,7 @@ CREATE CATALOG hive_catalog PROPERTIES ( When `lower_case_table_names` is set to `1` or `2`, if tables with names that differ only in case exist in the remote metadata (such as `MyTable` and `mytable`), conflicts may occur. Doris will detect such conflicts and report an error. ::: -### Database Name Case Sensitivity +### Database Name Case Sensitivity {#database-name-case-sensitivity-lower_case_database_names} This feature is supported since version 4.1.0. 
@@ -243,7 +243,7 @@ Note 1: If a data catalog is explicitly specified in the MySQL command line or J Note 2: If the data catalog set by the user property `default_init_catalog` no longer exists, the session will automatically switch to the default `internal` data catalog. -Note 3: This feature is available starting from version 3.1.x. +Note 3: This feature is supported since version 3.1.x. ### Simple Queries @@ -324,7 +324,13 @@ REFRESH TABLE catalog_name.db_name.table_name; Doris also supports disabling metadata caching to enable real-time access to the latest metadata. -For detailed information and configuration of metadata caching, please refer to: [Metadata Cache](./meta-cache.md) +- Before Doris 4.1.x: please refer to [Metadata Cache](./meta-cache.md). +- Doris 4.1.x and later: please refer to the "Metadata Cache" section in each Catalog's documentation. + - [Hive Catalog](./catalogs/hive-catalog.mdx#meta-cache) + - [Iceberg Catalog](./catalogs/iceberg-catalog.mdx#meta-cache) + - [Hudi Catalog](./catalogs/hudi-catalog.md#meta-cache) + - [Paimon Catalog](./catalogs/paimon-catalog.mdx#meta-cache) + - [MaxCompute Catalog](./catalogs/maxcompute-catalog.md#meta-cache) ## Modifying Data Catalogs diff --git a/docs/lakehouse/catalogs/hive-catalog.mdx b/docs/lakehouse/catalogs/hive-catalog.mdx index a7bf9c4829d1a..c203e44e468dd 100644 --- a/docs/lakehouse/catalogs/hive-catalog.mdx +++ b/docs/lakehouse/catalogs/hive-catalog.mdx @@ -85,7 +85,7 @@ For versions before Doris 4.1.x, metadata caching is mainly controlled globally Starting from Doris 4.1.x, Hive Catalog's external metadata cache is configured using the unified `meta.cache.*` keys. ::: -### Unified Property Model (4.1.x+) {#meta-cache-unified-model} +### Cache Property Configuration (4.1.x+) {#meta-cache-unified-model} Each engine's cache entry uses a unified configuration key format: `meta.cache.<engine>.<entry>.{enable,ttl-second,capacity}`. 
diff --git a/docs/lakehouse/catalogs/hudi-catalog.md b/docs/lakehouse/catalogs/hudi-catalog.md index 7ac444f9b5f1f..b476e54d67bf3 100644 --- a/docs/lakehouse/catalogs/hudi-catalog.md +++ b/docs/lakehouse/catalogs/hudi-catalog.md @@ -60,7 +60,7 @@ For versions before Doris 4.1.x, metadata caching is mainly controlled globally Starting from Doris 4.1.x, Hudi-related external metadata cache is configured using the unified `meta.cache.*` keys. ::: -### Unified Property Model (4.1.x+) {#meta-cache-unified-model} +### Cache Property Configuration (4.1.x+) {#meta-cache-unified-model} Each engine's cache entry uses a unified configuration key format: `meta.cache.<engine>.<entry>.{enable,ttl-second,capacity}`. diff --git a/docs/lakehouse/catalogs/iceberg-catalog.mdx b/docs/lakehouse/catalogs/iceberg-catalog.mdx index 13a04898323a3..418497db93327 100644 --- a/docs/lakehouse/catalogs/iceberg-catalog.mdx +++ b/docs/lakehouse/catalogs/iceberg-catalog.mdx @@ -94,7 +94,7 @@ For versions before Doris 4.1.x, metadata caching is mainly controlled globally Starting from Doris 4.1.x, Iceberg Catalog's external metadata cache is configured using the unified `meta.cache.*` keys. ::: -### Unified Property Model (4.1.x+) {#meta-cache-unified-model} +### Cache Property Configuration (4.1.x+) {#meta-cache-unified-model} Each engine's cache entry uses a unified configuration key format: `meta.cache.<engine>.<entry>.{enable,ttl-second,capacity}`. diff --git a/docs/lakehouse/catalogs/maxcompute-catalog.md b/docs/lakehouse/catalogs/maxcompute-catalog.md index 4e772ed0186e8..4cf40f7f35633 100644 --- a/docs/lakehouse/catalogs/maxcompute-catalog.md +++ b/docs/lakehouse/catalogs/maxcompute-catalog.md @@ -120,7 +120,7 @@ For versions before Doris 4.1.x, metadata caching is mainly controlled globally Starting from Doris 4.1.x, MaxCompute Catalog's external metadata cache is configured using the unified `meta.cache.*` keys. 
::: -### Unified Property Model (4.1.x+) {#meta-cache-unified-model} +### Cache Property Configuration (4.1.x+) {#meta-cache-unified-model} Each engine's cache entry uses a unified configuration key format: `meta.cache.<engine>.<entry>.{enable,ttl-second,capacity}`. diff --git a/docs/lakehouse/catalogs/paimon-catalog.mdx b/docs/lakehouse/catalogs/paimon-catalog.mdx index 5f95428628772..087bc4dcd5c71 100644 --- a/docs/lakehouse/catalogs/paimon-catalog.mdx +++ b/docs/lakehouse/catalogs/paimon-catalog.mdx @@ -99,7 +99,7 @@ For versions before Doris 4.1.x, metadata caching is mainly controlled globally Starting from Doris 4.1.x, Paimon Catalog's external metadata cache is configured using the unified `meta.cache.*` keys. ::: -### Unified Property Model (4.1.x+) {#meta-cache-unified-model} +### Cache Property Configuration (4.1.x+) {#meta-cache-unified-model} Each engine's cache entry uses a unified configuration key format: `meta.cache.<engine>.<entry>.{enable,ttl-second,capacity}`. diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalog-overview.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalog-overview.md index a071dbd83d2c8..713fda0620664 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalog-overview.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalog-overview.md @@ -53,8 +53,8 @@ CREATE CATALOG iceberg_catalog PROPERTIES ( | `include_database_list` | 支持只同步指定的多个 Database,以 `,` 分隔。默认同步所有 Database。Database 名称是大小写敏感的。当外部数据源有大量 Database,但仅需访问个别 Database 时,可以使用此参数,避免大量的元数据同步。 | `'include_database_list' = 'db1,db2'` | | `exclude_database_list` | 支持指定不需要同步的多个 Database,以 `,` 分隔。默认不做任何过滤,同步所有 Database。Database 名称是大小写敏感的。适用场景同上,反向排除不需要访问的数据库。如果冲突,`exclude` 优先级高于 `include`。 | `'exclude_database_list' = 'db1,db2'` | | `include_table_list` | 支持只同步指定的多个表,以 `db.tbl` 格式指定,多个表之间以 `,` 分隔。设置后,列举某个 Database 下的表时将仅返回指定的表,而不会从远端元数据服务获取完整的表列表。适用于外部数据源表数量庞大、获取全量表列表可能超时的场景。 | `'include_table_list' = 
'db1.tbl1,db1.tbl2,db2.tbl3'` | -| `lower_case_table_names` | Catalog 级别的表名大小写控制。取值及含义见下方 [表名大小写](#表名大小写lower_case_table_names) 小节。默认值继承全局变量 `lower_case_table_names` 的设置。 | `'lower_case_table_names' = '1'` | -| `lower_case_database_names` | Catalog 级别的数据库名大小写控制。取值及含义见下方 [数据库名大小写](#数据库名大小写lower_case_database_names) 小节。默认值为 `0`(大小写敏感)。 | `'lower_case_database_names' = '2'` | +| `lower_case_table_names` | Catalog 级别的表名大小写控制。取值及含义见下方[表名大小写](#表名大小写lower_case_table_names)小节。默认值继承全局变量 `lower_case_table_names` 的设置。 | `'lower_case_table_names' = '1'` | +| `lower_case_database_names` | Catalog 级别的数据库名大小写控制。取值及含义见下方[数据库名大小写](#数据库名大小写lower_case_database_names)小节。默认值为 `0`(大小写敏感)。 | `'lower_case_database_names' = '2'` | ### 指定表列表 @@ -83,7 +83,7 @@ CREATE CATALOG hive_catalog PROPERTIES ( 此属性可以与 `include_database_list` 配合使用。例如先通过 `include_database_list` 过滤出需要的 Database,再通过 `include_table_list` 进一步精确指定需要的表。 ::: -### 表名大小写 +### 表名大小写{#表名大小写lower_case_table_names} 该功能自 4.1.0 版本起支持。 @@ -109,7 +109,7 @@ CREATE CATALOG hive_catalog PROPERTIES ( 当 `lower_case_table_names` 设置为 `1` 或 `2` 时,如果远端元数据中存在仅大小写不同的同名表(如 `MyTable` 和 `mytable`),可能会导致冲突。Doris 会检测此类冲突并报错。 ::: -### 数据库名大小写 +### 数据库名大小写{#数据库名大小写lower_case_database_names} 该功能自 4.1.0 版本起支持。 @@ -243,7 +243,7 @@ SET PROPERTY default_init_catalog=hive_catalog; 注意 2:如果用户属性 `default_init_catalog` 设置的数据目录已经不存在,则自动切换到默认的 `internal` 数据目录。 -注意 3:该功能从 3.1.x 版本开始生效。 +注意 3:该功能自 3.1.x 版本起支持。 ### 简单查询 @@ -325,7 +325,12 @@ REFRESH TABLE catalog_name.db_name.table_name; Doris 也支持关闭元数据缓存,以便能够实时访问到最新的元数据。 - Doris 4.1.x 之前:请参阅[元数据缓存](./meta-cache.md)。 -- Doris 4.1.x 及之后:请参阅各 Catalog 文档中的“元数据缓存”章节,例如 [Hive Catalog](./catalogs/hive-catalog.md#meta-cache)、[Iceberg Catalog](./catalogs/iceberg-catalog.mdx#meta-cache)、[Hudi Catalog](./catalogs/hudi-catalog.md#meta-cache)、[Paimon Catalog](./catalogs/paimon-catalog.mdx#meta-cache)、[MaxCompute Catalog](./catalogs/maxcompute-catalog.md#meta-cache)。 +- Doris 4.1.x 及之后:请参阅各 Catalog 文档中的“元数据缓存”章节。 + - [Hive 
Catalog](./catalogs/hive-catalog.mdx#meta-cache) + - [Iceberg Catalog](./catalogs/iceberg-catalog.mdx#meta-cache) + - [Hudi Catalog](./catalogs/hudi-catalog.md#meta-cache) + - [Paimon Catalog](./catalogs/paimon-catalog.mdx#meta-cache) + - [MaxCompute Catalog](./catalogs/maxcompute-catalog.md#meta-cache) ## 修改数据目录 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/hive-catalog.mdx b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/hive-catalog.mdx index 71f0e90a0f389..c1f7fd06362c9 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/hive-catalog.mdx +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/hive-catalog.mdx @@ -87,7 +87,7 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( 从 Doris 4.1.x 开始,Hive Catalog 的外表元数据缓存使用统一键 `meta.cache.*` 进行配置。 ::: -### 统一属性模型(4.1.x+) {#meta-cache-unified-model} +### 缓存属性配置(4.1.x+) {#meta-cache-unified-model} 各引擎 cache entry 使用统一的配置键格式:`meta.cache.<engine>.<entry>.{enable,ttl-second,capacity}`。 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/hudi-catalog.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/hudi-catalog.md index d70163d1eb720..11aa62f55092f 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/hudi-catalog.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/hudi-catalog.md @@ -60,7 +60,7 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( 从 Doris 4.1.x 开始,Hudi 相关外表元数据缓存使用统一键 `meta.cache.*` 进行配置。 ::: -### 统一属性模型(4.1.x+) {#meta-cache-unified-model} +### 缓存属性配置(4.1.x+) {#meta-cache-unified-model} 各引擎 cache entry 使用统一的配置键格式:`meta.cache.<engine>.<entry>.{enable,ttl-second,capacity}`。 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/iceberg-catalog.mdx b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/iceberg-catalog.mdx index 
2b7ee3b809cf3..2e167dbc9c7e6 100644 --- 
a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/iceberg-catalog.mdx +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/iceberg-catalog.mdx @@ -96,7 +96,7 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( 从 Doris 4.1.x 开始,Iceberg Catalog 的外表元数据缓存使用统一键 `meta.cache.*` 进行配置。 ::: -### 统一属性模型(4.1.x+) {#meta-cache-unified-model} +### 缓存属性配置(4.1.x+) {#meta-cache-unified-model} 各引擎 cache entry 使用统一的配置键格式:`meta.cache.<engine>.<entry>.{enable,ttl-second,capacity}`。 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/maxcompute-catalog.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/maxcompute-catalog.md index 9bf2c67449187..0479f4a61cbfb 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/maxcompute-catalog.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/maxcompute-catalog.md @@ -120,7 +120,7 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( 从 Doris 4.1.x 开始,MaxCompute Catalog 的外表元数据缓存使用统一键 `meta.cache.*` 进行配置。 ::: -### 统一属性模型(4.1.x+) {#meta-cache-unified-model} +### 缓存属性配置(4.1.x+) {#meta-cache-unified-model} 各引擎 cache entry 使用统一的配置键格式:`meta.cache.<engine>.<entry>.{enable,ttl-second,capacity}`。 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/paimon-catalog.mdx b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/paimon-catalog.mdx index a77565176f6a0..227223021b55f 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/paimon-catalog.mdx +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/paimon-catalog.mdx @@ -99,7 +99,7 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( 从 Doris 4.1.x 开始,Paimon Catalog 的外表元数据缓存使用统一键 `meta.cache.*` 进行配置。 ::: -### 统一属性模型(4.1.x+) {#meta-cache-unified-model} +### 缓存属性配置(4.1.x+) {#meta-cache-unified-model} 各引擎 cache entry 
使用统一的配置键格式:`meta.cache.<engine>.<entry>.{enable,ttl-second,capacity}`。 diff --git 
a/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.x/lakehouse/catalogs/hive-catalog.mdx b/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.x/lakehouse/catalogs/hive-catalog.mdx index 34ec561730e8d..c1f7fd06362c9 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.x/lakehouse/catalogs/hive-catalog.mdx +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.x/lakehouse/catalogs/hive-catalog.mdx @@ -78,6 +78,82 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( CommonProperties 部分用于填写通用属性。请参阅[ 数据目录概述 ](../catalog-overview.md)中【通用属性】部分。 +## 元数据缓存 {#meta-cache} + +为了提升访问外部数据源的性能,Apache Doris 会对 Hive 的元数据进行缓存。元数据包括表结构(Schema)、分区列表、分区属性和文件列表等。 + +:::tip +对于 Doris 4.1.x 之前的版本,元数据缓存主要由 FE 配置项全局控制,详情请参阅[元数据缓存](../meta-cache.md)。 +从 Doris 4.1.x 开始,Hive Catalog 的外表元数据缓存使用统一键 `meta.cache.*` 进行配置。 +::: + +### 缓存属性配置(4.1.x+) {#meta-cache-unified-model} + +各引擎 cache entry 使用统一的配置键格式:`meta.cache...{enable,ttl-second,capacity}`。 + +| 属性 | 示例 | 含义 | +|---|---|---| +| `enable` | `true/false` | 是否启用该缓存模块。 | +| `ttl-second` | `600`、`0`、`-1` | `0` 表示关闭缓存(即刻生效,可用于查看最新数据);`-1` 表示永不过期;其他正整数表示按访问时间计算 TTL(秒)。 | +| `capacity` | `10000` | 最大缓存条目数(按条目数量计)。`0` 表示关闭。 | + +**生效逻辑说明:** 只有当 `enable=true` 且 `ttl-second != 0` 且 `capacity > 0` 时,该模块缓存才会生效。 + +### 缓存模块 {#meta-cache-unified-modules} + +Hive Catalog 包含以下缓存模块: + +| 模块 (``) | 属性键前缀 | 缓存内容与影响 | +|---|---|---| +| `schema` | `meta.cache.hive.schema.` | 缓存表结构。影响:列新增、删除、类型变更在 Doris 中的可见性。若关闭,每次查询都会拉取最新 Schema。 | +| `partition_values` | `meta.cache.hive.partition_values.` | 缓存分区值/分区名称列表。影响:`SHOW PARTITIONS`、分区枚举、分区裁剪,以及外部新增/删除分区何时在 Doris 中可见。若关闭,可实时查看到分区变动。 | +| `partition` | `meta.cache.hive.partition.` | 缓存分区属性(Location、InputFormat、Serde 等)。影响:单个分区位置、格式、Serde 等属性变更的可见性。 | +| `file` | `meta.cache.hive.file.` | 缓存文件列表。影响:新增/删除文件、文件大小变化被 Doris 感知的时效性,同时减少远端 LIST 操作开销。若关闭,每次查询都会重新加载文件列表。 | + +### 旧参数映射与转换 {#meta-cache-mapping} + +在 4.1.x 之前,Hive 元数据缓存一部分通过 Catalog 兼容属性控制,一部分仍受 FE 
全局缓存参数控制,详见[元数据缓存](../meta-cache.md)。升级到 4.1.x 后,建议显式改写为 `meta.cache.hive.*`,不要继续沿用旧键名。 + +| 4.1 前属性键 | 适用范围 | 4.1.x+ 统一键 | 升级建议与影响 | +|---|---|---|---| +| `schema.cache.ttl-second` | 4.1 前 Hive Catalog 兼容属性 | `meta.cache.hive.schema.ttl-second` | 仅对应 Schema 新鲜度。若希望列变更每次查询立即可见,设置为 `0`;`enable` 和 `capacity` 需按需单独配置。 | +| `partition.cache.ttl-second` | 4.1 前 Hive Catalog 兼容属性 | `meta.cache.hive.partition_values.ttl-second` | 仅对应分区列表新鲜度。若希望新增/删除分区每次查询立即可见,设置为 `0`。 | +| `file.meta.cache.ttl-second` | 4.1 前 Hive Catalog 兼容属性 | `meta.cache.hive.file.ttl-second` | 仅对应文件列表新鲜度。若希望新增/删除文件每次查询立即可见,设置为 `0`。 | + +`meta.cache.hive.partition.*` 是 4.1.x 中单独可调的新模块,4.1 前没有一一对应的 Catalog 级 TTL 键。若您关心分区 Location、Serde、InputFormat 等属性变更的可见性,需要在升级后单独设置它。 + +4.1.x 的统一模型把每个缓存拆分为 `enable`、`ttl-second`、`capacity` 三个维度;旧键只表达 TTL,不表达是否启用和容量上限。升级后如果只完成键名替换而不评估 `enable/capacity`,则其余行为会落到 4.1.x 的默认值。 + +### 最佳实践 {#meta-cache-best-practices} + +* **实时查看最新数据**:如果您希望每次查询都能看到外部数据源的最新分区或文件变动,可以将对应的 `ttl-second` 设置为 `0`。 + ```sql + -- 关闭文件列表缓存,实时看到文件变动 + ALTER CATALOG hive_ctl SET PROPERTIES ("meta.cache.hive.file.ttl-second" = "0"); + -- 关闭分区值缓存,实时看到新增分区 + ALTER CATALOG hive_ctl SET PROPERTIES ("meta.cache.hive.partition_values.ttl-second" = "0"); + ``` +* **性能优化**:对于元数据变动不频繁的场景,建议适当增大 `capacity` 和 `ttl-second` 以减少对 Hive Metastore 和文件系统的访问压力。 + +:::caution +**Hive Catalog 注意事项**:Hive 的 `meta.cache.hive.*` 属性修改**不支持热生效**。修改配置后,必须重建 Catalog 或重启 FE 节点才能应用新的缓存配置。 +::: + +### 可观测性 {#meta-cache-unified-observability} + +可以通过 `information_schema.catalog_meta_cache_statistics` 系统表观测缓存指标: + +```sql +SELECT catalog_name, engine_name, entry_name, + effective_enabled, ttl_second, capacity, + estimated_size, hit_rate, load_failure_count, last_error +FROM information_schema.catalog_meta_cache_statistics +WHERE catalog_name = 'hive_ctl' AND engine_name = 'hive' +ORDER BY entry_name; +``` + +该系统表文档见:[catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md)。 
+ ### 支持的 Hive 版本 支持 Hive 1.x,2.x,3.x,4.x。 @@ -112,6 +188,8 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( * [Azure Blob](../storages/azure-blob.md) +* [Apache Ozone](../storages/ozone.md)(自 4.0.4 起支持) + * [阿里云 OSS](../storages/aliyun-oss.md) * [腾讯云 COS](../storages/tencent-cos.md) @@ -370,6 +448,21 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( ); ``` + + 自 4.0.4 起支持 + ```sql + CREATE CATALOG `hive_hms_on_ozone_new_catalog` PROPERTIES ( + 'type' = 'hms', + 'hive.metastore.uris' = 'thrift://127.0.0.1:9383', + 'fs.ozone.support' = 'true', + 'ozone.endpoint' = 'http://ozone-s3g:9878', + 'ozone.access_key' = '', + 'ozone.secret_key' = '', + 'ozone.region' = 'us-east-1', + 'ozone.use_path_style' = 'true' + ); + ``` + ```sql CREATE CATALOG test_hive_on_hms_minio_catalog PROPERTIES ( @@ -1101,4 +1194,3 @@ DROP DATABASE [IF EXISTS] hive_ctl.hive_db; | -------- | ------------------------------------ | | 2.1.6 | 支持 Hive 表数据写回 | | 3.0.4 | 支持 JsonSerDe 格式的 Hive 表。支持 Hive4 的事务表。 | - diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.x/lakehouse/catalogs/hudi-catalog.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.x/lakehouse/catalogs/hudi-catalog.md index cb4ac7cc702bc..11aa62f55092f 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.x/lakehouse/catalogs/hudi-catalog.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.x/lakehouse/catalogs/hudi-catalog.md @@ -51,6 +51,74 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( | ------------------------------- | -------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | ----- | | `hudi.use_hive_sync_partition` | `use_hive_sync_partition` | 是否使用 Hive Metastore 已同步的分区信息。如果为 true,则会直接从 Hive Metastore 中获取分区信息。否则,会从文件系统的元数据文件中获取分区信息。通过 Hive Metastore 获取信息性能更好,但需要用户保证最新的元数据已经同步到了 Hive Metastore。 | false | +## 元数据缓存 {#meta-cache} + 
+为了提升访问外部数据源的性能,Apache Doris 会对 Hudi 的元数据进行缓存。元数据包括表结构(Schema)、分区信息、FS View 和 Meta Client 对象等。 + +:::tip +对于 Doris 4.1.x 之前的版本,元数据缓存主要由 FE 配置项全局控制,详情请参阅[元数据缓存](../meta-cache.md)。 +从 Doris 4.1.x 开始,Hudi 相关外表元数据缓存使用统一键 `meta.cache.*` 进行配置。 +::: + +### 缓存属性配置(4.1.x+) {#meta-cache-unified-model} + +各引擎 cache entry 使用统一的配置键格式:`meta.cache...{enable,ttl-second,capacity}`。 + +| 属性 | 示例 | 含义 | +|---|---|---| +| `enable` | `true/false` | 是否启用该缓存模块。 | +| `ttl-second` | `600`、`0`、`-1` | `0` 表示关闭缓存(即刻生效,可用于查看最新数据);`-1` 表示永不过期;其他正整数表示按访问时间计算 TTL(秒)。 | +| `capacity` | `10000` | 最大缓存条目数(按条目数量计)。`0` 表示关闭。 | + +**生效逻辑说明:** 只有当 `enable=true` 且 `ttl-second != 0` 且 `capacity > 0` 时,该模块缓存才会生效。 + +### 缓存模块 {#meta-cache-unified-modules} + +Hudi Catalog 包含以下缓存模块: + +| 模块 (``) | 属性键前缀 | 缓存内容与影响 | +|---|---|---| +| `schema` | `meta.cache.hudi.schema.` | 缓存表结构。影响:列新增、删除、类型变更在 Doris 中的可见性。若关闭,每次查询都会拉取最新 Schema。 | +| `partition` | `meta.cache.hudi.partition.` | 缓存 Hudi 分区相关元数据。影响:分区发现、分区裁剪,以及新增/删除分区何时在 Doris 中可见。 | +| `fs_view` | `meta.cache.hudi.fs_view.` | 缓存 Hudi 文件系统视图相关元数据。影响:查询规划时选择到的最新 base file / log file 以及 file slice 视图的新鲜度。 | +| `meta_client` | `meta.cache.hudi.meta_client.` | 缓存 Hudi Meta Client 对象。影响:时间线(timeline)、表配置等底层元数据重新加载的频率,以及提交/表配置变更何时被感知。 | + +### 旧参数映射与转换 {#meta-cache-mapping} + +在 4.1.x 之前,Hudi 的 Schema 缓存有 Catalog 兼容属性,分区与表级元数据则主要遵循旧的 FE 全局缓存模型,详见[元数据缓存](../meta-cache.md)。升级到 4.1.x 后,建议统一改写为 `meta.cache.hudi.*`,并分别配置分区、FS View 和 Meta Client。 + +| 4.1 前属性键/旧模型 | 适用范围 | 4.1.x+ 统一键 | 升级建议与影响 | +|---|---|---|---| +| `schema.cache.ttl-second` | 4.1 前 Hudi Catalog 兼容属性 | `meta.cache.hudi.schema.ttl-second` | 控制 Schema 新鲜度。若希望列变更每次查询立即可见,设置为 `0`。 | +| Hudi 分区旧模型 | 4.1 前 FE 全局缓存策略(见旧版元数据缓存文档) | `meta.cache.hudi.partition.ttl-second` | 控制分区发现与分区可见性。若希望新增/删除分区每次查询立即可见,设置为 `0`。 | +| 无一一对应的旧 Catalog 键 | 4.1 前未单独暴露 `fs_view` / `meta_client` TTL | `meta.cache.hudi.fs_view.*`、`meta.cache.hudi.meta_client.*` | 这是 4.1.x 中拆分出的新模块。若希望更快感知最新 file slice 或提交时间线,分别调低对应 `ttl-second`。 | + 
+4.1.x 的统一模型把缓存拆分为 `enable`、`ttl-second`、`capacity` 三个维度;旧模型主要描述 TTL/全局缓存行为。升级后如果仍沿用旧理解,容易遗漏 `fs_view`、`meta_client` 这类新模块的单独配置。 + +### 最佳实践 {#meta-cache-best-practices} + +* **实时查看最新数据**:如果您希望每次查询都能看到 Hudi 表的最新数据变动或 Schema 变更,可以将 `schema` 或 `partition` 的 `ttl-second` 设置为 `0`。 + ```sql + -- 关闭分区元数据缓存,以感知 Hudi 表的最新分区变动 + ALTER CATALOG hudi_ctl SET PROPERTIES ("meta.cache.hudi.partition.ttl-second" = "0"); + ``` +* **性能优化**:`ALTER CATALOG ... SET PROPERTIES` 的修改在 Hudi 中支持热生效(通过 HMS catalog 属性更新路径)。 + +### 可观测性 {#meta-cache-unified-observability} + +可以通过 `information_schema.catalog_meta_cache_statistics` 系统表观测缓存指标: + +```sql +SELECT catalog_name, engine_name, entry_name, + effective_enabled, ttl_second, capacity, + estimated_size, hit_rate, load_failure_count, last_error +FROM information_schema.catalog_meta_cache_statistics +WHERE catalog_name = 'hudi_ctl' AND engine_name = 'hudi' +ORDER BY entry_name; +``` + +该系统表文档见:[catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md)。 + ### 支持的 Hudi 版本 当前依赖的 Hudi 版本为 0.15。推荐访问 0.14 版本以上的 Hudi 数据。 @@ -226,4 +294,3 @@ SELECT * from hudi_table@incr('beginTime'='xxx', ['endTime'='xxx'], ['hoodie.rea | Doris 版本 | 功能支持 | | ----------- | ----------------------------------------- | | 2.1.8/3.0.4 | Hudi 依赖升级到 0.15。新增 Hadoop Hudi JNI Scanner。 | - diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.x/lakehouse/catalogs/iceberg-catalog.mdx b/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.x/lakehouse/catalogs/iceberg-catalog.mdx index 99c370968387b..1d1a91c5bfa02 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.x/lakehouse/catalogs/iceberg-catalog.mdx +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.x/lakehouse/catalogs/iceberg-catalog.mdx @@ -87,6 +87,82 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( CommonProperties 部分用于填写通用属性。请参阅[数据目录概述](../catalog-overview.md)中【通用属性】部分。 +## 元数据缓存 {#meta-cache} + +为了提升访问外部数据源的性能,Apache 
Doris 会对 Iceberg 的元数据进行缓存。元数据包括表结构(Schema)、表对象、View 对象和 Manifest 详情等。 + +:::tip +对于 Doris 4.1.x 之前的版本,元数据缓存主要由 FE 配置项全局控制,详情请参阅[元数据缓存](../meta-cache.md)。 +从 Doris 4.1.x 开始,Iceberg Catalog 的外表元数据缓存使用统一键 `meta.cache.*` 进行配置。 +::: + +### 缓存属性配置(4.1.x+) {#meta-cache-unified-model} + +各引擎 cache entry 使用统一的配置键格式:`meta.cache...{enable,ttl-second,capacity}`。 + +| 属性 | 示例 | 含义 | +|---|---|---| +| `enable` | `true/false` | 是否启用该缓存模块。 | +| `ttl-second` | `600`、`0`、`-1` | `0` 表示关闭缓存(即刻生效,可用于查看最新数据);`-1` 表示永不过期;其他正整数表示按访问时间计算 TTL(秒)。 | +| `capacity` | `10000` | 最大缓存条目数(按条目数量计)。`0` 表示关闭。 | + +**生效逻辑说明:** 只有当 `enable=true` 且 `ttl-second != 0` 且 `capacity > 0` 时,该模块缓存才会生效。 + +### 缓存模块 {#meta-cache-unified-modules} + +Iceberg Catalog 包含以下缓存模块: + +| 模块 (``) | 属性键前缀 | 缓存内容与影响 | +|---|---|---| +| `schema` | `meta.cache.iceberg.schema.` | 缓存表结构。影响:列新增、删除、类型变更在 Doris 中的可见性。若关闭,每次查询都会拉取最新 Schema。 | +| `table` | `meta.cache.iceberg.table.` | 缓存 Iceberg 表元数据对象。影响:最新 Snapshot、Partition Spec、Sort Order、表属性等表级元数据在 Doris 中的可见性;若关闭,每次规划都会重新加载表元数据。 | +| `view` | `meta.cache.iceberg.view.` | 缓存 Iceberg View 元数据对象。影响:View 定义、Schema、属性变更在 Doris 中的可见性。 | +| `manifest` | `meta.cache.iceberg.manifest.` | 缓存 Manifest 详情。主要影响查询规划时重复读取 Manifest 文件的开销,通常不直接决定表或 Snapshot 是否可见。注意:该模块默认关闭,需手动启用。 | + +### 旧参数映射与转换 {#meta-cache-mapping} + +在 4.1.x 之前,Iceberg 表级元数据主要受 FE 全局缓存策略控制,`schema.cache.ttl-second` 是常见的 Catalog 兼容属性;详见[元数据缓存](../meta-cache.md)。升级到 4.1.x 后,建议改写为 `meta.cache.iceberg.*`,并按需要分别配置表、View 和 Manifest 缓存。 + +| 4.1 前属性键/旧模型 | 适用范围 | 4.1.x+ 统一键 | 升级建议与影响 | +|---|---|---|---| +| `schema.cache.ttl-second` | 4.1 前 Iceberg Catalog 兼容属性 | `meta.cache.iceberg.schema.ttl-second` | 控制 Schema 新鲜度。若希望列变更每次查询立即可见,设置为 `0`。 | +| Iceberg 表信息旧模型 | 4.1 前 FE 全局缓存策略(见旧版元数据缓存文档) | `meta.cache.iceberg.table.ttl-second` | 控制表级元数据新鲜度。若希望每次查询都读取最新 Snapshot/表属性,设置为 `0`。 | +| 无一一对应的旧 Catalog 键 | 4.1 前未单独暴露 View / Manifest TTL | `meta.cache.iceberg.view.*`、`meta.cache.iceberg.manifest.*` | 这是 4.1.x 中拆分出的新模块。升级后如需保证最新 View 
定义,单独调低 `view.ttl-second`;`manifest` 主要用于性能优化。 | + +4.1.x 的统一模型把缓存拆分为 `enable`、`ttl-second`、`capacity` 三个维度;旧模型主要描述 TTL/全局缓存行为,不覆盖这些新模块的独立开关和容量上限。升级时建议一并评估是否需要补充 `enable/capacity`。 + +### 最佳实践 {#meta-cache-best-practices} + +* **实时查看最新数据**:如果您希望每次查询都能看到 Iceberg 表的最新快照或 Schema 变动,可以将 `schema` 或 `table` 的 `ttl-second` 设置为 `0`。 + ```sql + -- 关闭表对象缓存,以便感知快照变动 + ALTER CATALOG iceberg_ctl SET PROPERTIES ("meta.cache.iceberg.table.ttl-second" = "0"); + ``` +* **性能优化**: + * 启用 Manifest 缓存可以显著提升大表的查询规划速度: + ```sql + ALTER CATALOG iceberg_ctl SET PROPERTIES ( + "meta.cache.iceberg.manifest.enable" = "true", + "meta.cache.iceberg.manifest.ttl-second" = "600" + ); + ``` + * `ALTER CATALOG ... SET PROPERTIES` 的修改在 Iceberg Catalog 中支持热生效。 + +### 可观测性 {#meta-cache-unified-observability} + +可以通过 `information_schema.catalog_meta_cache_statistics` 系统表观测缓存指标: + +```sql +SELECT catalog_name, engine_name, entry_name, + effective_enabled, ttl_second, capacity, + estimated_size, hit_rate, load_failure_count, last_error +FROM information_schema.catalog_meta_cache_statistics +WHERE catalog_name = 'iceberg_ctl' AND engine_name = 'iceberg' +ORDER BY entry_name; +``` + +该系统表文档见:[catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md)。 + ### 支持的 Iceberg 版本 | Doris 版本 | Iceberg SDK 版本 | diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.x/lakehouse/catalogs/maxcompute-catalog.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.x/lakehouse/catalogs/maxcompute-catalog.md index d86e2b4c12be1..0479f4a61cbfb 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.x/lakehouse/catalogs/maxcompute-catalog.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.x/lakehouse/catalogs/maxcompute-catalog.md @@ -111,6 +111,71 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( CommonProperties 部分用于填写通用属性。请参阅[数据目录概述](../catalog-overview.md)中「通用属性」部分。 +## 元数据缓存 {#meta-cache} + +为了提升访问外部数据源的性能,Apache Doris 会对 
MaxCompute 的元数据进行缓存。元数据包括表结构(Schema)和分区列表等。 + +:::tip +对于 Doris 4.1.x 之前的版本,元数据缓存主要由 FE 配置项全局控制,详情请参阅[元数据缓存](../meta-cache.md)。 +从 Doris 4.1.x 开始,MaxCompute Catalog 的外表元数据缓存使用统一键 `meta.cache.*` 进行配置。 +::: + +### 缓存属性配置(4.1.x+) {#meta-cache-unified-model} + +各引擎 cache entry 使用统一的配置键格式:`meta.cache...{enable,ttl-second,capacity}`。 + +| 属性 | 示例 | 含义 | +|---|---|---| +| `enable` | `true/false` | 是否启用该缓存模块。 | +| `ttl-second` | `600`、`0`、`-1` | `0` 表示关闭缓存(即刻生效,可用于查看最新数据);`-1` 表示永不过期;其他正整数表示按访问时间计算 TTL(秒)。 | +| `capacity` | `10000` | 最大缓存条目数(按条目数量计)。`0` 表示关闭。 | + +**生效逻辑说明:** 只有当 `enable=true` 且 `ttl-second != 0` 且 `capacity > 0` 时,该模块缓存才会生效。 + +### 缓存模块 {#meta-cache-unified-modules} + +MaxCompute Catalog 包含以下缓存模块: + +| 模块 (``) | 属性键前缀 | 缓存内容与影响 | +|---|---|---| +| `schema` | `meta.cache.maxcompute.schema.` | 缓存表结构。影响:列新增、删除、类型变更在 Doris 中的可见性。若关闭,每次查询都会拉取最新 Schema。 | +| `partition_values` | `meta.cache.maxcompute.partition_values.` | 缓存分区值列表。影响:分区裁剪、分区枚举,以及新增/删除分区何时在 Doris 中可见。若关闭,可实时查看到分区变动。 | + +### 旧参数映射与转换 {#meta-cache-mapping} + +在 4.1.x 之前,MaxCompute 的 Schema 和分区相关缓存主要通过 Catalog 兼容属性或 FE 全局缓存策略控制。升级到 4.1.x 后,建议统一改写为 `meta.cache.maxcompute.*`。 + +| 4.1 前属性键/旧模型 | 适用范围 | 4.1.x+ 统一键 | 升级建议与影响 | +|---|---|---|---| +| `schema.cache.ttl-second` | 4.1 前 MaxCompute Catalog 兼容属性 | `meta.cache.maxcompute.schema.ttl-second` | 控制 Schema 新鲜度。若希望列变更每次查询立即可见,设置为 `0`。 | +| MaxCompute 分区值旧模型 | 4.1 前 FE 全局缓存策略 | `meta.cache.maxcompute.partition_values.ttl-second` | 控制分区枚举与分区可见性。若希望新增/删除分区每次查询立即可见,设置为 `0`。 | + +4.1.x 的统一模型把缓存拆分为 `enable`、`ttl-second`、`capacity` 三个维度;旧模型主要表达 TTL/全局缓存行为。升级后如果只迁移 TTL 而不评估 `enable/capacity`,其余行为会使用 4.1.x 的默认值。 + +### 最佳实践 {#meta-cache-best-practices} + +* **实时查看最新数据**:如果您希望每次查询都能看到 MaxCompute 表的最新分区变动或 Schema 变更,可以将 `schema` 或 `partition_values` 的 `ttl-second` 设置为 `0`。 + ```sql + -- 关闭分区值缓存,以感知 MaxCompute 表的最新分区 + ALTER CATALOG mc_ctl SET PROPERTIES ("meta.cache.maxcompute.partition_values.ttl-second" = "0"); + ``` +* **注意**:`meta.cache.maxcompute.*` 
目前没有专门的热生效 hook。修改配置后,建议重建 Catalog 或重启 FE 以确保生效。 + +### 可观测性 {#meta-cache-unified-observability} + +可以通过 `information_schema.catalog_meta_cache_statistics` 系统表观测缓存指标: + +```sql +SELECT catalog_name, engine_name, entry_name, + effective_enabled, ttl_second, capacity, + estimated_size, hit_rate, load_failure_count, last_error +FROM information_schema.catalog_meta_cache_statistics +WHERE catalog_name = 'mc_ctl' AND engine_name = 'maxcompute' +ORDER BY entry_name; +``` + +该系统表文档见:[catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md)。 + ### 支持的 MaxCompute 版本 仅支持公有云版本的 MaxCompute。私有云版本支持请联系 Doris 社区支持。 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.x/lakehouse/catalogs/paimon-catalog.mdx b/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.x/lakehouse/catalogs/paimon-catalog.mdx index ed2bfee8a0f29..227223021b55f 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.x/lakehouse/catalogs/paimon-catalog.mdx +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.x/lakehouse/catalogs/paimon-catalog.mdx @@ -90,6 +90,71 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( CommonProperties 部分用于填写通用属性。请参阅[数据目录概述](../catalog-overview.md)中【通用属性】部分。 +## 元数据缓存 {#meta-cache} + +为了提升访问外部数据源的性能,Apache Doris 会对 Paimon 的元数据进行缓存。元数据包括表结构(Schema)和表对象等。 + +:::tip +对于 Doris 4.1.x 之前的版本,元数据缓存主要由 FE 配置项全局控制,详情请参阅[元数据缓存](../meta-cache.md)。 +从 Doris 4.1.x 开始,Paimon Catalog 的外表元数据缓存使用统一键 `meta.cache.*` 进行配置。 +::: + +### 缓存属性配置(4.1.x+) {#meta-cache-unified-model} + +各引擎 cache entry 使用统一的配置键格式:`meta.cache...{enable,ttl-second,capacity}`。 + +| 属性 | 示例 | 含义 | +|---|---|---| +| `enable` | `true/false` | 是否启用该缓存模块。 | +| `ttl-second` | `600`、`0`、`-1` | `0` 表示关闭缓存(即刻生效,可用于查看最新数据);`-1` 表示永不过期;其他正整数表示按访问时间计算 TTL(秒)。 | +| `capacity` | `10000` | 最大缓存条目数(按条目数量计)。`0` 表示关闭。 | + +**生效逻辑说明:** 只有当 `enable=true` 且 `ttl-second != 0` 且 `capacity > 0` 时,该模块缓存才会生效。 + +### 缓存模块 {#meta-cache-unified-modules} + +Paimon 
Catalog 包含以下缓存模块: + +| 模块 (``) | 属性键前缀 | 缓存内容与影响 | +|---|---|---| +| `schema` | `meta.cache.paimon.schema.` | 缓存表结构。影响:列新增、删除、类型变更在 Doris 中的可见性。若关闭,每次查询都会拉取最新 Schema。 | +| `table` | `meta.cache.paimon.table.` | 缓存 Paimon 表元数据对象。影响:最新 Snapshot、Schema 演进、分支/标签引用等表级元数据在 Doris 中的可见性,同时减少查询规划时的元数据加载开销。 | + +### 旧参数映射与转换 {#meta-cache-mapping} + +在 4.1.x 之前,Paimon 的 Schema 与表级元数据主要遵循旧的 Catalog 兼容属性或 FE 全局缓存模型,详见[元数据缓存](../meta-cache.md)。升级到 4.1.x 后,建议改写为 `meta.cache.paimon.*`,并单独评估表级缓存是否需要更强的新鲜度。 + +| 4.1 前属性键/旧模型 | 适用范围 | 4.1.x+ 统一键 | 升级建议与影响 | +|---|---|---|---| +| `schema.cache.ttl-second` | 4.1 前 Paimon Catalog 兼容属性 | `meta.cache.paimon.schema.ttl-second` | 控制 Schema 新鲜度。若希望列变更每次查询立即可见,设置为 `0`。 | +| Paimon 表级旧模型 | 4.1 前 FE 全局缓存策略(见旧版元数据缓存文档中的旧表级元数据模型说明) | `meta.cache.paimon.table.ttl-second` | 控制最新 Snapshot/表级元数据的可见性。若希望每次查询都读取最新表快照,设置为 `0`。 | + +4.1.x 的统一模型把缓存拆分为 `enable`、`ttl-second`、`capacity` 三个维度;旧模型主要表达 TTL/全局缓存行为。升级后建议不要只替换键名,还要同时评估是否需要显式配置 `enable/capacity`。 + +### 最佳实践 {#meta-cache-best-practices} + +* **实时查看最新数据**:如果您希望每次查询都能看到 Paimon 表的最新数据变动或 Schema 变更,可以将 `schema` 或 `table` 的 `ttl-second` 设置为 `0`。 + ```sql + -- 关闭表对象缓存,以感知 Paimon 表的最新快照 + ALTER CATALOG paimon_ctl SET PROPERTIES ("meta.cache.paimon.table.ttl-second" = "0"); + ``` +* **性能优化**:`ALTER CATALOG ... 
SET PROPERTIES` 的修改在 Paimon Catalog 中支持热生效。 + +### 可观测性 {#meta-cache-unified-observability} + +可以通过 `information_schema.catalog_meta_cache_statistics` 系统表观测缓存指标: + +```sql +SELECT catalog_name, engine_name, entry_name, + effective_enabled, ttl_second, capacity, + estimated_size, hit_rate, load_failure_count, last_error +FROM information_schema.catalog_meta_cache_statistics +WHERE catalog_name = 'paimon_ctl' AND engine_name = 'paimon' +ORDER BY entry_name; +``` + +该系统表文档见:[catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md)。 + ### 支持的 Paimon 版本 当前依赖的 Paimon 版本为 1.0.0。 @@ -118,6 +183,8 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( * [Google Cloud Storage](../storages/gcs.md) +* [Apache Ozone](../storages/ozone.md)(自 4.0.4 起支持) + * [阿里云 OSS](../storages/aliyun-oss.md) * [腾讯云 COS](../storages/tencent-cos.md) @@ -267,6 +334,23 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( ); ``` + + 自 4.0.4 起支持 + ```sql + CREATE CATALOG paimon_hms_on_ozone_catalog PROPERTIES ( + 'type' = 'paimon', + 'paimon.catalog.type' = 'hms', + 'warehouse' = 's3a://test-bucket/paimon-warehouse', + 'hive.metastore.uris' = 'thrift://127.0.0.1:9383', + 'fs.ozone.support' = 'true', + 'ozone.endpoint' = 'http://ozone-s3g:9878', + 'ozone.access_key' = '', + 'ozone.secret_key' = '', + 'ozone.region' = 'us-east-1', + 'ozone.use_path_style' = 'true' + ); + ``` + ```sql CREATE CATALOG paimon_hms_on_minio_catalog PROPERTIES ( diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/lakehouse/catalog-overview.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/lakehouse/catalog-overview.md index 5b096d18f0096..713fda0620664 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/lakehouse/catalog-overview.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/lakehouse/catalog-overview.md @@ -53,8 +53,8 @@ CREATE CATALOG iceberg_catalog PROPERTIES ( | `include_database_list` | 
支持只同步指定的多个 Database,以 `,` 分隔。默认同步所有 Database。Database 名称是大小写敏感的。当外部数据源有大量 Database,但仅需访问个别 Database 时,可以使用此参数,避免大量的元数据同步。 | `'include_database_list' = 'db1,db2'` | | `exclude_database_list` | 支持指定不需要同步的多个 Database,以 `,` 分隔。默认不做任何过滤,同步所有 Database。Database 名称是大小写敏感的。适用场景同上,反向排除不需要访问的数据库。如果冲突,`exclude` 优先级高于 `include`。 | `'exclude_database_list' = 'db1,db2'` | | `include_table_list` | 支持只同步指定的多个表,以 `db.tbl` 格式指定,多个表之间以 `,` 分隔。设置后,列举某个 Database 下的表时将仅返回指定的表,而不会从远端元数据服务获取完整的表列表。适用于外部数据源表数量庞大、获取全量表列表可能超时的场景。 | `'include_table_list' = 'db1.tbl1,db1.tbl2,db2.tbl3'` | -| `lower_case_table_names` | Catalog 级别的表名大小写控制。取值及含义见下方 [表名大小写](#表名大小写lower_case_table_names) 小节。默认值继承全局变量 `lower_case_table_names` 的设置。 | `'lower_case_table_names' = '1'` | -| `lower_case_database_names` | Catalog 级别的数据库名大小写控制。取值及含义见下方 [数据库名大小写](#数据库名大小写lower_case_database_names) 小节。默认值为 `0`(大小写敏感)。 | `'lower_case_database_names' = '2'` | +| `lower_case_table_names` | Catalog 级别的表名大小写控制。取值及含义见下方[表名大小写](#表名大小写lower_case_table_names)小节。默认值继承全局变量 `lower_case_table_names` 的设置。 | `'lower_case_table_names' = '1'` | +| `lower_case_database_names` | Catalog 级别的数据库名大小写控制。取值及含义见下方[数据库名大小写](#数据库名大小写lower_case_database_names)小节。默认值为 `0`(大小写敏感)。 | `'lower_case_database_names' = '2'` | ### 指定表列表 @@ -83,7 +83,7 @@ CREATE CATALOG hive_catalog PROPERTIES ( 此属性可以与 `include_database_list` 配合使用。例如先通过 `include_database_list` 过滤出需要的 Database,再通过 `include_table_list` 进一步精确指定需要的表。 ::: -### 表名大小写 +### 表名大小写{#表名大小写lower_case_table_names} 该功能自 4.1.0 版本起支持。 @@ -109,7 +109,7 @@ CREATE CATALOG hive_catalog PROPERTIES ( 当 `lower_case_table_names` 设置为 `1` 或 `2` 时,如果远端元数据中存在仅大小写不同的同名表(如 `MyTable` 和 `mytable`),可能会导致冲突。Doris 会检测此类冲突并报错。 ::: -### 数据库名大小写 +### 数据库名大小写{#数据库名大小写lower_case_database_names} 该功能自 4.1.0 版本起支持。 @@ -243,7 +243,7 @@ SET PROPERTY default_init_catalog=hive_catalog; 注意 2:如果用户属性 `default_init_catalog` 设置的数据目录已经不存在,则自动切换到默认的 `internal` 数据目录。 -注意 3:该功能从 3.1.x 版本开始生效。 +注意 3:该功能自 3.1.x 版本起支持。 ### 简单查询 @@ -324,7 +324,13 @@ 
REFRESH TABLE catalog_name.db_name.table_name; Doris 也支持关闭元数据缓存,以便能够实时访问到最新的元数据。 -关于元数据缓存的详细介绍和配置,请参阅:[元数据缓存](./meta-cache.md) +- Doris 4.1.x 之前:请参阅[元数据缓存](./meta-cache.md)。 +- Doris 4.1.x 及之后:请参阅各 Catalog 文档中的“元数据缓存”章节。 + - [Hive Catalog](./catalogs/hive-catalog.mdx#meta-cache) + - [Iceberg Catalog](./catalogs/iceberg-catalog.mdx#meta-cache) + - [Hudi Catalog](./catalogs/hudi-catalog.md#meta-cache) + - [Paimon Catalog](./catalogs/paimon-catalog.mdx#meta-cache) + - [MaxCompute Catalog](./catalogs/maxcompute-catalog.md#meta-cache) ## 修改数据目录 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/lakehouse/catalogs/hive-catalog.mdx b/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/lakehouse/catalogs/hive-catalog.mdx index dcf238ba9a637..c1f7fd06362c9 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/lakehouse/catalogs/hive-catalog.mdx +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/lakehouse/catalogs/hive-catalog.mdx @@ -78,6 +78,82 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( CommonProperties 部分用于填写通用属性。请参阅[ 数据目录概述 ](../catalog-overview.md)中【通用属性】部分。 +## 元数据缓存 {#meta-cache} + +为了提升访问外部数据源的性能,Apache Doris 会对 Hive 的元数据进行缓存。元数据包括表结构(Schema)、分区列表、分区属性和文件列表等。 + +:::tip +对于 Doris 4.1.x 之前的版本,元数据缓存主要由 FE 配置项全局控制,详情请参阅[元数据缓存](../meta-cache.md)。 +从 Doris 4.1.x 开始,Hive Catalog 的外表元数据缓存使用统一键 `meta.cache.*` 进行配置。 +::: + +### 缓存属性配置(4.1.x+) {#meta-cache-unified-model} + +各引擎 cache entry 使用统一的配置键格式:`meta.cache.<engine>.<entry>.{enable,ttl-second,capacity}`。 + +| 属性 | 示例 | 含义 | +|---|---|---| +| `enable` | `true/false` | 是否启用该缓存模块。 | +| `ttl-second` | `600`、`0`、`-1` | `0` 表示关闭缓存(即刻生效,可用于查看最新数据);`-1` 表示永不过期;其他正整数表示按访问时间计算 TTL(秒)。 | +| `capacity` | `10000` | 最大缓存条目数(按条目数量计)。`0` 表示关闭。 | + +**生效逻辑说明:** 只有当 `enable=true` 且 `ttl-second != 0` 且 `capacity > 0` 时,该模块缓存才会生效。 + +### 缓存模块 {#meta-cache-unified-modules} + +Hive Catalog 包含以下缓存模块: + +| 模块 (`<entry>`) | 属性键前缀 | 缓存内容与影响 | +|---|---|---| +| `schema` | `meta.cache.hive.schema.` | 
缓存表结构。影响:列新增、删除、类型变更在 Doris 中的可见性。若关闭,每次查询都会拉取最新 Schema。 | +| `partition_values` | `meta.cache.hive.partition_values.` | 缓存分区值/分区名称列表。影响:`SHOW PARTITIONS`、分区枚举、分区裁剪,以及外部新增/删除分区何时在 Doris 中可见。若关闭,可实时查看到分区变动。 | +| `partition` | `meta.cache.hive.partition.` | 缓存分区属性(Location、InputFormat、Serde 等)。影响:单个分区位置、格式、Serde 等属性变更的可见性。 | +| `file` | `meta.cache.hive.file.` | 缓存文件列表。影响:新增/删除文件、文件大小变化被 Doris 感知的时效性,同时减少远端 LIST 操作开销。若关闭,每次查询都会重新加载文件列表。 | + +### 旧参数映射与转换 {#meta-cache-mapping} + +在 4.1.x 之前,Hive 元数据缓存一部分通过 Catalog 兼容属性控制,一部分仍受 FE 全局缓存参数控制,详见[元数据缓存](../meta-cache.md)。升级到 4.1.x 后,建议显式改写为 `meta.cache.hive.*`,不要继续沿用旧键名。 + +| 4.1 前属性键 | 适用范围 | 4.1.x+ 统一键 | 升级建议与影响 | +|---|---|---|---| +| `schema.cache.ttl-second` | 4.1 前 Hive Catalog 兼容属性 | `meta.cache.hive.schema.ttl-second` | 仅对应 Schema 新鲜度。若希望列变更每次查询立即可见,设置为 `0`;`enable` 和 `capacity` 需按需单独配置。 | +| `partition.cache.ttl-second` | 4.1 前 Hive Catalog 兼容属性 | `meta.cache.hive.partition_values.ttl-second` | 仅对应分区列表新鲜度。若希望新增/删除分区每次查询立即可见,设置为 `0`。 | +| `file.meta.cache.ttl-second` | 4.1 前 Hive Catalog 兼容属性 | `meta.cache.hive.file.ttl-second` | 仅对应文件列表新鲜度。若希望新增/删除文件每次查询立即可见,设置为 `0`。 | + +`meta.cache.hive.partition.*` 是 4.1.x 中单独可调的新模块,4.1 前没有一一对应的 Catalog 级 TTL 键。若您关心分区 Location、Serde、InputFormat 等属性变更的可见性,需要在升级后单独设置它。 + +4.1.x 的统一模型把每个缓存拆分为 `enable`、`ttl-second`、`capacity` 三个维度;旧键只表达 TTL,不表达是否启用和容量上限。升级后如果只完成键名替换而不评估 `enable/capacity`,则其余行为会落到 4.1.x 的默认值。 + +### 最佳实践 {#meta-cache-best-practices} + +* **实时查看最新数据**:如果您希望每次查询都能看到外部数据源的最新分区或文件变动,可以将对应的 `ttl-second` 设置为 `0`。 + ```sql + -- 关闭文件列表缓存,实时看到文件变动 + ALTER CATALOG hive_ctl SET PROPERTIES ("meta.cache.hive.file.ttl-second" = "0"); + -- 关闭分区值缓存,实时看到新增分区 + ALTER CATALOG hive_ctl SET PROPERTIES ("meta.cache.hive.partition_values.ttl-second" = "0"); + ``` +* **性能优化**:对于元数据变动不频繁的场景,建议适当增大 `capacity` 和 `ttl-second` 以减少对 Hive Metastore 和文件系统的访问压力。 + +:::caution +**Hive Catalog 注意事项**:Hive 的 `meta.cache.hive.*` 属性修改**不支持热生效**。修改配置后,必须重建 Catalog 或重启 FE 节点才能应用新的缓存配置。 +::: + +### 可观测性 
{#meta-cache-unified-observability} + +可以通过 `information_schema.catalog_meta_cache_statistics` 系统表观测缓存指标: + +```sql +SELECT catalog_name, engine_name, entry_name, + effective_enabled, ttl_second, capacity, + estimated_size, hit_rate, load_failure_count, last_error +FROM information_schema.catalog_meta_cache_statistics +WHERE catalog_name = 'hive_ctl' AND engine_name = 'hive' +ORDER BY entry_name; +``` + +该系统表文档见:[catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md)。 + ### 支持的 Hive 版本 支持 Hive 1.x,2.x,3.x,4.x。 @@ -94,6 +170,14 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( > 注意:不同 Doris 版本所支持的服务类型和参数略有区别,请参考【基础示例】章节。 +### Hive Catalog 功能支持矩阵 + +| 元数据服务 | 表查询 | 视图查询 |DDL 操作| 数据写回| +| ---------- | -------- | -------- | -------- | -------- | +| Hive | ✅ | ✅ | ✅ | ✅ | +| AWS Glue | ✅ | ✅ | ❌ | ❌ | +| DLF | ✅ | ✅ | ✅ | ✅ | + ### 支持的存储系统 * [HDFS](../storages/hdfs.md) @@ -104,6 +188,8 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( * [Azure Blob](../storages/azure-blob.md) +* [Apache Ozone](../storages/ozone.md)(自 4.0.4 起支持) + * [阿里云 OSS](../storages/aliyun-oss.md) * [腾讯云 COS](../storages/tencent-cos.md) @@ -116,14 +202,6 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( > > 不同 Doris 版本所支持的服务类型和参数略有区别,请参考【基础示例】章节。 -### Hive Catalog 功能支持矩阵 - -| 元数据服务 | 表查询 | 视图查询 |DDL 操作| 数据写回| -| ---------- | -------- | -------- | -------- | -------- | -| Hive | ✅ | ✅ | ✅ | ✅ | -| AWS Glue | ✅ | ✅ | ❌ | ❌ | -| DLF | ✅ | ✅ | ✅ | ✅ | - ### 支持的数据格式 * Hive @@ -370,6 +448,21 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( ); ``` + + 自 4.0.4 起支持 + ```sql + CREATE CATALOG `hive_hms_on_ozone_new_catalog` PROPERTIES ( + 'type' = 'hms', + 'hive.metastore.uris' = 'thrift://127.0.0.1:9383', + 'fs.ozone.support' = 'true', + 'ozone.endpoint' = 'http://ozone-s3g:9878', + 'ozone.access_key' = '', + 'ozone.secret_key' = '', + 'ozone.region' = 'us-east-1', + 'ozone.use_path_style' = 'true' + ); + 
``` + ```sql CREATE CATALOG test_hive_on_hms_minio_catalog PROPERTIES ( @@ -1101,4 +1194,3 @@ DROP DATABASE [IF EXISTS] hive_ctl.hive_db; | -------- | ------------------------------------ | | 2.1.6 | 支持 Hive 表数据写回 | | 3.0.4 | 支持 JsonSerDe 格式的 Hive 表。支持 Hive4 的事务表。 | - diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/lakehouse/catalogs/hudi-catalog.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/lakehouse/catalogs/hudi-catalog.md index cb4ac7cc702bc..11aa62f55092f 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/lakehouse/catalogs/hudi-catalog.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/lakehouse/catalogs/hudi-catalog.md @@ -51,6 +51,74 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( | ------------------------------- | -------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | ----- | | `hudi.use_hive_sync_partition` | `use_hive_sync_partition` | 是否使用 Hive Metastore 已同步的分区信息。如果为 true,则会直接从 Hive Metastore 中获取分区信息。否则,会从文件系统的元数据文件中获取分区信息。通过 Hive Metastore 获取信息性能更好,但需要用户保证最新的元数据已经同步到了 Hive Metastore。 | false | +## 元数据缓存 {#meta-cache} + +为了提升访问外部数据源的性能,Apache Doris 会对 Hudi 的元数据进行缓存。元数据包括表结构(Schema)、分区信息、FS View 和 Meta Client 对象等。 + +:::tip +对于 Doris 4.1.x 之前的版本,元数据缓存主要由 FE 配置项全局控制,详情请参阅[元数据缓存](../meta-cache.md)。 +从 Doris 4.1.x 开始,Hudi 相关外表元数据缓存使用统一键 `meta.cache.*` 进行配置。 +::: + +### 缓存属性配置(4.1.x+) {#meta-cache-unified-model} + +各引擎 cache entry 使用统一的配置键格式:`meta.cache...{enable,ttl-second,capacity}`。 + +| 属性 | 示例 | 含义 | +|---|---|---| +| `enable` | `true/false` | 是否启用该缓存模块。 | +| `ttl-second` | `600`、`0`、`-1` | `0` 表示关闭缓存(即刻生效,可用于查看最新数据);`-1` 表示永不过期;其他正整数表示按访问时间计算 TTL(秒)。 | +| `capacity` | `10000` | 最大缓存条目数(按条目数量计)。`0` 表示关闭。 | + +**生效逻辑说明:** 只有当 `enable=true` 且 `ttl-second != 0` 且 `capacity > 0` 时,该模块缓存才会生效。 + +### 缓存模块 {#meta-cache-unified-modules} + +Hudi Catalog 包含以下缓存模块: 
+ +| 模块 (``) | 属性键前缀 | 缓存内容与影响 | +|---|---|---| +| `schema` | `meta.cache.hudi.schema.` | 缓存表结构。影响:列新增、删除、类型变更在 Doris 中的可见性。若关闭,每次查询都会拉取最新 Schema。 | +| `partition` | `meta.cache.hudi.partition.` | 缓存 Hudi 分区相关元数据。影响:分区发现、分区裁剪,以及新增/删除分区何时在 Doris 中可见。 | +| `fs_view` | `meta.cache.hudi.fs_view.` | 缓存 Hudi 文件系统视图相关元数据。影响:查询规划时选择到的最新 base file / log file 以及 file slice 视图的新鲜度。 | +| `meta_client` | `meta.cache.hudi.meta_client.` | 缓存 Hudi Meta Client 对象。影响:时间线(timeline)、表配置等底层元数据重新加载的频率,以及提交/表配置变更何时被感知。 | + +### 旧参数映射与转换 {#meta-cache-mapping} + +在 4.1.x 之前,Hudi 的 Schema 缓存有 Catalog 兼容属性,分区与表级元数据则主要遵循旧的 FE 全局缓存模型,详见[元数据缓存](../meta-cache.md)。升级到 4.1.x 后,建议统一改写为 `meta.cache.hudi.*`,并分别配置分区、FS View 和 Meta Client。 + +| 4.1 前属性键/旧模型 | 适用范围 | 4.1.x+ 统一键 | 升级建议与影响 | +|---|---|---|---| +| `schema.cache.ttl-second` | 4.1 前 Hudi Catalog 兼容属性 | `meta.cache.hudi.schema.ttl-second` | 控制 Schema 新鲜度。若希望列变更每次查询立即可见,设置为 `0`。 | +| Hudi 分区旧模型 | 4.1 前 FE 全局缓存策略(见旧版元数据缓存文档) | `meta.cache.hudi.partition.ttl-second` | 控制分区发现与分区可见性。若希望新增/删除分区每次查询立即可见,设置为 `0`。 | +| 无一一对应的旧 Catalog 键 | 4.1 前未单独暴露 `fs_view` / `meta_client` TTL | `meta.cache.hudi.fs_view.*`、`meta.cache.hudi.meta_client.*` | 这是 4.1.x 中拆分出的新模块。若希望更快感知最新 file slice 或提交时间线,分别调低对应 `ttl-second`。 | + +4.1.x 的统一模型把缓存拆分为 `enable`、`ttl-second`、`capacity` 三个维度;旧模型主要描述 TTL/全局缓存行为。升级后如果仍沿用旧理解,容易遗漏 `fs_view`、`meta_client` 这类新模块的单独配置。 + +### 最佳实践 {#meta-cache-best-practices} + +* **实时查看最新数据**:如果您希望每次查询都能看到 Hudi 表的最新数据变动或 Schema 变更,可以将 `schema` 或 `partition` 的 `ttl-second` 设置为 `0`。 + ```sql + -- 关闭分区元数据缓存,以感知 Hudi 表的最新分区变动 + ALTER CATALOG hudi_ctl SET PROPERTIES ("meta.cache.hudi.partition.ttl-second" = "0"); + ``` +* **性能优化**:`ALTER CATALOG ... 
SET PROPERTIES` 的修改在 Hudi 中支持热生效(通过 HMS catalog 属性更新路径)。 + +### 可观测性 {#meta-cache-unified-observability} + +可以通过 `information_schema.catalog_meta_cache_statistics` 系统表观测缓存指标: + +```sql +SELECT catalog_name, engine_name, entry_name, + effective_enabled, ttl_second, capacity, + estimated_size, hit_rate, load_failure_count, last_error +FROM information_schema.catalog_meta_cache_statistics +WHERE catalog_name = 'hudi_ctl' AND engine_name = 'hudi' +ORDER BY entry_name; +``` + +该系统表文档见:[catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md)。 + ### 支持的 Hudi 版本 当前依赖的 Hudi 版本为 0.15。推荐访问 0.14 版本以上的 Hudi 数据。 @@ -226,4 +294,3 @@ SELECT * from hudi_table@incr('beginTime'='xxx', ['endTime'='xxx'], ['hoodie.rea | Doris 版本 | 功能支持 | | ----------- | ----------------------------------------- | | 2.1.8/3.0.4 | Hudi 依赖升级到 0.15。新增 Hadoop Hudi JNI Scanner。 | - diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/lakehouse/catalogs/iceberg-catalog.mdx b/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/lakehouse/catalogs/iceberg-catalog.mdx index 018bc76a690be..2e167dbc9c7e6 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/lakehouse/catalogs/iceberg-catalog.mdx +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/lakehouse/catalogs/iceberg-catalog.mdx @@ -87,6 +87,82 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( CommonProperties 部分用于填写通用属性。请参阅[数据目录概述](../catalog-overview.md)中【通用属性】部分。 +## 元数据缓存 {#meta-cache} + +为了提升访问外部数据源的性能,Apache Doris 会对 Iceberg 的元数据进行缓存。元数据包括表结构(Schema)、表对象、View 对象和 Manifest 详情等。 + +:::tip +对于 Doris 4.1.x 之前的版本,元数据缓存主要由 FE 配置项全局控制,详情请参阅[元数据缓存](../meta-cache.md)。 +从 Doris 4.1.x 开始,Iceberg Catalog 的外表元数据缓存使用统一键 `meta.cache.*` 进行配置。 +::: + +### 缓存属性配置(4.1.x+) {#meta-cache-unified-model} + +各引擎 cache entry 使用统一的配置键格式:`meta.cache...{enable,ttl-second,capacity}`。 + +| 属性 | 示例 | 含义 | +|---|---|---| +| `enable` | `true/false` | 是否启用该缓存模块。 | +| `ttl-second` | 
`600`、`0`、`-1` | `0` 表示关闭缓存(即刻生效,可用于查看最新数据);`-1` 表示永不过期;其他正整数表示按访问时间计算 TTL(秒)。 | +| `capacity` | `10000` | 最大缓存条目数(按条目数量计)。`0` 表示关闭。 | + +**生效逻辑说明:** 只有当 `enable=true` 且 `ttl-second != 0` 且 `capacity > 0` 时,该模块缓存才会生效。 + +### 缓存模块 {#meta-cache-unified-modules} + +Iceberg Catalog 包含以下缓存模块: + +| 模块 (``) | 属性键前缀 | 缓存内容与影响 | +|---|---|---| +| `schema` | `meta.cache.iceberg.schema.` | 缓存表结构。影响:列新增、删除、类型变更在 Doris 中的可见性。若关闭,每次查询都会拉取最新 Schema。 | +| `table` | `meta.cache.iceberg.table.` | 缓存 Iceberg 表元数据对象。影响:最新 Snapshot、Partition Spec、Sort Order、表属性等表级元数据在 Doris 中的可见性;若关闭,每次规划都会重新加载表元数据。 | +| `view` | `meta.cache.iceberg.view.` | 缓存 Iceberg View 元数据对象。影响:View 定义、Schema、属性变更在 Doris 中的可见性。 | +| `manifest` | `meta.cache.iceberg.manifest.` | 缓存 Manifest 详情。主要影响查询规划时重复读取 Manifest 文件的开销,通常不直接决定表或 Snapshot 是否可见。注意:该模块默认关闭,需手动启用。 | + +### 旧参数映射与转换 {#meta-cache-mapping} + +在 4.1.x 之前,Iceberg 表级元数据主要受 FE 全局缓存策略控制,`schema.cache.ttl-second` 是常见的 Catalog 兼容属性;详见[元数据缓存](../meta-cache.md)。升级到 4.1.x 后,建议改写为 `meta.cache.iceberg.*`,并按需要分别配置表、View 和 Manifest 缓存。 + +| 4.1 前属性键/旧模型 | 适用范围 | 4.1.x+ 统一键 | 升级建议与影响 | +|---|---|---|---| +| `schema.cache.ttl-second` | 4.1 前 Iceberg Catalog 兼容属性 | `meta.cache.iceberg.schema.ttl-second` | 控制 Schema 新鲜度。若希望列变更每次查询立即可见,设置为 `0`。 | +| Iceberg 表信息旧模型 | 4.1 前 FE 全局缓存策略(见旧版元数据缓存文档) | `meta.cache.iceberg.table.ttl-second` | 控制表级元数据新鲜度。若希望每次查询都读取最新 Snapshot/表属性,设置为 `0`。 | +| 无一一对应的旧 Catalog 键 | 4.1 前未单独暴露 View / Manifest TTL | `meta.cache.iceberg.view.*`、`meta.cache.iceberg.manifest.*` | 这是 4.1.x 中拆分出的新模块。升级后如需保证最新 View 定义,单独调低 `view.ttl-second`;`manifest` 主要用于性能优化。 | + +4.1.x 的统一模型把缓存拆分为 `enable`、`ttl-second`、`capacity` 三个维度;旧模型主要描述 TTL/全局缓存行为,不覆盖这些新模块的独立开关和容量上限。升级时建议一并评估是否需要补充 `enable/capacity`。 + +### 最佳实践 {#meta-cache-best-practices} + +* **实时查看最新数据**:如果您希望每次查询都能看到 Iceberg 表的最新快照或 Schema 变动,可以将 `schema` 或 `table` 的 `ttl-second` 设置为 `0`。 + ```sql + -- 关闭表对象缓存,以便感知快照变动 + ALTER CATALOG iceberg_ctl SET PROPERTIES ("meta.cache.iceberg.table.ttl-second" = "0"); + 
``` +* **性能优化**: + * 启用 Manifest 缓存可以显著提升大表的查询规划速度: + ```sql + ALTER CATALOG iceberg_ctl SET PROPERTIES ( + "meta.cache.iceberg.manifest.enable" = "true", + "meta.cache.iceberg.manifest.ttl-second" = "600" + ); + ``` + * `ALTER CATALOG ... SET PROPERTIES` 的修改在 Iceberg Catalog 中支持热生效。 + +### 可观测性 {#meta-cache-unified-observability} + +可以通过 `information_schema.catalog_meta_cache_statistics` 系统表观测缓存指标: + +```sql +SELECT catalog_name, engine_name, entry_name, + effective_enabled, ttl_second, capacity, + estimated_size, hit_rate, load_failure_count, last_error +FROM information_schema.catalog_meta_cache_statistics +WHERE catalog_name = 'iceberg_ctl' AND engine_name = 'iceberg' +ORDER BY entry_name; +``` + +该系统表文档见:[catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md)。 + ### 支持的 Iceberg 版本 | Doris 版本 | Iceberg SDK 版本 | diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/lakehouse/catalogs/maxcompute-catalog.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/lakehouse/catalogs/maxcompute-catalog.md index d86e2b4c12be1..0479f4a61cbfb 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/lakehouse/catalogs/maxcompute-catalog.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/lakehouse/catalogs/maxcompute-catalog.md @@ -111,6 +111,71 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( CommonProperties 部分用于填写通用属性。请参阅[数据目录概述](../catalog-overview.md)中「通用属性」部分。 +## 元数据缓存 {#meta-cache} + +为了提升访问外部数据源的性能,Apache Doris 会对 MaxCompute 的元数据进行缓存。元数据包括表结构(Schema)和分区列表等。 + +:::tip +对于 Doris 4.1.x 之前的版本,元数据缓存主要由 FE 配置项全局控制,详情请参阅[元数据缓存](../meta-cache.md)。 +从 Doris 4.1.x 开始,MaxCompute Catalog 的外表元数据缓存使用统一键 `meta.cache.*` 进行配置。 +::: + +### 缓存属性配置(4.1.x+) {#meta-cache-unified-model} + +各引擎 cache entry 使用统一的配置键格式:`meta.cache...{enable,ttl-second,capacity}`。 + +| 属性 | 示例 | 含义 | +|---|---|---| +| `enable` | `true/false` | 是否启用该缓存模块。 | +| `ttl-second` | `600`、`0`、`-1` | `0` 
表示关闭缓存(即刻生效,可用于查看最新数据);`-1` 表示永不过期;其他正整数表示按访问时间计算 TTL(秒)。 | +| `capacity` | `10000` | 最大缓存条目数(按条目数量计)。`0` 表示关闭。 | + +**生效逻辑说明:** 只有当 `enable=true` 且 `ttl-second != 0` 且 `capacity > 0` 时,该模块缓存才会生效。 + +### 缓存模块 {#meta-cache-unified-modules} + +MaxCompute Catalog 包含以下缓存模块: + +| 模块 (``) | 属性键前缀 | 缓存内容与影响 | +|---|---|---| +| `schema` | `meta.cache.maxcompute.schema.` | 缓存表结构。影响:列新增、删除、类型变更在 Doris 中的可见性。若关闭,每次查询都会拉取最新 Schema。 | +| `partition_values` | `meta.cache.maxcompute.partition_values.` | 缓存分区值列表。影响:分区裁剪、分区枚举,以及新增/删除分区何时在 Doris 中可见。若关闭,可实时查看到分区变动。 | + +### 旧参数映射与转换 {#meta-cache-mapping} + +在 4.1.x 之前,MaxCompute 的 Schema 和分区相关缓存主要通过 Catalog 兼容属性或 FE 全局缓存策略控制。升级到 4.1.x 后,建议统一改写为 `meta.cache.maxcompute.*`。 + +| 4.1 前属性键/旧模型 | 适用范围 | 4.1.x+ 统一键 | 升级建议与影响 | +|---|---|---|---| +| `schema.cache.ttl-second` | 4.1 前 MaxCompute Catalog 兼容属性 | `meta.cache.maxcompute.schema.ttl-second` | 控制 Schema 新鲜度。若希望列变更每次查询立即可见,设置为 `0`。 | +| MaxCompute 分区值旧模型 | 4.1 前 FE 全局缓存策略 | `meta.cache.maxcompute.partition_values.ttl-second` | 控制分区枚举与分区可见性。若希望新增/删除分区每次查询立即可见,设置为 `0`。 | + +4.1.x 的统一模型把缓存拆分为 `enable`、`ttl-second`、`capacity` 三个维度;旧模型主要表达 TTL/全局缓存行为。升级后如果只迁移 TTL 而不评估 `enable/capacity`,其余行为会使用 4.1.x 的默认值。 + +### 最佳实践 {#meta-cache-best-practices} + +* **实时查看最新数据**:如果您希望每次查询都能看到 MaxCompute 表的最新分区变动或 Schema 变更,可以将 `schema` 或 `partition_values` 的 `ttl-second` 设置为 `0`。 + ```sql + -- 关闭分区值缓存,以感知 MaxCompute 表的最新分区 + ALTER CATALOG mc_ctl SET PROPERTIES ("meta.cache.maxcompute.partition_values.ttl-second" = "0"); + ``` +* **注意**:`meta.cache.maxcompute.*` 目前没有专门的热生效 hook。修改配置后,建议重建 Catalog 或重启 FE 以确保生效。 + +### 可观测性 {#meta-cache-unified-observability} + +可以通过 `information_schema.catalog_meta_cache_statistics` 系统表观测缓存指标: + +```sql +SELECT catalog_name, engine_name, entry_name, + effective_enabled, ttl_second, capacity, + estimated_size, hit_rate, load_failure_count, last_error +FROM information_schema.catalog_meta_cache_statistics +WHERE catalog_name = 'mc_ctl' AND engine_name = 'maxcompute' +ORDER 
BY entry_name; +``` + +该系统表文档见:[catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md)。 + ### 支持的 MaxCompute 版本 仅支持公有云版本的 MaxCompute。私有云版本支持请联系 Doris 社区支持。 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/lakehouse/catalogs/paimon-catalog.mdx b/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/lakehouse/catalogs/paimon-catalog.mdx index e124af6bc5e0e..227223021b55f 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/lakehouse/catalogs/paimon-catalog.mdx +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/lakehouse/catalogs/paimon-catalog.mdx @@ -15,8 +15,6 @@ Doris 支持通过多种元数据服务访问 Paimon 表元数据,并进行 Pa [使用 Docker 快速体验 Apache Doris & Paimon](../best-practices/doris-paimon.md) - - ## 适用场景 | 场景 | 说明 | @@ -92,6 +90,71 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( CommonProperties 部分用于填写通用属性。请参阅[数据目录概述](../catalog-overview.md)中【通用属性】部分。 +## 元数据缓存 {#meta-cache} + +为了提升访问外部数据源的性能,Apache Doris 会对 Paimon 的元数据进行缓存。元数据包括表结构(Schema)和表对象等。 + +:::tip +对于 Doris 4.1.x 之前的版本,元数据缓存主要由 FE 配置项全局控制,详情请参阅[元数据缓存](../meta-cache.md)。 +从 Doris 4.1.x 开始,Paimon Catalog 的外表元数据缓存使用统一键 `meta.cache.*` 进行配置。 +::: + +### 缓存属性配置(4.1.x+) {#meta-cache-unified-model} + +各引擎 cache entry 使用统一的配置键格式:`meta.cache...{enable,ttl-second,capacity}`。 + +| 属性 | 示例 | 含义 | +|---|---|---| +| `enable` | `true/false` | 是否启用该缓存模块。 | +| `ttl-second` | `600`、`0`、`-1` | `0` 表示关闭缓存(即刻生效,可用于查看最新数据);`-1` 表示永不过期;其他正整数表示按访问时间计算 TTL(秒)。 | +| `capacity` | `10000` | 最大缓存条目数(按条目数量计)。`0` 表示关闭。 | + +**生效逻辑说明:** 只有当 `enable=true` 且 `ttl-second != 0` 且 `capacity > 0` 时,该模块缓存才会生效。 + +### 缓存模块 {#meta-cache-unified-modules} + +Paimon Catalog 包含以下缓存模块: + +| 模块 (``) | 属性键前缀 | 缓存内容与影响 | +|---|---|---| +| `schema` | `meta.cache.paimon.schema.` | 缓存表结构。影响:列新增、删除、类型变更在 Doris 中的可见性。若关闭,每次查询都会拉取最新 Schema。 | +| `table` | `meta.cache.paimon.table.` | 缓存 Paimon 表元数据对象。影响:最新 Snapshot、Schema 演进、分支/标签引用等表级元数据在 Doris 
中的可见性,同时减少查询规划时的元数据加载开销。 | + +### 旧参数映射与转换 {#meta-cache-mapping} + +在 4.1.x 之前,Paimon 的 Schema 与表级元数据主要遵循旧的 Catalog 兼容属性或 FE 全局缓存模型,详见[元数据缓存](../meta-cache.md)。升级到 4.1.x 后,建议改写为 `meta.cache.paimon.*`,并单独评估表级缓存是否需要更强的新鲜度。 + +| 4.1 前属性键/旧模型 | 适用范围 | 4.1.x+ 统一键 | 升级建议与影响 | +|---|---|---|---| +| `schema.cache.ttl-second` | 4.1 前 Paimon Catalog 兼容属性 | `meta.cache.paimon.schema.ttl-second` | 控制 Schema 新鲜度。若希望列变更每次查询立即可见,设置为 `0`。 | +| Paimon 表级旧模型 | 4.1 前 FE 全局缓存策略(见旧版元数据缓存文档中的旧表级元数据模型说明) | `meta.cache.paimon.table.ttl-second` | 控制最新 Snapshot/表级元数据的可见性。若希望每次查询都读取最新表快照,设置为 `0`。 | + +4.1.x 的统一模型把缓存拆分为 `enable`、`ttl-second`、`capacity` 三个维度;旧模型主要表达 TTL/全局缓存行为。升级后建议不要只替换键名,还要同时评估是否需要显式配置 `enable/capacity`。 + +### 最佳实践 {#meta-cache-best-practices} + +* **实时查看最新数据**:如果您希望每次查询都能看到 Paimon 表的最新数据变动或 Schema 变更,可以将 `schema` 或 `table` 的 `ttl-second` 设置为 `0`。 + ```sql + -- 关闭表对象缓存,以感知 Paimon 表的最新快照 + ALTER CATALOG paimon_ctl SET PROPERTIES ("meta.cache.paimon.table.ttl-second" = "0"); + ``` +* **性能优化**:`ALTER CATALOG ... 
SET PROPERTIES` 的修改在 Paimon Catalog 中支持热生效。 + +### 可观测性 {#meta-cache-unified-observability} + +可以通过 `information_schema.catalog_meta_cache_statistics` 系统表观测缓存指标: + +```sql +SELECT catalog_name, engine_name, entry_name, + effective_enabled, ttl_second, capacity, + estimated_size, hit_rate, load_failure_count, last_error +FROM information_schema.catalog_meta_cache_statistics +WHERE catalog_name = 'paimon_ctl' AND engine_name = 'paimon' +ORDER BY entry_name; +``` + +该系统表文档见:[catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md)。 + ### 支持的 Paimon 版本 当前依赖的 Paimon 版本为 1.0.0。 @@ -120,6 +183,8 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( * [Google Cloud Storage](../storages/gcs.md) +* [Apache Ozone](../storages/ozone.md)(自 4.0.4 起支持) + * [阿里云 OSS](../storages/aliyun-oss.md) * [腾讯云 COS](../storages/tencent-cos.md) @@ -269,6 +334,23 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( ); ``` + + 自 4.0.4 起支持 + ```sql + CREATE CATALOG paimon_hms_on_ozone_catalog PROPERTIES ( + 'type' = 'paimon', + 'paimon.catalog.type' = 'hms', + 'warehouse' = 's3a://test-bucket/paimon-warehouse', + 'hive.metastore.uris' = 'thrift://127.0.0.1:9383', + 'fs.ozone.support' = 'true', + 'ozone.endpoint' = 'http://ozone-s3g:9878', + 'ozone.access_key' = '', + 'ozone.secret_key' = '', + 'ozone.region' = 'us-east-1', + 'ozone.use_path_style' = 'true' + ); + ``` + ```sql CREATE CATALOG paimon_hms_on_minio_catalog PROPERTIES ( @@ -439,9 +521,6 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( -aaa -[test](./a.md) -
2.1 & 3.0 版本 @@ -465,7 +544,6 @@ aaa
- ### FileSystem Metastore
diff --git a/versioned_docs/version-3.x/lakehouse/catalogs/hive-catalog.mdx b/versioned_docs/version-3.x/lakehouse/catalogs/hive-catalog.mdx index 190dd3bce7c7d..c203e44e468dd 100644 --- a/versioned_docs/version-3.x/lakehouse/catalogs/hive-catalog.mdx +++ b/versioned_docs/version-3.x/lakehouse/catalogs/hive-catalog.mdx @@ -76,6 +76,78 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( The CommonProperties section is for entering common attributes. Please see the "Common Properties" section in the [Catalog Overview](../catalog-overview.md). +## Metadata Cache {#meta-cache} + +To improve the performance of accessing external data sources, Apache Doris caches Hive metadata. Metadata includes table structure (Schema), partition lists, partition properties, and file lists. + +:::tip +For versions before Doris 4.1.x, metadata caching is mainly controlled globally by FE configuration items. For details, see [Metadata Cache](../meta-cache.md). +Starting from Doris 4.1.x, Hive Catalog's external metadata cache is configured using the unified `meta.cache.*` keys. +::: + +### Cache Property Configuration (4.1.x+) {#meta-cache-unified-model} + +Each engine's cache entry uses a unified configuration key format: `meta.cache.<engine>.<entry>.{enable,ttl-second,capacity}`. + +| Property | Example | Meaning | +|---|---|---| +| `enable` | `true/false` | Whether to enable this cache module. | +| `ttl-second` | `600`, `0`, `-1` | `0` means disable cache (takes effect immediately, can be used to see the latest data); `-1` means never expire; other positive integers mean TTL in seconds based on access time. | +| `capacity` | `10000` | Maximum number of cache entries (by count). `0` means disable. | + +**Effective Logic:** The module cache only takes effect when `enable=true`, `ttl-second != 0`, and `capacity > 0`. 
+ +### Cache Modules {#meta-cache-unified-modules} + +Hive Catalog includes the following cache modules: + +| Module (`<entry>`) | Property Key Prefix | Cached Content and Impact | +|---|---|---| +| `schema` | `meta.cache.hive.schema.` | Caches table structure. Impact: Visibility of table column information. If disabled, the latest Schema is pulled for each query. | +| `partition_values` | `meta.cache.hive.partition_values.` | Caches partition values/names list. Impact: Partition pruning and enumeration. If disabled, new external partitions can be seen in real-time. | +| `partition` | `meta.cache.hive.partition.` | Caches partition properties (Location, input format, etc.). Impact: Specific metadata of partitions. | +| `file` | `meta.cache.hive.file.` | Caches file lists. Impact: Reduces remote LIST operation overhead. If disabled, file changes can be seen in real-time. | + +### Legacy Parameter Mapping and Conversion {#meta-cache-mapping} + +In version 4.1.x and later, unified keys are recommended. The following is the mapping between legacy Catalog properties and 4.1.x+ unified keys: + +| Legacy Property Key | 4.1.x+ Unified Key | Description | +|---|---|---| +| `schema.cache.ttl-second` | `meta.cache.hive.schema.ttl-second` | Expiration time of table structure cache | +| `partition.cache.ttl-second` | `meta.cache.hive.partition_values.ttl-second` | Expiration time of partition value cache | +| `file.meta.cache.ttl-second` | `meta.cache.hive.file.ttl-second` | Expiration time of file list cache | + +### Best Practices {#meta-cache-best-practices} + +* **Real-time access to the latest data**: If you want each query to see the latest partition or file changes in the external data source, you can set the corresponding `ttl-second` to `0`. 
+ ```sql + -- Disable file list cache to see file changes in real-time + ALTER CATALOG hive_ctl SET PROPERTIES ("meta.cache.hive.file.ttl-second" = "0"); + -- Disable partition value cache to see new partitions in real-time + ALTER CATALOG hive_ctl SET PROPERTIES ("meta.cache.hive.partition_values.ttl-second" = "0"); + ``` +* **Performance optimization**: For scenarios where metadata changes are infrequent, it is recommended to appropriately increase `capacity` and `ttl-second` to reduce access pressure on Hive Metastore and file systems. + +:::caution +**Hive Catalog Note**: Changes to `meta.cache.hive.*` properties **do not support hot-reload**. To ensure new configurations take effect, you must recreate the catalog or restart the FE node. +::: + +### Observability {#meta-cache-unified-observability} + +Cache metrics can be observed through the `information_schema.catalog_meta_cache_statistics` system table: + +```sql +SELECT catalog_name, engine_name, entry_name, + effective_enabled, ttl_second, capacity, + estimated_size, hit_rate, load_failure_count, last_error +FROM information_schema.catalog_meta_cache_statistics +WHERE catalog_name = 'hive_ctl' AND engine_name = 'hive' +ORDER BY entry_name; +``` + +See the documentation for this system table: [catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md). + ### Supported Hive Versions Supports Hive 1.x, 2.x, 3.x, and 4.x. @@ -104,6 +176,7 @@ Hive transactional tables are supported from version 3.x onwards. For details, r * [AWS S3](../storages/s3.md) * [Google Cloud Storage](../storages/gcs.md) * [Azure Blob](../storages/azure-blob.md) +* [Apache Ozone](../storages/ozone.md) (supported since 4.0.4) * [Alibaba Cloud OSS](../storages/aliyun-oss.md) * [Tencent Cloud COS](../storages/tencent-cos.md) * [Huawei Cloud OBS](../storages/huawei-obs.md) @@ -359,6 +432,21 @@ Hive transactional tables are supported from version 3.x onwards. 
For details, r ); ``` + + Supported since 4.0.4 + ```sql + CREATE CATALOG `hive_hms_on_ozone_new_catalog` PROPERTIES ( + 'type' = 'hms', + 'hive.metastore.uris' = 'thrift://127.0.0.1:9383', + 'fs.ozone.support' = 'true', + 'ozone.endpoint' = 'http://ozone-s3g:9878', + 'ozone.access_key' = '', + 'ozone.secret_key' = '', + 'ozone.region' = 'us-east-1', + 'ozone.use_path_style' = 'true' + ); + ``` + ```sql CREATE CATALOG test_hive_on_hms_minio_catalog PROPERTIES ( diff --git a/versioned_docs/version-3.x/lakehouse/catalogs/hudi-catalog.md b/versioned_docs/version-3.x/lakehouse/catalogs/hudi-catalog.md index 22b8f227ac30d..b476e54d67bf3 100644 --- a/versioned_docs/version-3.x/lakehouse/catalogs/hudi-catalog.md +++ b/versioned_docs/version-3.x/lakehouse/catalogs/hudi-catalog.md @@ -51,6 +51,70 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( | ------------------------------- | -------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | | `hudi.use_hive_sync_partition` | `use_hive_sync_partition` | Whether to use the partition information already synchronized by Hive Metastore. If true, partition information will be obtained directly from Hive Metastore. Otherwise, it will be obtained from the metadata file of the file system. Obtaining information from Hive Metastore is more efficient, but users need to ensure that the latest metadata has been synchronized to Hive Metastore. | false | +## Metadata Cache {#meta-cache} + +To improve the performance of accessing external data sources, Apache Doris caches Hudi metadata. Metadata includes table structure (Schema), partition information, FS View, and Meta Client objects. + +:::tip +For versions before Doris 4.1.x, metadata caching is mainly controlled globally by FE configuration items. For details, see [Metadata Cache](../meta-cache.md). 
+Starting from Doris 4.1.x, Hudi-related external metadata cache is configured using the unified `meta.cache.*` keys. +::: + +### Cache Property Configuration (4.1.x+) {#meta-cache-unified-model} + +Each engine's cache entry uses a unified configuration key format: `meta.cache.<engine>.<entry>.{enable,ttl-second,capacity}`. + +| Property | Example | Meaning | +|---|---|---| +| `enable` | `true/false` | Whether to enable this cache module. | +| `ttl-second` | `600`, `0`, `-1` | `0` means disable cache (takes effect immediately, can be used to see the latest data); `-1` means never expire; other positive integers mean TTL in seconds based on access time. | +| `capacity` | `10000` | Maximum number of cache entries (by count). `0` means disable. | + +**Effective Logic:** The module cache only takes effect when `enable=true`, `ttl-second != 0`, and `capacity > 0`. + +### Cache Modules {#meta-cache-unified-modules} + +Hudi Catalog includes the following cache modules: + +| Module (`<entry>`) | Property Key Prefix | Cached Content and Impact | +|---|---|---| +| `schema` | `meta.cache.hudi.schema.` | Caches table structure. Impact: Visibility of table column information. If disabled, the latest Schema is pulled for each query. | +| `partition` | `meta.cache.hudi.partition.` | Caches Hudi partition-related metadata. Impact: Used for partition discovery and pruning. | +| `fs_view` | `meta.cache.hudi.fs_view.` | Caches Hudi filesystem view related metadata. | +| `meta_client` | `meta.cache.hudi.meta_client.` | Caches Hudi Meta Client objects. Impact: Reduces redundant loading of Hudi metadata. | + +### Legacy Parameter Mapping and Conversion {#meta-cache-mapping} + +In version 4.1.x and later, unified keys are recommended. 
The following is the mapping between legacy Catalog properties and 4.1.x+ unified keys: + +| Legacy Property Key | 4.1.x+ Unified Key | Description | +|---|---|---| +| `schema.cache.ttl-second` | `meta.cache.hudi.schema.ttl-second` | Expiration time of table structure cache | + +### Best Practices {#meta-cache-best-practices} + +* **Real-time access to the latest data**: If you want each query to see the latest data changes or schema changes for Hudi tables, you can set the `ttl-second` for `schema` or `partition` to `0`. + ```sql + -- Disable partition metadata cache to detect the latest partition changes in Hudi tables + ALTER CATALOG hudi_ctl SET PROPERTIES ("meta.cache.hudi.partition.ttl-second" = "0"); + ``` +* **Performance optimization**: Changes via `ALTER CATALOG ... SET PROPERTIES` support hot-reload in Hudi (via the HMS catalog property update path). + +### Observability {#meta-cache-unified-observability} + +Cache metrics can be observed through the `information_schema.catalog_meta_cache_statistics` system table: + +```sql +SELECT catalog_name, engine_name, entry_name, + effective_enabled, ttl_second, capacity, + estimated_size, hit_rate, load_failure_count, last_error +FROM information_schema.catalog_meta_cache_statistics +WHERE catalog_name = 'hudi_ctl' AND engine_name = 'hudi' +ORDER BY entry_name; +``` + +See the documentation for this system table: [catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md). + ### Supported Hudi Versions The current dependent Hudi version is 0.15. It is recommended to access Hudi data version 0.14 and above. 
diff --git a/versioned_docs/version-3.x/lakehouse/catalogs/iceberg-catalog.mdx b/versioned_docs/version-3.x/lakehouse/catalogs/iceberg-catalog.mdx index 4e68627f658ad..7067b258be77a 100644 --- a/versioned_docs/version-3.x/lakehouse/catalogs/iceberg-catalog.mdx +++ b/versioned_docs/version-3.x/lakehouse/catalogs/iceberg-catalog.mdx @@ -85,6 +85,78 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( The CommonProperties section is for entering general properties. See the [Catalog Overview](../catalog-overview.md) for details on common properties. +## Metadata Cache {#meta-cache} + +To improve the performance of accessing external data sources, Apache Doris caches Iceberg metadata. Metadata includes table structure (Schema), table objects, view objects, and manifest details. + +:::tip +For versions before Doris 4.1.x, metadata caching is mainly controlled globally by FE configuration items. For details, see [Metadata Cache](../meta-cache.md). +Starting from Doris 4.1.x, Iceberg Catalog's external metadata cache is configured using the unified `meta.cache.*` keys. +::: + +### Cache Property Configuration (4.1.x+) {#meta-cache-unified-model} + +Each engine's cache entry uses a unified configuration key format: `meta.cache.<engine>.<entry>.{enable,ttl-second,capacity}`. + +| Property | Example | Meaning | +|---|---|---| +| `enable` | `true/false` | Whether to enable this cache module. | +| `ttl-second` | `600`, `0`, `-1` | `0` means disable cache (takes effect immediately, can be used to see the latest data); `-1` means never expire; other positive integers mean TTL in seconds based on access time. | +| `capacity` | `10000` | Maximum number of cache entries (by count). `0` means disable. | + +**Effective Logic:** The module cache only takes effect when `enable=true`, `ttl-second != 0`, and `capacity > 0`. 
+ +### Cache Modules {#meta-cache-unified-modules} + +Iceberg Catalog includes the following cache modules: + +| Module (`<entry>`) | Property Key Prefix | Cached Content and Impact | +|---|---|---| +| `schema` | `meta.cache.iceberg.schema.` | Caches table structure. Impact: Visibility of table column information. If disabled, the latest Schema is pulled for each query. | +| `table` | `meta.cache.iceberg.table.` | Caches Iceberg table metadata objects. Impact: Reduces Catalog/Metastore round-trips. | +| `view` | `meta.cache.iceberg.view.` | Caches Iceberg View metadata objects. | +| `manifest` | `meta.cache.iceberg.manifest.` | Caches manifest details. Impact: Reduces repeated manifest access overhead. Note: This module is disabled by default and must be enabled manually. | + +### Legacy Parameter Mapping and Conversion {#meta-cache-mapping} + +In version 4.1.x and later, unified keys are recommended. The following is the mapping between legacy Catalog properties and 4.1.x+ unified keys: + +| Legacy Property Key | 4.1.x+ Unified Key | Description | +|---|---|---| +| `schema.cache.ttl-second` | `meta.cache.iceberg.schema.ttl-second` | Expiration time of table structure cache | + +### Best Practices {#meta-cache-best-practices} + +* **Real-time access to the latest data**: If you want each query to see the latest snapshots or schema changes for Iceberg tables, you can set the `ttl-second` for `schema` or `table` to `0`. + ```sql + -- Disable table object cache to detect snapshot changes + ALTER CATALOG iceberg_ctl SET PROPERTIES ("meta.cache.iceberg.table.ttl-second" = "0"); + ``` +* **Performance optimization**: + * Enabling manifest cache can significantly speed up query planning for large tables: + ```sql + ALTER CATALOG iceberg_ctl SET PROPERTIES ( + "meta.cache.iceberg.manifest.enable" = "true", + "meta.cache.iceberg.manifest.ttl-second" = "600" + ); + ``` + * Changes via `ALTER CATALOG ... SET PROPERTIES` support hot-reload in Iceberg Catalog. 
+ +### Observability {#meta-cache-unified-observability} + +Cache metrics can be observed through the `information_schema.catalog_meta_cache_statistics` system table: + +```sql +SELECT catalog_name, engine_name, entry_name, + effective_enabled, ttl_second, capacity, + estimated_size, hit_rate, load_failure_count, last_error +FROM information_schema.catalog_meta_cache_statistics +WHERE catalog_name = 'iceberg_ctl' AND engine_name = 'iceberg' +ORDER BY entry_name; +``` + +See the documentation for this system table: [catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md). + ### Supported Iceberg Versions | Doris Version | Iceberg SDK Version | diff --git a/versioned_docs/version-3.x/lakehouse/catalogs/maxcompute-catalog.md b/versioned_docs/version-3.x/lakehouse/catalogs/maxcompute-catalog.md index e611f96f12f0e..4cf40f7f35633 100644 --- a/versioned_docs/version-3.x/lakehouse/catalogs/maxcompute-catalog.md +++ b/versioned_docs/version-3.x/lakehouse/catalogs/maxcompute-catalog.md @@ -111,6 +111,68 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( The CommonProperties section is used to fill in common properties. Please refer to the "Common Properties" section in [Catalog Overview](../catalog-overview.md). +## Metadata Cache {#meta-cache} + +To improve the performance of accessing external data sources, Apache Doris caches MaxCompute metadata. Metadata includes table structure (Schema) and partition lists. + +:::tip +For versions before Doris 4.1.x, metadata caching is mainly controlled globally by FE configuration items. For details, see [Metadata Cache](../meta-cache.md). +Starting from Doris 4.1.x, MaxCompute Catalog's external metadata cache is configured using the unified `meta.cache.*` keys. +::: + +### Cache Property Configuration (4.1.x+) {#meta-cache-unified-model} + +Each engine's cache entry uses a unified configuration key format: `meta.cache.<engine>.<entry>.{enable,ttl-second,capacity}`. 
+ +| Property | Example | Meaning | +|---|---|---| +| `enable` | `true/false` | Whether to enable this cache module. | +| `ttl-second` | `600`, `0`, `-1` | `0` means disable cache (takes effect immediately, can be used to see the latest data); `-1` means never expire; other positive integers mean TTL in seconds based on access time. | +| `capacity` | `10000` | Maximum number of cache entries (by count). `0` means disable. | + +**Effective Logic:** The module cache only takes effect when `enable=true`, `ttl-second != 0`, and `capacity > 0`. + +### Cache Modules {#meta-cache-unified-modules} + +MaxCompute Catalog includes the following cache modules: + +| Module (`<entry>`) | Property Key Prefix | Cached Content and Impact | +|---|---|---| +| `schema` | `meta.cache.maxcompute.schema.` | Caches table structure. Impact: Visibility of table column information. If disabled, the latest Schema is pulled for each query. | +| `partition_values` | `meta.cache.maxcompute.partition_values.` | Caches partition value lists. Impact: Partition pruning and enumeration. If disabled, new external partitions can be seen in real-time. | + +### Legacy Parameter Mapping and Conversion {#meta-cache-mapping} + +In version 4.1.x and later, unified keys are recommended. The following is the mapping between legacy Catalog properties and 4.1.x+ unified keys: + +| Legacy Property Key | 4.1.x+ Unified Key | Description | +|---|---|---| +| `schema.cache.ttl-second` | `meta.cache.maxcompute.schema.ttl-second` | Expiration time of table structure cache | + +### Best Practices {#meta-cache-best-practices} + +* **Real-time access to the latest data**: If you want each query to see the latest partition or schema changes for MaxCompute tables, you can set the `ttl-second` for `schema` or `partition_values` to `0`. 
+ ```sql + -- Disable partition value cache to detect the latest partitions in MaxCompute tables + ALTER CATALOG mc_ctl SET PROPERTIES ("meta.cache.maxcompute.partition_values.ttl-second" = "0"); + ``` +* **Note**: `meta.cache.maxcompute.*` currently does not have a dedicated hot-reload hook. After changing the configuration, it is recommended to recreate the Catalog or restart FE to ensure it takes effect. + +### Observability {#meta-cache-unified-observability} + +Cache metrics can be observed through the `information_schema.catalog_meta_cache_statistics` system table: + +```sql +SELECT catalog_name, engine_name, entry_name, + effective_enabled, ttl_second, capacity, + estimated_size, hit_rate, load_failure_count, last_error +FROM information_schema.catalog_meta_cache_statistics +WHERE catalog_name = 'mc_ctl' AND engine_name = 'maxcompute' +ORDER BY entry_name; +``` + +See the documentation for this system table: [catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md). + ### Supported MaxCompute Versions Only the public cloud version of MaxCompute is supported. For private cloud version support, please contact Doris community support. diff --git a/versioned_docs/version-3.x/lakehouse/catalogs/paimon-catalog.mdx b/versioned_docs/version-3.x/lakehouse/catalogs/paimon-catalog.mdx index dca01450d13ba..087bc4dcd5c71 100644 --- a/versioned_docs/version-3.x/lakehouse/catalogs/paimon-catalog.mdx +++ b/versioned_docs/version-3.x/lakehouse/catalogs/paimon-catalog.mdx @@ -90,6 +90,68 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( The CommonProperties section is used to fill in common properties. Please refer to the [Catalog Overview](../catalog-overview.md) section on [Common Properties]. +## Metadata Cache {#meta-cache} + +To improve the performance of accessing external data sources, Apache Doris caches Paimon metadata. Metadata includes table structure (Schema) and table objects. 
+ +:::tip +For versions before Doris 4.1.x, metadata caching is mainly controlled globally by FE configuration items. For details, see [Metadata Cache](../meta-cache.md). +Starting from Doris 4.1.x, Paimon Catalog's external metadata cache is configured using the unified `meta.cache.*` keys. +::: + +### Cache Property Configuration (4.1.x+) {#meta-cache-unified-model} + +Each engine's cache entry uses a unified configuration key format: `meta.cache.<engine>.<entry>.{enable,ttl-second,capacity}`. + +| Property | Example | Meaning | +|---|---|---| +| `enable` | `true/false` | Whether to enable this cache module. | +| `ttl-second` | `600`, `0`, `-1` | `0` means disable cache (takes effect immediately, can be used to see the latest data); `-1` means never expire; other positive integers mean TTL in seconds based on access time. | +| `capacity` | `10000` | Maximum number of cache entries (by count). `0` means disable. | + +**Effective Logic:** The module cache only takes effect when `enable=true`, `ttl-second != 0`, and `capacity > 0`. + +### Cache Modules {#meta-cache-unified-modules} + +Paimon Catalog includes the following cache modules: + +| Module (`<entry>`) | Property Key Prefix | Cached Content and Impact | +|---|---|---| +| `schema` | `meta.cache.paimon.schema.` | Caches table structure. Impact: Visibility of table column information. If disabled, the latest Schema is pulled for each query. | +| `table` | `meta.cache.paimon.table.` | Caches Paimon table metadata objects. Impact: Reduces metadata loading overhead during query planning. | + +### Legacy Parameter Mapping and Conversion {#meta-cache-mapping} + +In version 4.1.x and later, unified keys are recommended. 
The following is the mapping between legacy Catalog properties and 4.1.x+ unified keys: + +| Legacy Property Key | 4.1.x+ Unified Key | Description | +|---|---|---| +| `schema.cache.ttl-second` | `meta.cache.paimon.schema.ttl-second` | Expiration time of table structure cache | + +### Best Practices {#meta-cache-best-practices} + +* **Real-time access to the latest data**: If you want each query to see the latest data changes or schema changes for Paimon tables, you can set the `ttl-second` for `schema` or `table` to `0`. + ```sql + -- Disable table object cache to detect the latest snapshots of Paimon tables + ALTER CATALOG paimon_ctl SET PROPERTIES ("meta.cache.paimon.table.ttl-second" = "0"); + ``` +* **Performance optimization**: Changes via `ALTER CATALOG ... SET PROPERTIES` support hot-reload in Paimon Catalog. + +### Observability {#meta-cache-unified-observability} + +Cache metrics can be observed through the `information_schema.catalog_meta_cache_statistics` system table: + +```sql +SELECT catalog_name, engine_name, entry_name, + effective_enabled, ttl_second, capacity, + estimated_size, hit_rate, load_failure_count, last_error +FROM information_schema.catalog_meta_cache_statistics +WHERE catalog_name = 'paimon_ctl' AND engine_name = 'paimon' +ORDER BY entry_name; +``` + +See the documentation for this system table: [catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md). + ### Supported Paimon Versions The currently dependent Paimon version is 1.0.0. @@ -118,6 +180,8 @@ The currently dependent Paimon version is 1.0.0. 
* [Google Cloud Storage](../storages/gcs.md) +* [Apache Ozone](../storages/ozone.md) (supported since 4.0.4) + * [Alibaba Cloud OSS](../storages/aliyun-oss.md) * [Tencent Cloud COS](../storages/tencent-cos.md) @@ -265,6 +329,23 @@ Supported since version 4.0.3, `timestamp_with_local_time_zone` can be mapped to ); ``` + + Supported since 4.0.4 + ```sql + CREATE CATALOG paimon_hms_on_ozone_catalog PROPERTIES ( + 'type' = 'paimon', + 'paimon.catalog.type' = 'hms', + 'warehouse' = 's3a://test-bucket/paimon-warehouse', + 'hive.metastore.uris' = 'thrift://127.0.0.1:9383', + 'fs.ozone.support' = 'true', + 'ozone.endpoint' = 'http://ozone-s3g:9878', + 'ozone.access_key' = '', + 'ozone.secret_key' = '', + 'ozone.region' = 'us-east-1', + 'ozone.use_path_style' = 'true' + ); + ``` + ```sql CREATE CATALOG paimon_hms_on_minio_catalog PROPERTIES ( diff --git a/versioned_docs/version-4.x/lakehouse/catalog-overview.md b/versioned_docs/version-4.x/lakehouse/catalog-overview.md index 2051135a0f283..0e798e72a00ee 100644 --- a/versioned_docs/version-4.x/lakehouse/catalog-overview.md +++ b/versioned_docs/version-4.x/lakehouse/catalog-overview.md @@ -83,7 +83,7 @@ Behavior after setting: This property can be used in combination with `include_database_list`. For example, first filter the required databases using `include_database_list`, then further specify the required tables using `include_table_list`. ::: -### Table Name Case Sensitivity +### Table Name Case Sensitivity {#table-name-case-sensitivity-lower_case_table_names} This feature is supported since version 4.1.0. @@ -109,7 +109,7 @@ CREATE CATALOG hive_catalog PROPERTIES ( When `lower_case_table_names` is set to `1` or `2`, if tables with names that differ only in case exist in the remote metadata (such as `MyTable` and `mytable`), conflicts may occur. Doris will detect such conflicts and report an error. 
::: -### Database Name Case Sensitivity +### Database Name Case Sensitivity {#database-name-case-sensitivity-lower_case_database_names} This feature is supported since version 4.1.0. @@ -243,7 +243,7 @@ Note 1: If a data catalog is explicitly specified in the MySQL command line or J Note 2: If the data catalog set by the user property `default_init_catalog` no longer exists, the session will automatically switch to the default `internal` data catalog. -Note 3: This feature is available starting from version 3.1.x. +Note 3: This feature is supported since version 3.1.x. ### Simple Queries @@ -324,7 +324,13 @@ REFRESH TABLE catalog_name.db_name.table_name; Doris also supports disabling metadata caching to enable real-time access to the latest metadata. -For detailed information and configuration of metadata caching, please refer to: [Metadata Cache](./meta-cache.md) +- Before Doris 4.1.x: please refer to [Metadata Cache](./meta-cache.md). +- Doris 4.1.x and later: please refer to the "Metadata Cache" section in each Catalog documentation. + - [Hive Catalog](./catalogs/hive-catalog.mdx#meta-cache) + - [Iceberg Catalog](./catalogs/iceberg-catalog.mdx#meta-cache) + - [Hudi Catalog](./catalogs/hudi-catalog.md#meta-cache) + - [Paimon Catalog](./catalogs/paimon-catalog.mdx#meta-cache) + - [MaxCompute Catalog](./catalogs/maxcompute-catalog.md#meta-cache) ## Modifying Data Catalogs diff --git a/versioned_docs/version-4.x/lakehouse/catalogs/hive-catalog.mdx b/versioned_docs/version-4.x/lakehouse/catalogs/hive-catalog.mdx index 190dd3bce7c7d..c203e44e468dd 100644 --- a/versioned_docs/version-4.x/lakehouse/catalogs/hive-catalog.mdx +++ b/versioned_docs/version-4.x/lakehouse/catalogs/hive-catalog.mdx @@ -76,6 +76,78 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( The CommonProperties section is for entering common attributes. Please see the "Common Properties" section in the [Catalog Overview](../catalog-overview.md). 
+## Metadata Cache {#meta-cache} + +To improve the performance of accessing external data sources, Apache Doris caches Hive metadata. Metadata includes table structure (Schema), partition lists, partition properties, and file lists. + +:::tip +For versions before Doris 4.1.x, metadata caching is mainly controlled globally by FE configuration items. For details, see [Metadata Cache](../meta-cache.md). +Starting from Doris 4.1.x, Hive Catalog's external metadata cache is configured using the unified `meta.cache.*` keys. +::: + +### Cache Property Configuration (4.1.x+) {#meta-cache-unified-model} + +Each engine's cache entry uses a unified configuration key format: `meta.cache.<engine>.<entry>.{enable,ttl-second,capacity}`. + +| Property | Example | Meaning | +|---|---|---| +| `enable` | `true/false` | Whether to enable this cache module. | +| `ttl-second` | `600`, `0`, `-1` | `0` means disable cache (takes effect immediately, can be used to see the latest data); `-1` means never expire; other positive integers mean TTL in seconds based on access time. | +| `capacity` | `10000` | Maximum number of cache entries (by count). `0` means disable. | + +**Effective Logic:** The module cache only takes effect when `enable=true`, `ttl-second != 0`, and `capacity > 0`. + +### Cache Modules {#meta-cache-unified-modules} + +Hive Catalog includes the following cache modules: + +| Module (`<entry>`) | Property Key Prefix | Cached Content and Impact | +|---|---|---| +| `schema` | `meta.cache.hive.schema.` | Caches table structure. Impact: Visibility of table column information. If disabled, the latest Schema is pulled for each query. | +| `partition_values` | `meta.cache.hive.partition_values.` | Caches partition values/names list. Impact: Partition pruning and enumeration. If disabled, new external partitions can be seen in real-time. | +| `partition` | `meta.cache.hive.partition.` | Caches partition properties (Location, input format, etc.). Impact: Specific metadata of partitions. 
| +| `file` | `meta.cache.hive.file.` | Caches file lists. Impact: Reduces remote LIST operation overhead. If disabled, file changes can be seen in real-time. | + +### Legacy Parameter Mapping and Conversion {#meta-cache-mapping} + +In version 4.1.x and later, unified keys are recommended. The following is the mapping between legacy Catalog properties and 4.1.x+ unified keys: + +| Legacy Property Key | 4.1.x+ Unified Key | Description | +|---|---|---| +| `schema.cache.ttl-second` | `meta.cache.hive.schema.ttl-second` | Expiration time of table structure cache | +| `partition.cache.ttl-second` | `meta.cache.hive.partition_values.ttl-second` | Expiration time of partition value cache | +| `file.meta.cache.ttl-second` | `meta.cache.hive.file.ttl-second` | Expiration time of file list cache | + +### Best Practices {#meta-cache-best-practices} + +* **Real-time access to the latest data**: If you want each query to see the latest partition or file changes in the external data source, you can set the corresponding `ttl-second` to `0`. + ```sql + -- Disable file list cache to see file changes in real-time + ALTER CATALOG hive_ctl SET PROPERTIES ("meta.cache.hive.file.ttl-second" = "0"); + -- Disable partition value cache to see new partitions in real-time + ALTER CATALOG hive_ctl SET PROPERTIES ("meta.cache.hive.partition_values.ttl-second" = "0"); + ``` +* **Performance optimization**: For scenarios where metadata changes are infrequent, it is recommended to appropriately increase `capacity` and `ttl-second` to reduce access pressure on Hive Metastore and file systems. + +:::caution +**Hive Catalog Note**: Changes to `meta.cache.hive.*` properties **do not support hot-reload**. To ensure new configurations take effect, you must recreate the catalog or restart the FE node. 
+::: + +### Observability {#meta-cache-unified-observability} + +Cache metrics can be observed through the `information_schema.catalog_meta_cache_statistics` system table: + +```sql +SELECT catalog_name, engine_name, entry_name, + effective_enabled, ttl_second, capacity, + estimated_size, hit_rate, load_failure_count, last_error +FROM information_schema.catalog_meta_cache_statistics +WHERE catalog_name = 'hive_ctl' AND engine_name = 'hive' +ORDER BY entry_name; +``` + +See the documentation for this system table: [catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md). + ### Supported Hive Versions Supports Hive 1.x, 2.x, 3.x, and 4.x. @@ -104,6 +176,7 @@ Hive transactional tables are supported from version 3.x onwards. For details, r * [AWS S3](../storages/s3.md) * [Google Cloud Storage](../storages/gcs.md) * [Azure Blob](../storages/azure-blob.md) +* [Apache Ozone](../storages/ozone.md) (supported since 4.0.4) * [Alibaba Cloud OSS](../storages/aliyun-oss.md) * [Tencent Cloud COS](../storages/tencent-cos.md) * [Huawei Cloud OBS](../storages/huawei-obs.md) @@ -359,6 +432,21 @@ Hive transactional tables are supported from version 3.x onwards. 
For details, r ); ``` + + Supported since 4.0.4 + ```sql + CREATE CATALOG `hive_hms_on_ozone_new_catalog` PROPERTIES ( + 'type' = 'hms', + 'hive.metastore.uris' = 'thrift://127.0.0.1:9383', + 'fs.ozone.support' = 'true', + 'ozone.endpoint' = 'http://ozone-s3g:9878', + 'ozone.access_key' = '', + 'ozone.secret_key' = '', + 'ozone.region' = 'us-east-1', + 'ozone.use_path_style' = 'true' + ); + ``` + ```sql CREATE CATALOG test_hive_on_hms_minio_catalog PROPERTIES ( diff --git a/versioned_docs/version-4.x/lakehouse/catalogs/hudi-catalog.md b/versioned_docs/version-4.x/lakehouse/catalogs/hudi-catalog.md index 22b8f227ac30d..b476e54d67bf3 100644 --- a/versioned_docs/version-4.x/lakehouse/catalogs/hudi-catalog.md +++ b/versioned_docs/version-4.x/lakehouse/catalogs/hudi-catalog.md @@ -51,6 +51,70 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( | ------------------------------- | -------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | | `hudi.use_hive_sync_partition` | `use_hive_sync_partition` | Whether to use the partition information already synchronized by Hive Metastore. If true, partition information will be obtained directly from Hive Metastore. Otherwise, it will be obtained from the metadata file of the file system. Obtaining information from Hive Metastore is more efficient, but users need to ensure that the latest metadata has been synchronized to Hive Metastore. | false | +## Metadata Cache {#meta-cache} + +To improve the performance of accessing external data sources, Apache Doris caches Hudi metadata. Metadata includes table structure (Schema), partition information, FS View, and Meta Client objects. + +:::tip +For versions before Doris 4.1.x, metadata caching is mainly controlled globally by FE configuration items. For details, see [Metadata Cache](../meta-cache.md). 
+Starting from Doris 4.1.x, Hudi-related external metadata cache is configured using the unified `meta.cache.*` keys. +::: + +### Cache Property Configuration (4.1.x+) {#meta-cache-unified-model} + +Each engine's cache entry uses a unified configuration key format: `meta.cache.<engine>.<entry>.{enable,ttl-second,capacity}`. + +| Property | Example | Meaning | +|---|---|---| +| `enable` | `true/false` | Whether to enable this cache module. | +| `ttl-second` | `600`, `0`, `-1` | `0` means disable cache (takes effect immediately, can be used to see the latest data); `-1` means never expire; other positive integers mean TTL in seconds based on access time. | +| `capacity` | `10000` | Maximum number of cache entries (by count). `0` means disable. | + +**Effective Logic:** The module cache only takes effect when `enable=true`, `ttl-second != 0`, and `capacity > 0`. + +### Cache Modules {#meta-cache-unified-modules} + +Hudi Catalog includes the following cache modules: + +| Module (`<entry>`) | Property Key Prefix | Cached Content and Impact | +|---|---|---| +| `schema` | `meta.cache.hudi.schema.` | Caches table structure. Impact: Visibility of table column information. If disabled, the latest Schema is pulled for each query. | +| `partition` | `meta.cache.hudi.partition.` | Caches Hudi partition-related metadata. Impact: Used for partition discovery and pruning. | +| `fs_view` | `meta.cache.hudi.fs_view.` | Caches Hudi filesystem view related metadata. | +| `meta_client` | `meta.cache.hudi.meta_client.` | Caches Hudi Meta Client objects. Impact: Reduces redundant loading of Hudi metadata. | + +### Legacy Parameter Mapping and Conversion {#meta-cache-mapping} + +In version 4.1.x and later, unified keys are recommended. 
The following is the mapping between legacy Catalog properties and 4.1.x+ unified keys: + +| Legacy Property Key | 4.1.x+ Unified Key | Description | +|---|---|---| +| `schema.cache.ttl-second` | `meta.cache.hudi.schema.ttl-second` | Expiration time of table structure cache | + +### Best Practices {#meta-cache-best-practices} + +* **Real-time access to the latest data**: If you want each query to see the latest data changes or schema changes for Hudi tables, you can set the `ttl-second` for `schema` or `partition` to `0`. + ```sql + -- Disable partition metadata cache to detect the latest partition changes in Hudi tables + ALTER CATALOG hudi_ctl SET PROPERTIES ("meta.cache.hudi.partition.ttl-second" = "0"); + ``` +* **Performance optimization**: Changes via `ALTER CATALOG ... SET PROPERTIES` support hot-reload in Hudi (via the HMS catalog property update path). + +### Observability {#meta-cache-unified-observability} + +Cache metrics can be observed through the `information_schema.catalog_meta_cache_statistics` system table: + +```sql +SELECT catalog_name, engine_name, entry_name, + effective_enabled, ttl_second, capacity, + estimated_size, hit_rate, load_failure_count, last_error +FROM information_schema.catalog_meta_cache_statistics +WHERE catalog_name = 'hudi_ctl' AND engine_name = 'hudi' +ORDER BY entry_name; +``` + +See the documentation for this system table: [catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md). + ### Supported Hudi Versions The current dependent Hudi version is 0.15. It is recommended to access Hudi data version 0.14 and above. 
diff --git a/versioned_docs/version-4.x/lakehouse/catalogs/iceberg-catalog.mdx b/versioned_docs/version-4.x/lakehouse/catalogs/iceberg-catalog.mdx index ce3418e3cc1c3..418497db93327 100644 --- a/versioned_docs/version-4.x/lakehouse/catalogs/iceberg-catalog.mdx +++ b/versioned_docs/version-4.x/lakehouse/catalogs/iceberg-catalog.mdx @@ -85,6 +85,78 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( The CommonProperties section is for entering general properties. See the [Catalog Overview](../catalog-overview.md) for details on common properties. +## Metadata Cache {#meta-cache} + +To improve the performance of accessing external data sources, Apache Doris caches Iceberg metadata. Metadata includes table structure (Schema), table objects, view objects, and manifest details. + +:::tip +For versions before Doris 4.1.x, metadata caching is mainly controlled globally by FE configuration items. For details, see [Metadata Cache](../meta-cache.md). +Starting from Doris 4.1.x, Iceberg Catalog's external metadata cache is configured using the unified `meta.cache.*` keys. +::: + +### Cache Property Configuration (4.1.x+) {#meta-cache-unified-model} + +Each engine's cache entry uses a unified configuration key format: `meta.cache.<engine>.<entry>.{enable,ttl-second,capacity}`. + +| Property | Example | Meaning | +|---|---|---| +| `enable` | `true/false` | Whether to enable this cache module. | +| `ttl-second` | `600`, `0`, `-1` | `0` means disable cache (takes effect immediately, can be used to see the latest data); `-1` means never expire; other positive integers mean TTL in seconds based on access time. | +| `capacity` | `10000` | Maximum number of cache entries (by count). `0` means disable. | + +**Effective Logic:** The module cache only takes effect when `enable=true`, `ttl-second != 0`, and `capacity > 0`. 
+ +### Cache Modules {#meta-cache-unified-modules} + +Iceberg Catalog includes the following cache modules: + +| Module (`<entry>`) | Property Key Prefix | Cached Content and Impact | +|---|---|---| +| `schema` | `meta.cache.iceberg.schema.` | Caches table structure. Impact: Visibility of table column information. If disabled, the latest Schema is pulled for each query. | +| `table` | `meta.cache.iceberg.table.` | Caches Iceberg table metadata objects. Impact: Reduces Catalog/Metastore round-trips. | +| `view` | `meta.cache.iceberg.view.` | Caches Iceberg View metadata objects. | +| `manifest` | `meta.cache.iceberg.manifest.` | Caches manifest details. Impact: Reduces repeated manifest access overhead. Note: This module is disabled by default and must be enabled manually. | + +### Legacy Parameter Mapping and Conversion {#meta-cache-mapping} + +In version 4.1.x and later, unified keys are recommended. The following is the mapping between legacy Catalog properties and 4.1.x+ unified keys: + +| Legacy Property Key | 4.1.x+ Unified Key | Description | +|---|---|---| +| `schema.cache.ttl-second` | `meta.cache.iceberg.schema.ttl-second` | Expiration time of table structure cache | + +### Best Practices {#meta-cache-best-practices} + +* **Real-time access to the latest data**: If you want each query to see the latest snapshots or schema changes for Iceberg tables, you can set the `ttl-second` for `schema` or `table` to `0`. + ```sql + -- Disable table object cache to detect snapshot changes + ALTER CATALOG iceberg_ctl SET PROPERTIES ("meta.cache.iceberg.table.ttl-second" = "0"); + ``` +* **Performance optimization**: + * Enabling manifest cache can significantly speed up query planning for large tables: + ```sql + ALTER CATALOG iceberg_ctl SET PROPERTIES ( + "meta.cache.iceberg.manifest.enable" = "true", + "meta.cache.iceberg.manifest.ttl-second" = "600" + ); + ``` + * Changes via `ALTER CATALOG ... SET PROPERTIES` support hot-reload in Iceberg Catalog. 
+ +### Observability {#meta-cache-unified-observability} + +Cache metrics can be observed through the `information_schema.catalog_meta_cache_statistics` system table: + +```sql +SELECT catalog_name, engine_name, entry_name, + effective_enabled, ttl_second, capacity, + estimated_size, hit_rate, load_failure_count, last_error +FROM information_schema.catalog_meta_cache_statistics +WHERE catalog_name = 'iceberg_ctl' AND engine_name = 'iceberg' +ORDER BY entry_name; +``` + +See the documentation for this system table: [catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md). + ### Supported Iceberg Versions | Doris Version | Iceberg SDK Version | diff --git a/versioned_docs/version-4.x/lakehouse/catalogs/maxcompute-catalog.md b/versioned_docs/version-4.x/lakehouse/catalogs/maxcompute-catalog.md index e611f96f12f0e..4cf40f7f35633 100644 --- a/versioned_docs/version-4.x/lakehouse/catalogs/maxcompute-catalog.md +++ b/versioned_docs/version-4.x/lakehouse/catalogs/maxcompute-catalog.md @@ -111,6 +111,68 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( The CommonProperties section is used to fill in common properties. Please refer to the "Common Properties" section in [Catalog Overview](../catalog-overview.md). +## Metadata Cache {#meta-cache} + +To improve the performance of accessing external data sources, Apache Doris caches MaxCompute metadata. Metadata includes table structure (Schema) and partition lists. + +:::tip +For versions before Doris 4.1.x, metadata caching is mainly controlled globally by FE configuration items. For details, see [Metadata Cache](../meta-cache.md). +Starting from Doris 4.1.x, MaxCompute Catalog's external metadata cache is configured using the unified `meta.cache.*` keys. +::: + +### Cache Property Configuration (4.1.x+) {#meta-cache-unified-model} + +Each engine's cache entry uses a unified configuration key format: `meta.cache.<engine>.<entry>.{enable,ttl-second,capacity}`. 
+ +| Property | Example | Meaning | +|---|---|---| +| `enable` | `true/false` | Whether to enable this cache module. | +| `ttl-second` | `600`, `0`, `-1` | `0` means disable cache (takes effect immediately, can be used to see the latest data); `-1` means never expire; other positive integers mean TTL in seconds based on access time. | +| `capacity` | `10000` | Maximum number of cache entries (by count). `0` means disable. | + +**Effective Logic:** The module cache only takes effect when `enable=true`, `ttl-second != 0`, and `capacity > 0`. + +### Cache Modules {#meta-cache-unified-modules} + +MaxCompute Catalog includes the following cache modules: + +| Module (`<entry>`) | Property Key Prefix | Cached Content and Impact | +|---|---|---| +| `schema` | `meta.cache.maxcompute.schema.` | Caches table structure. Impact: Visibility of table column information. If disabled, the latest Schema is pulled for each query. | +| `partition_values` | `meta.cache.maxcompute.partition_values.` | Caches partition value lists. Impact: Partition pruning and enumeration. If disabled, new external partitions can be seen in real-time. | + +### Legacy Parameter Mapping and Conversion {#meta-cache-mapping} + +In version 4.1.x and later, unified keys are recommended. The following is the mapping between legacy Catalog properties and 4.1.x+ unified keys: + +| Legacy Property Key | 4.1.x+ Unified Key | Description | +|---|---|---| +| `schema.cache.ttl-second` | `meta.cache.maxcompute.schema.ttl-second` | Expiration time of table structure cache | + +### Best Practices {#meta-cache-best-practices} + +* **Real-time access to the latest data**: If you want each query to see the latest partition or schema changes for MaxCompute tables, you can set the `ttl-second` for `schema` or `partition_values` to `0`. 
+ ```sql + -- Disable partition value cache to detect the latest partitions in MaxCompute tables + ALTER CATALOG mc_ctl SET PROPERTIES ("meta.cache.maxcompute.partition_values.ttl-second" = "0"); + ``` +* **Note**: `meta.cache.maxcompute.*` currently does not have a dedicated hot-reload hook. After changing the configuration, it is recommended to recreate the Catalog or restart FE to ensure it takes effect. + +### Observability {#meta-cache-unified-observability} + +Cache metrics can be observed through the `information_schema.catalog_meta_cache_statistics` system table: + +```sql +SELECT catalog_name, engine_name, entry_name, + effective_enabled, ttl_second, capacity, + estimated_size, hit_rate, load_failure_count, last_error +FROM information_schema.catalog_meta_cache_statistics +WHERE catalog_name = 'mc_ctl' AND engine_name = 'maxcompute' +ORDER BY entry_name; +``` + +See the documentation for this system table: [catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md). + ### Supported MaxCompute Versions Only the public cloud version of MaxCompute is supported. For private cloud version support, please contact Doris community support. diff --git a/versioned_docs/version-4.x/lakehouse/catalogs/paimon-catalog.mdx b/versioned_docs/version-4.x/lakehouse/catalogs/paimon-catalog.mdx index dca01450d13ba..087bc4dcd5c71 100644 --- a/versioned_docs/version-4.x/lakehouse/catalogs/paimon-catalog.mdx +++ b/versioned_docs/version-4.x/lakehouse/catalogs/paimon-catalog.mdx @@ -90,6 +90,68 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( The CommonProperties section is used to fill in common properties. Please refer to the [Catalog Overview](../catalog-overview.md) section on [Common Properties]. +## Metadata Cache {#meta-cache} + +To improve the performance of accessing external data sources, Apache Doris caches Paimon metadata. Metadata includes table structure (Schema) and table objects. 
+ +:::tip +For versions before Doris 4.1.x, metadata caching is mainly controlled globally by FE configuration items. For details, see [Metadata Cache](../meta-cache.md). +Starting from Doris 4.1.x, Paimon Catalog's external metadata cache is configured using the unified `meta.cache.*` keys. +::: + +### Cache Property Configuration (4.1.x+) {#meta-cache-unified-model} + +Each engine's cache entry uses a unified configuration key format: `meta.cache.<engine>.<entry>.{enable,ttl-second,capacity}`. + +| Property | Example | Meaning | +|---|---|---| +| `enable` | `true/false` | Whether to enable this cache module. | +| `ttl-second` | `600`, `0`, `-1` | `0` means disable cache (takes effect immediately, can be used to see the latest data); `-1` means never expire; other positive integers mean TTL in seconds based on access time. | +| `capacity` | `10000` | Maximum number of cache entries (by count). `0` means disable. | + +**Effective Logic:** The module cache only takes effect when `enable=true`, `ttl-second != 0`, and `capacity > 0`. + +### Cache Modules {#meta-cache-unified-modules} + +Paimon Catalog includes the following cache modules: + +| Module (`<entry>`) | Property Key Prefix | Cached Content and Impact | +|---|---|---| +| `schema` | `meta.cache.paimon.schema.` | Caches table structure. Impact: Visibility of table column information. If disabled, the latest Schema is pulled for each query. | +| `table` | `meta.cache.paimon.table.` | Caches Paimon table metadata objects. Impact: Reduces metadata loading overhead during query planning. | + +### Legacy Parameter Mapping and Conversion {#meta-cache-mapping} + +In version 4.1.x and later, unified keys are recommended. 
The following is the mapping between legacy Catalog properties and 4.1.x+ unified keys: + +| Legacy Property Key | 4.1.x+ Unified Key | Description | +|---|---|---| +| `schema.cache.ttl-second` | `meta.cache.paimon.schema.ttl-second` | Expiration time of table structure cache | + +### Best Practices {#meta-cache-best-practices} + +* **Real-time access to the latest data**: If you want each query to see the latest data changes or schema changes for Paimon tables, you can set the `ttl-second` for `schema` or `table` to `0`. + ```sql + -- Disable table object cache to detect the latest snapshots of Paimon tables + ALTER CATALOG paimon_ctl SET PROPERTIES ("meta.cache.paimon.table.ttl-second" = "0"); + ``` +* **Hot-reload**: Changes via `ALTER CATALOG ... SET PROPERTIES` support hot-reload in Paimon Catalog. + +### Observability {#meta-cache-unified-observability} + +Cache metrics can be observed through the `information_schema.catalog_meta_cache_statistics` system table: + +```sql +SELECT catalog_name, engine_name, entry_name, + effective_enabled, ttl_second, capacity, + estimated_size, hit_rate, load_failure_count, last_error +FROM information_schema.catalog_meta_cache_statistics +WHERE catalog_name = 'paimon_ctl' AND engine_name = 'paimon' +ORDER BY entry_name; +``` + +See the documentation for this system table: [catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md). + ### Supported Paimon Versions The currently dependent Paimon version is 1.0.0. @@ -118,6 +180,8 @@ The currently dependent Paimon version is 1.0.0. 
* [Google Cloud Storage](../storages/gcs.md) +* [Apache Ozone](../storages/ozone.md) (supported since 4.0.4) + * [Alibaba Cloud OSS](../storages/aliyun-oss.md) * [Tencent Cloud COS](../storages/tencent-cos.md) @@ -265,6 +329,23 @@ Supported since version 4.0.3, `timestamp_with_local_time_zone` can be mapped to ); ``` + + Supported since 4.0.4 + ```sql + CREATE CATALOG paimon_hms_on_ozone_catalog PROPERTIES ( + 'type' = 'paimon', + 'paimon.catalog.type' = 'hms', + 'warehouse' = 's3a://test-bucket/paimon-warehouse', + 'hive.metastore.uris' = 'thrift://127.0.0.1:9383', + 'fs.ozone.support' = 'true', + 'ozone.endpoint' = 'http://ozone-s3g:9878', + 'ozone.access_key' = '', + 'ozone.secret_key' = '', + 'ozone.region' = 'us-east-1', + 'ozone.use_path_style' = 'true' + ); + ``` + ```sql CREATE CATALOG paimon_hms_on_minio_catalog PROPERTIES ( From 68381a72a0862ed5e3cfe6c1fd953849776b3a85 Mon Sep 17 00:00:00 2001 From: "Mingyu Chen (Rayner)" Date: Mon, 30 Mar 2026 17:25:51 -0700 Subject: [PATCH 7/8] fix --- docs/lakehouse/catalog-overview.md | 2 +- .../current/lakehouse/catalog-overview.md | 2 +- .../version-3.x/lakehouse/catalogs/hive-catalog.mdx | 2 -- .../version-3.x/lakehouse/catalogs/paimon-catalog.mdx | 2 -- .../version-4.x/lakehouse/catalog-overview.md | 2 +- versioned_docs/version-3.x/lakehouse/catalogs/hive-catalog.mdx | 1 - .../version-3.x/lakehouse/catalogs/paimon-catalog.mdx | 2 -- versioned_docs/version-4.x/lakehouse/catalog-overview.md | 2 +- 8 files changed, 4 insertions(+), 11 deletions(-) diff --git a/docs/lakehouse/catalog-overview.md b/docs/lakehouse/catalog-overview.md index 0e798e72a00ee..9a89080ed3297 100644 --- a/docs/lakehouse/catalog-overview.md +++ b/docs/lakehouse/catalog-overview.md @@ -326,7 +326,7 @@ Doris also supports disabling metadata caching to enable real-time access to the - Before Doris 4.1.x: please refer to [Metadata Cache](./meta-cache.md). 
- Doris 4.1.x and later: please refer to the "Metadata Cache" section in each Catalog documentation. - - [Hive Catalog](./catalogs/hive-catalog.md#meta-cache) + - [Hive Catalog](./catalogs/hive-catalog.mdx#meta-cache) - [Iceberg Catalog](./catalogs/iceberg-catalog.mdx#meta-cache) - [Hudi Catalog](./catalogs/hudi-catalog.md#meta-cache) - [Paimon Catalog](./catalogs/paimon-catalog.mdx#meta-cache) diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalog-overview.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalog-overview.md index 713fda0620664..b82a4c2d5dfa7 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalog-overview.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalog-overview.md @@ -326,7 +326,7 @@ Doris 也支持关闭元数据缓存,以便能够实时访问到最新的元 - Doris 4.1.x 之前:请参阅[元数据缓存](./meta-cache.md)。 - Doris 4.1.x 及之后:请参阅各 Catalog 文档中的“元数据缓存”章节。 - - [Hive Catalog](./catalogs/hive-catalog.md#meta-cache) + - [Hive Catalog](./catalogs/hive-catalog.mdx#meta-cache) - [Iceberg Catalog](./catalogs/iceberg-catalog.mdx#meta-cache) - [Hudi Catalog](./catalogs/hudi-catalog.md#meta-cache) - [Paimon Catalog](./catalogs/paimon-catalog.mdx#meta-cache) diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.x/lakehouse/catalogs/hive-catalog.mdx b/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.x/lakehouse/catalogs/hive-catalog.mdx index c1f7fd06362c9..068326fc83073 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.x/lakehouse/catalogs/hive-catalog.mdx +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.x/lakehouse/catalogs/hive-catalog.mdx @@ -188,8 +188,6 @@ ORDER BY entry_name; * [Azure Blob](../storages/azure-blob.md) -* [Apache Ozone](../storages/ozone.md)(自 4.0.4 起支持) - * [阿里云 OSS](../storages/aliyun-oss.md) * [腾讯云 COS](../storages/tencent-cos.md) diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.x/lakehouse/catalogs/paimon-catalog.mdx 
b/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.x/lakehouse/catalogs/paimon-catalog.mdx index 227223021b55f..f160ca3a59c45 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.x/lakehouse/catalogs/paimon-catalog.mdx +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.x/lakehouse/catalogs/paimon-catalog.mdx @@ -183,8 +183,6 @@ ORDER BY entry_name; * [Google Cloud Storage](../storages/gcs.md) -* [Apache Ozone](../storages/ozone.md)(自 4.0.4 起支持) - * [阿里云 OSS](../storages/aliyun-oss.md) * [腾讯云 COS](../storages/tencent-cos.md) diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/lakehouse/catalog-overview.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/lakehouse/catalog-overview.md index 713fda0620664..b82a4c2d5dfa7 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/lakehouse/catalog-overview.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/lakehouse/catalog-overview.md @@ -326,7 +326,7 @@ Doris 也支持关闭元数据缓存,以便能够实时访问到最新的元 - Doris 4.1.x 之前:请参阅[元数据缓存](./meta-cache.md)。 - Doris 4.1.x 及之后:请参阅各 Catalog 文档中的“元数据缓存”章节。 - - [Hive Catalog](./catalogs/hive-catalog.md#meta-cache) + - [Hive Catalog](./catalogs/hive-catalog.mdx#meta-cache) - [Iceberg Catalog](./catalogs/iceberg-catalog.mdx#meta-cache) - [Hudi Catalog](./catalogs/hudi-catalog.md#meta-cache) - [Paimon Catalog](./catalogs/paimon-catalog.mdx#meta-cache) diff --git a/versioned_docs/version-3.x/lakehouse/catalogs/hive-catalog.mdx b/versioned_docs/version-3.x/lakehouse/catalogs/hive-catalog.mdx index c203e44e468dd..7ee283ff848b1 100644 --- a/versioned_docs/version-3.x/lakehouse/catalogs/hive-catalog.mdx +++ b/versioned_docs/version-3.x/lakehouse/catalogs/hive-catalog.mdx @@ -176,7 +176,6 @@ Hive transactional tables are supported from version 3.x onwards. 
For details, r * [AWS S3](../storages/s3.md) * [Google Cloud Storage](../storages/gcs.md) * [Azure Blob](../storages/azure-blob.md) -* [Apache Ozone](../storages/ozone.md) (supported since 4.0.4) * [Alibaba Cloud OSS](../storages/aliyun-oss.md) * [Tencent Cloud COS](../storages/tencent-cos.md) * [Huawei Cloud OBS](../storages/huawei-obs.md) diff --git a/versioned_docs/version-3.x/lakehouse/catalogs/paimon-catalog.mdx b/versioned_docs/version-3.x/lakehouse/catalogs/paimon-catalog.mdx index 087bc4dcd5c71..b080aaee1eb20 100644 --- a/versioned_docs/version-3.x/lakehouse/catalogs/paimon-catalog.mdx +++ b/versioned_docs/version-3.x/lakehouse/catalogs/paimon-catalog.mdx @@ -180,8 +180,6 @@ The currently dependent Paimon version is 1.0.0. * [Google Cloud Storage](../storages/gcs.md) -* [Apache Ozone](../storages/ozone.md) (supported since 4.0.4) - * [Alibaba Cloud OSS](../storages/aliyun-oss.md) * [Tencent Cloud COS](../storages/tencent-cos.md) diff --git a/versioned_docs/version-4.x/lakehouse/catalog-overview.md b/versioned_docs/version-4.x/lakehouse/catalog-overview.md index 0e798e72a00ee..9a89080ed3297 100644 --- a/versioned_docs/version-4.x/lakehouse/catalog-overview.md +++ b/versioned_docs/version-4.x/lakehouse/catalog-overview.md @@ -326,7 +326,7 @@ Doris also supports disabling metadata caching to enable real-time access to the - Before Doris 4.1.x: please refer to [Metadata Cache](./meta-cache.md). - Doris 4.1.x and later: please refer to the "Metadata Cache" section in each Catalog documentation. 
- - [Hive Catalog](./catalogs/hive-catalog.md#meta-cache) + - [Hive Catalog](./catalogs/hive-catalog.mdx#meta-cache) - [Iceberg Catalog](./catalogs/iceberg-catalog.mdx#meta-cache) - [Hudi Catalog](./catalogs/hudi-catalog.md#meta-cache) - [Paimon Catalog](./catalogs/paimon-catalog.mdx#meta-cache) From ea5a5cbd3e2c0af62c724266e64261eeca196847 Mon Sep 17 00:00:00 2001 From: "Mingyu Chen (Rayner)" Date: Mon, 30 Mar 2026 17:29:24 -0700 Subject: [PATCH 8/8] fix --- sidebars.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/sidebars.ts b/sidebars.ts index 9517e108c0942..48c1d55f75218 100644 --- a/sidebars.ts +++ b/sidebars.ts @@ -455,7 +455,6 @@ const sidebars: SidebarsConfig = { }, 'lakehouse/data-cache', 'lakehouse/meta-cache', - 'lakehouse/meta-cache/unified-meta-cache', 'lakehouse/compute-node', 'lakehouse/statistics', {