diff --git a/README.md b/README.md index 685e37a47..7d82ab988 100644 --- a/README.md +++ b/README.md @@ -471,7 +471,13 @@ To list the options for zvec, execute vectordbbench zvec --help ### Run Doris from command line -Doris supports ann index with type hnsw from version 4.0.x +Doris supports ANN indexes from version 4.0.x. VectorDBBench passes Doris index properties through with `--index-prop key=value`, so newly added Doris index properties normally do not require a VectorDBBench code change. + +By default, VectorDBBench creates an HNSW index. Use `--index-prop index_type=` to select another Doris ANN index type, and pass index-specific properties with additional `--index-prop` options. For example, IVF and IVF on disk indexes require `nlist`: + +```shell +NUM_PER_BATCH=1000000 vectordbbench doris --http-port=8030 --port=9030 --db-name=vector_test --case-type=Performance768D1M --stream-load-rows-per-batch=500000 --index-prop index_type=ivf_on_disk --index-prop nlist=1024 +``` ```shell NUM_PER_BATCH=1000000 vectordbbench doris --http-port=8030 --port=9030 --db-name=vector_test --case-type=Performance768D1M --stream-load-rows-per-batch=500000 @@ -482,11 +488,9 @@ Using flag `--session-var`, if you want to test doris with some customized sessi NUM_PER_BATCH=1000000 vectordbbench doris --http-port=8030 --port=9030 --db-name=vector_test --case-type=Performance768D1M --stream-load-rows-per-batch=500000 --session-var enable_profile=True ``` -Mote options: +More options: ```text ---m INTEGER hnsw m ---ef-construction INTEGER hnsw ef-construction --username TEXT Username [default: root; required] --password TEXT Password [default: ""] --host TEXT Db host [default: 127.0.0.1; required] @@ -496,9 +500,10 @@ Mote options: --ssl / --no-ssl Enable or disable SSL, for Doris Serverless SSL must be enabled [default: no-ssl] --index-prop TEXT Extra index PROPERTY as key=value - (repeatable) + (repeatable or comma-separated, for example + index_type=ivf_on_disk,nlist=1024) --session-var TEXT Session variable key=value applied to each - SQL session (repeatable) + SQL session (repeatable or comma-separated) --stream-load-rows-per-batch INTEGER Rows per single stream load request; default uses NUM_PER_BATCH diff --git a/vectordb_bench/backend/clients/doris/cli.py b/vectordb_bench/backend/clients/doris/cli.py index 8153b412d..74a532bdc 100644 --- a/vectordb_bench/backend/clients/doris/cli.py +++ b/vectordb_bench/backend/clients/doris/cli.py @@ -7,7 +7,6 @@ from ....cli.cli import ( CommonTypedDict, - HNSWBaseTypedDict, cli, click_parameter_decorators_from_typed_dict, run, @@ -42,7 +41,7 @@ def _parse_kv_list(_ctx, _param, values): # noqa: ANN001 return parsed -class DorisTypedDict(CommonTypedDict, HNSWBaseTypedDict): +class DorisTypedDict(CommonTypedDict): user_name: Annotated[ str, click.option( @@ -166,13 +165,8 @@ def Doris( ): from .config import DorisCaseConfig, DorisConfig - # Merge explicit HNSW params into index properties using Doris naming index_properties: dict[str, str] = {} index_properties.update(parameters.get("index_prop", {}) or {}) - if parameters.get("m") is not None: - index_properties.setdefault("max_degree", str(parameters["m"])) - if parameters.get("ef_construction") is not None: - index_properties.setdefault("ef_construction", str(parameters["ef_construction"])) session_vars: dict[str, str] = parameters.get("session_var", {}) or {} diff --git a/vectordb_bench/backend/clients/doris/config.py b/vectordb_bench/backend/clients/doris/config.py index 7c79ba728..ae730aefb 100644 --- a/vectordb_bench/backend/clients/doris/config.py +++ b/vectordb_bench/backend/clients/doris/config.py @@ -1,4 +1,5 @@ import logging +from typing import ClassVar from pydantic import BaseModel, SecretStr, model_validator @@ -49,6 +50,11 @@ class DorisCaseConfig(BaseModel, DBCaseConfig): # Create table without ANN index no_index: bool = False + REQUIRED_INDEX_PARAMS_BY_TYPE: ClassVar[dict[str, tuple[str, ...]]] = { + "ivf": ("nlist",), + "ivf_on_disk": ("nlist",), + } + def get_metric_fn(self) -> str: if self.metric_type == MetricType.L2: return "l2_distance_approximate" @@ -65,30 +71,43 @@ def index_param(self) -> dict: metric_type = self.get_metric_fn() if metric_type.endswith("_approximate"): metric_type = metric_type[: -len("_approximate")] - props = {"metric_type": metric_type} + props: dict[str, str] = {"metric_type": metric_type} + + # Merge user provided index_properties first; convenience fields fill missing values below. + if self.index_properties: + props.update({str(k): str(v) for k, v in self.index_properties.items()}) if self.index_type is not None: - props.setdefault("index_type", self.index_type) + props["index_type"] = self.index_type else: props.setdefault("index_type", "hnsw") - # Merge optional HNSW params - props["index_type"] = str.lower(props["index_type"]) - if props["index_type"] == "hnsw": + index_type = str(props["index_type"]).strip().lower() + if not index_type: + index_type = "hnsw" + props["index_type"] = index_type + + # Map convenience fields when index type is known. + if index_type == "hnsw": if self.m is not None: props.setdefault("max_degree", str(self.m)) if self.ef_construction is not None: props.setdefault("ef_construction", str(self.ef_construction)) - elif props["index_type"] == "ivf": + + if index_type in {"ivf", "ivf_on_disk"}: if self.nlist is not None: props.setdefault("nlist", str(self.nlist)) - else: - msg = f"Unsupported index type: {props['index_type']}" - raise ValueError(msg) - # Merge user provided index_properties - if self.index_properties: - props.update(self.index_properties) + # Validate only known required params; unknown index types are passed through. + required_params = self.REQUIRED_INDEX_PARAMS_BY_TYPE.get(index_type, ()) + for param in required_params: + value = props.get(param) + if value is None or not str(value).strip(): + msg = f"{param} of ann index must be specified for {index_type} type" + raise ValueError(msg) + + if index_type not in {"hnsw", "ivf", "ivf_on_disk"}: + log.info("Passing through unknown Doris index_type without local validation: %s", index_type) return props def search_param(self) -> dict: diff --git a/vectordb_bench/backend/clients/doris/doris.py b/vectordb_bench/backend/clients/doris/doris.py index 01984d665..9837655d1 100644 --- a/vectordb_bench/backend/clients/doris/doris.py +++ b/vectordb_bench/backend/clients/doris/doris.py @@ -182,6 +182,34 @@ def _build_index_options(self) -> IndexOptions | None: else: not_applied[key] = value + # SDK's to_ann_properties does not auto-emit nlist for ivf_on_disk. + # Ensure nlist is forwarded through ANN passthrough properties. + ann_props_to_apply: dict[str, str] = {str(k): str(v) for k, v in not_applied.items()} + if str(index_param.get("index_type", "")).lower() == "ivf_on_disk" and "nlist" in index_param: + ann_props_to_apply.setdefault("nlist", str(index_param["nlist"])) + + if ann_props_to_apply: + applied_ann_key = None + for ann_key in ("ann_properties", "properties"): + if not hasattr(index_options, ann_key): + continue + try: + existing = getattr(index_options, ann_key, None) + merged_props = dict(existing) if isinstance(existing, dict) else {} + merged_props.update(ann_props_to_apply) + setattr(index_options, ann_key, merged_props) + applied[ann_key] = merged_props + applied_ann_key = ann_key + not_applied = {} + break + except Exception: + log.debug("Failed to set index_options.%s", ann_key, exc_info=True) + if applied_ann_key is None: + log.warning( + "Unable to attach ANN passthrough properties on IndexOptions: %s", + ann_props_to_apply, + ) + log.info( "Index options prepared: applied_props=%s not_applied_props=%s", applied, @@ -212,17 +240,20 @@ def _create_table_with_options(self, sample_data: pd.DataFrame, index_options: I self.table.index_options.metric_type = "inner_product" else: self.table.index_options.metric_type = "l2_distance" - if ( - index_options - and hasattr(index_options, "properties") - and isinstance(index_options.properties, dict) - ): - for key, value in index_options.properties.items(): - if hasattr(self.table.index_options, key): - try: - setattr(self.table.index_options, key, value) - except Exception: - log.debug("Skip setting index_options.%s at runtime", key) + if index_options: + runtime_props = {} + if hasattr(index_options, "ann_properties") and isinstance(index_options.ann_properties, dict): + runtime_props.update(index_options.ann_properties) + if hasattr(index_options, "properties") and isinstance(index_options.properties, dict): + runtime_props.update(index_options.properties) + + if runtime_props: + for key, value in runtime_props.items(): + if hasattr(self.table.index_options, key): + try: + setattr(self.table.index_options, key, value) + except Exception: + log.debug("Skip setting index_options.%s at runtime", key) except Exception: log.exception("Failed to adjust index options for table: %s", self.table_name)