From 371a925c812ac1f218f8f9118d905891380de2a2 Mon Sep 17 00:00:00 2001 From: natoverse Date: Thu, 22 Jan 2026 16:29:16 -0800 Subject: [PATCH 1/5] Fix output property name --- packages/graphrag/graphrag/cli/query.py | 2 +- .../graphrag/graphrag/config/models/graph_rag_config.py | 9 ++++----- packages/graphrag/graphrag/index/run/run_pipeline.py | 2 +- packages/graphrag/graphrag/index/run/utils.py | 2 +- 4 files changed, 7 insertions(+), 8 deletions(-) diff --git a/packages/graphrag/graphrag/cli/query.py b/packages/graphrag/graphrag/cli/query.py index 6ce049ddcc..7b2dbb50aa 100644 --- a/packages/graphrag/graphrag/cli/query.py +++ b/packages/graphrag/graphrag/cli/query.py @@ -377,7 +377,7 @@ def _resolve_output_files( ) -> dict[str, Any]: """Read indexing output files to a dataframe dict.""" dataframe_dict = {} - storage_obj = create_storage(config.output) + storage_obj = create_storage(config.output_storage) for name in output_list: df_value = asyncio.run(load_table_from_storage(name=name, storage=storage_obj)) dataframe_dict[name] = df_value diff --git a/packages/graphrag/graphrag/config/models/graph_rag_config.py b/packages/graphrag/graphrag/config/models/graph_rag_config.py index 07a7eba074..f7dac388b1 100644 --- a/packages/graphrag/graphrag/config/models/graph_rag_config.py +++ b/packages/graphrag/graphrag/config/models/graph_rag_config.py @@ -102,7 +102,7 @@ def _validate_input_base_dir(self) -> None: ) """The chunking configuration to use.""" - output: StorageConfig = Field( + output_storage: StorageConfig = Field( description="The output configuration.", default=StorageConfig( base_dir=graphrag_config_defaults.output_storage.base_dir, @@ -112,12 +112,11 @@ def _validate_input_base_dir(self) -> None: def _validate_output_base_dir(self) -> None: """Validate the output base directory.""" - if self.output.type == StorageType.File: - if not self.output.base_dir: + if self.output_storage.type == StorageType.File: + if not self.output_storage.base_dir: msg = "output 
base directory is required for file output. Please rerun `graphrag init` and set the output configuration." raise ValueError(msg) - self.output.base_dir = str(Path(self.output.base_dir).resolve()) - + self.output_storage.base_dir = str(Path(self.output_storage.base_dir).resolve()) update_output_storage: StorageConfig = Field( description="The output configuration for the updated index.", default=StorageConfig( diff --git a/packages/graphrag/graphrag/index/run/run_pipeline.py b/packages/graphrag/graphrag/index/run/run_pipeline.py index d552acecac..a4ce17582c 100644 --- a/packages/graphrag/graphrag/index/run/run_pipeline.py +++ b/packages/graphrag/graphrag/index/run/run_pipeline.py @@ -36,7 +36,7 @@ async def run_pipeline( ) -> AsyncIterable[PipelineRunResult]: """Run all workflows using a simplified pipeline.""" input_storage = create_storage(config.input_storage) - output_storage = create_storage(config.output) + output_storage = create_storage(config.output_storage) cache = create_cache(config.cache) # load existing state in case any workflows are stateful diff --git a/packages/graphrag/graphrag/index/run/utils.py b/packages/graphrag/graphrag/index/run/utils.py index 040b77a45c..be6914a6d6 100644 --- a/packages/graphrag/graphrag/index/run/utils.py +++ b/packages/graphrag/graphrag/index/run/utils.py @@ -52,7 +52,7 @@ def get_update_storages( config: GraphRagConfig, timestamp: str ) -> tuple[Storage, Storage, Storage]: """Get storage objects for the update index run.""" - output_storage = create_storage(config.output) + output_storage = create_storage(config.output_storage) update_storage = create_storage(config.update_output_storage) timestamped_storage = update_storage.child(timestamp) delta_storage = timestamped_storage.child("delta") From 4a800df03e707f12854ff883808a6265ffdc8cd0 Mon Sep 17 00:00:00 2001 From: natoverse Date: Thu, 22 Jan 2026 16:30:03 -0800 Subject: [PATCH 2/5] Remove retry defaults from init --- packages/graphrag/graphrag/config/init_content.py 
| 6 ------ 1 file changed, 6 deletions(-) diff --git a/packages/graphrag/graphrag/config/init_content.py b/packages/graphrag/graphrag/config/init_content.py index fd0f5aa70e..9973d1920f 100644 --- a/packages/graphrag/graphrag/config/init_content.py +++ b/packages/graphrag/graphrag/config/init_content.py @@ -24,9 +24,6 @@ api_key: ${{GRAPHRAG_API_KEY}} # set this in the generated .env file, or remove if managed identity retry: type: exponential_backoff - base_delay: 2.0 - max_retries: 7 - jitter: true embedding_models: {defs.DEFAULT_EMBEDDING_MODEL_ID}: @@ -36,9 +33,6 @@ api_key: ${{GRAPHRAG_API_KEY}} retry: type: exponential_backoff - base_delay: 2.0 - max_retries: 7 - jitter: true ### Document processing settings ### From b04ea2c17853667dc6f5c58274aca40e9bf646ab Mon Sep 17 00:00:00 2001 From: natoverse Date: Thu, 22 Jan 2026 16:39:40 -0800 Subject: [PATCH 3/5] Fix model deployment validation --- .../graphrag_llm/config/model_config.py | 15 ++++++++------- .../graphrag/config/models/graph_rag_config.py | 5 ++++- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/packages/graphrag-llm/graphrag_llm/config/model_config.py b/packages/graphrag-llm/graphrag_llm/config/model_config.py index 1b70f61508..ef4d4bfb30 100644 --- a/packages/graphrag-llm/graphrag_llm/config/model_config.py +++ b/packages/graphrag-llm/graphrag_llm/config/model_config.py @@ -3,6 +3,7 @@ """Language model configuration.""" +import logging from typing import Any from pydantic import BaseModel, ConfigDict, Field, model_validator @@ -12,6 +13,8 @@ from graphrag_llm.config.retry_config import RetryConfig from graphrag_llm.config.types import AuthMethod, LLMProviderType +logger = logging.getLogger(__name__) + class ModelConfig(BaseModel): """Configuration for a language model.""" @@ -84,15 +87,13 @@ class ModelConfig(BaseModel): def _validate_lite_llm_config(self) -> None: """Validate LiteLLM specific configuration.""" - if self.model_provider == "azure" and ( - not 
self.azure_deployment_name or not self.api_base - ): - msg = "azure_deployment_name and api_base must be specified with the 'azure' model provider." + if self.model_provider == "azure" and not self.api_base: + msg = "api_base must be specified with the 'azure' model provider." raise ValueError(msg) - if self.model_provider != "azure" and self.azure_deployment_name: - msg = "azure_deployment_name should not be specified for non-Azure model providers." - raise ValueError(msg) + if self.model_provider == "azure" and not self.azure_deployment_name: + msg = "azure_deployment_name is not specified and will default to model name. If your deployment name differs (uncommon), API calls will fail." + logger.warning(msg) if self.auth_method == AuthMethod.AzureManagedIdentity: if self.api_key is not None: diff --git a/packages/graphrag/graphrag/config/models/graph_rag_config.py b/packages/graphrag/graphrag/config/models/graph_rag_config.py index f7dac388b1..84fb2de884 100644 --- a/packages/graphrag/graphrag/config/models/graph_rag_config.py +++ b/packages/graphrag/graphrag/config/models/graph_rag_config.py @@ -116,7 +116,10 @@ def _validate_output_base_dir(self) -> None: if not self.output_storage.base_dir: msg = "output base directory is required for file output. Please rerun `graphrag init` and set the output configuration." 
raise ValueError(msg) - self.output_storage.base_dir = str(Path(self.output_storage.base_dir).resolve()) + self.output_storage.base_dir = str( + Path(self.output_storage.base_dir).resolve() + ) + update_output_storage: StorageConfig = Field( description="The output configuration for the updated index.", default=StorageConfig( From 2f9dda034bcf4691d3da0f5f3a56cc112cce4024 Mon Sep 17 00:00:00 2001 From: natoverse Date: Thu, 22 Jan 2026 16:54:52 -0800 Subject: [PATCH 4/5] Update migration notebook --- .../index_migration_to_v3.ipynb | 105 +++++++----------- 1 file changed, 38 insertions(+), 67 deletions(-) diff --git a/docs/examples_notebooks/index_migration_to_v3.ipynb b/docs/examples_notebooks/index_migration_to_v3.ipynb index 2b76133e80..a0e50be432 100644 --- a/docs/examples_notebooks/index_migration_to_v3.ipynb +++ b/docs/examples_notebooks/index_migration_to_v3.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -18,45 +18,40 @@ "\n", "This notebook is used to maintain data model parity with older indexes for version 3.0 of GraphRAG. If you have a pre-3.0 index and need to migrate without re-running the entire pipeline, you can use this notebook to only update the pieces necessary for alignment. If you have a pre-2.0 index, please run the v2 migration notebook first!\n", "\n", - "NOTE: we recommend regenerating your settings.yml with the latest version of GraphRAG using `graphrag init`. Copy your LLM settings into it before running this notebook. This ensures your config is aligned with the latest version for the migration.\n", - "\n", - "This notebook will also update your settings.yaml to ensure compatibility with our newer vector store collection naming scheme in order to avoid re-ingesting.\n", + "NOTE: we recommend regenerating your settings.yaml with the latest version of GraphRAG using `graphrag init`. 
Copy your LLM settings into it before running this notebook. This ensures your config is aligned with the latest version for the migration. The config changes from v2 to v3 are significant in places!\n", "\n", "WARNING: This will overwrite your parquet files, you may want to make a backup!" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# This is the directory that has your settings.yaml\n", - "PROJECT_DIRECTORY = \"/Users/naevans/graphrag/working/migration\"" + "PROJECT_DIRECTORY = \"\"" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from pathlib import Path\n", "\n", - "from graphrag.config.load_config import load_config\n", - "from graphrag.storage.factory import StorageFactory\n", + "from graphrag.config.models.graph_rag_config import GraphRagConfig\n", + "from graphrag_common.config import load_config\n", + "from graphrag_storage.storage_factory import create_storage\n", "\n", - "config = load_config(Path(PROJECT_DIRECTORY))\n", - "storage_config = config.output.model_dump()\n", - "storage = StorageFactory().create_storage(\n", - " storage_type=storage_config[\"type\"],\n", - " kwargs=storage_config,\n", - ")" + "config = load_config(GraphRagConfig, config_path=Path(PROJECT_DIRECTORY))\n", + "storage = create_storage(config.output_storage)" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -67,7 +62,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -89,71 +84,47 @@ "metadata": {}, "source": [ "## Update settings.yaml\n", - "This next section will attempt to insert index names for each vector index using our new schema structure. It depends on most things being default. 
If you have already customized your vector store schema it may not be necessary.\n", - "\n", - "The primary goal is to align v2 indexes using our old default naming schema with the new customizability. If don't need this done or you have a more complicated config, comment it out and update your config manually to ensure each index name is set.\n", + "If you have left the default settings for your vector store schema, you may need to set explicit values that map each embedding type to a vector schema name. If you have already customized your vector store schema it may not be necessary.\n", "\n", "Old default index names:\n", "- default-text_unit-text\n", "- default-entity-description\n", "- default-community-full_content\n", "\n", + "(if you left all of the defaults, check your output/lancedb folder to confirm the above)\n", + "\n", "v3 versions are:\n", "- text_unit_text\n", "- entity_description\n", "- community_full_content\n", "\n", - "Therefore, with a v2 index we will explicitly set the old index names so it connects correctly.\n", - "\n", - "NOTE: we are also setting the default vector_size for each index, under the assumption that you are using a prior default with 1536 dimensions. Our new default of text-embedding-3-large has 3072 dimensions, which will be populated as the default if unset. 
Again, if you have a more complicated situation you may want to manually configure this.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import yaml\n", - "\n", - "EMBEDDING_DIMENSIONS = 1536\n", - "\n", - "settings = Path(PROJECT_DIRECTORY) / \"settings.yaml\"\n", - "with Path.open(settings) as f:\n", - " conf = yaml.safe_load(f)\n", - "\n", - "vector_store = conf.get(\"vector_store\", {})\n", - "container_name = vector_store.get(\"container_name\", \"default\")\n", - "embeddings_schema = vector_store.get(\"embeddings_schema\", {})\n", - "text_unit_schema = embeddings_schema.get(\"text_unit.text\", {})\n", - "if \"index_name\" not in text_unit_schema:\n", - " text_unit_schema[\"index_name\"] = f\"{container_name}-text_unit-text\"\n", - "if \"vector_size\" not in text_unit_schema:\n", - " text_unit_schema[\"vector_size\"] = EMBEDDING_DIMENSIONS\n", - "embeddings_schema[\"text_unit.text\"] = text_unit_schema\n", - "entity_schema = embeddings_schema.get(\"entity.description\", {})\n", - "if \"index_name\" not in entity_schema:\n", - " entity_schema[\"index_name\"] = f\"{container_name}-entity-description\"\n", - "if \"vector_size\" not in entity_schema:\n", - " entity_schema[\"vector_size\"] = EMBEDDING_DIMENSIONS\n", - "embeddings_schema[\"entity.description\"] = entity_schema\n", - "community_schema = embeddings_schema.get(\"community.full_content\", {})\n", - "if \"index_name\" not in community_schema:\n", - " community_schema[\"index_name\"] = f\"{container_name}-community-full_content\"\n", - "if \"vector_size\" not in community_schema:\n", - " community_schema[\"vector_size\"] = EMBEDDING_DIMENSIONS\n", - "embeddings_schema[\"community.full_content\"] = community_schema\n", - "vector_store[\"embeddings_schema\"] = embeddings_schema\n", - "conf[\"vector_store\"] = vector_store\n", - "\n", - "with Path.open(settings, \"w\") as f:\n", - " yaml.safe_dump(conf, f)" + "Therefore, with a v2 index 
you need to explicitly set the old index names so it connects correctly. We no longer support the \"prefix\" - you can just set an explicit index_name for each embedding.\n", + "\n", + "NOTE: we are also setting the default vector_size for each index below, under the assumption that you are using a prior default with 1536 dimensions. Our new default of text-embedding-3-large has 3072 dimensions, which will be populated as the default if unset. Again, if you have a more complicated situation you may want to manually configure this.\n", + "\n", + "Here is an example of the new vector store config block that you may need in your settings.yaml:\n", + "\n", + "```yaml\n", + "vector_store:\n", + " type: lancedb\n", + " db_uri: output/lancedb\n", + " index_schema:\n", + " text_unit_text:\n", + " index_name: default-text_unit-text\n", + " vector_size: 1536\n", + " entity_description:\n", + " index_name: default-entity-description\n", + " vector_size: 1536\n", + " community_full_content:\n", + " index_name: default-community-full_content\n", + " vector_size: 1536\n", + "```\n" ] } ], "metadata": { "kernelspec": { - "display_name": "graphrag", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -167,7 +138,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.10" + "version": "3.12.3" } }, "nbformat": 4, From 8e7f9f8c4519c82edaff1f7b883a569a109662cd Mon Sep 17 00:00:00 2001 From: natoverse Date: Thu, 22 Jan 2026 20:15:52 -0800 Subject: [PATCH 5/5] Fix tests and formatting --- docs/examples_notebooks/api_overview.ipynb | 5 +++-- docs/examples_notebooks/input_documents.ipynb | 5 +++-- packages/graphrag/graphrag/cli/query.py | 8 ++++---- tests/unit/config/test_config.py | 4 ++-- tests/unit/config/test_model_config.py | 15 ++------------- tests/unit/config/utils.py | 2 +- unified-search-app/app/app_logic.py | 3 ++- 7 files changed, 17 insertions(+), 25 deletions(-) diff --git 
a/docs/examples_notebooks/api_overview.ipynb b/docs/examples_notebooks/api_overview.ipynb index 2a0c0f15de..abcd7832fc 100644 --- a/docs/examples_notebooks/api_overview.ipynb +++ b/docs/examples_notebooks/api_overview.ipynb @@ -28,10 +28,11 @@ "from pathlib import Path\n", "from pprint import pprint\n", "\n", - "import graphrag.api as api\n", "import pandas as pd\n", "from graphrag.config.load_config import load_config\n", - "from graphrag.index.typing.pipeline_run_result import PipelineRunResult" + "from graphrag.index.typing.pipeline_run_result import PipelineRunResult\n", + "\n", + "import graphrag.api as api" ] }, { diff --git a/docs/examples_notebooks/input_documents.ipynb b/docs/examples_notebooks/input_documents.ipynb index 5657770eaf..505c0fe1f3 100644 --- a/docs/examples_notebooks/input_documents.ipynb +++ b/docs/examples_notebooks/input_documents.ipynb @@ -30,10 +30,11 @@ "from pathlib import Path\n", "from pprint import pprint\n", "\n", - "import graphrag.api as api\n", "import pandas as pd\n", "from graphrag.config.load_config import load_config\n", - "from graphrag.index.typing.pipeline_run_result import PipelineRunResult" + "from graphrag.index.typing.pipeline_run_result import PipelineRunResult\n", + "\n", + "import graphrag.api as api" ] }, { diff --git a/packages/graphrag/graphrag/cli/query.py b/packages/graphrag/graphrag/cli/query.py index 7b2dbb50aa..ae06a88c95 100644 --- a/packages/graphrag/graphrag/cli/query.py +++ b/packages/graphrag/graphrag/cli/query.py @@ -38,7 +38,7 @@ def run_global_search( """ cli_overrides: dict[str, Any] = {} if data_dir: - cli_overrides["output"] = {"base_dir": str(data_dir)} + cli_overrides["output_storage"] = {"base_dir": str(data_dir)} config = load_config( root_dir=root_dir, cli_overrides=cli_overrides, @@ -124,7 +124,7 @@ def run_local_search( """ cli_overrides: dict[str, Any] = {} if data_dir: - cli_overrides["output"] = {"base_dir": str(data_dir)} + cli_overrides["output_storage"] = {"base_dir": str(data_dir)} 
config = load_config( root_dir=root_dir, cli_overrides=cli_overrides, @@ -221,7 +221,7 @@ def run_drift_search( """ cli_overrides: dict[str, Any] = {} if data_dir: - cli_overrides["output"] = {"base_dir": str(data_dir)} + cli_overrides["output_storage"] = {"base_dir": str(data_dir)} config = load_config( root_dir=root_dir, cli_overrides=cli_overrides, @@ -312,7 +312,7 @@ def run_basic_search( """ cli_overrides: dict[str, Any] = {} if data_dir: - cli_overrides["output"] = {"base_dir": str(data_dir)} + cli_overrides["output_storage"] = {"base_dir": str(data_dir)} config = load_config( root_dir=root_dir, cli_overrides=cli_overrides, diff --git a/tests/unit/config/test_config.py b/tests/unit/config/test_config.py index 67962673e4..75a472f023 100644 --- a/tests/unit/config/test_config.py +++ b/tests/unit/config/test_config.py @@ -49,11 +49,11 @@ def test_load_config_with_cli_overrides() -> None: output_dir = "some_output_dir" expected_output_base_dir = root_dir / output_dir expected = get_default_graphrag_config() - expected.output.base_dir = str(expected_output_base_dir) + expected.output_storage.base_dir = str(expected_output_base_dir) actual = load_config( root_dir=root_dir, - cli_overrides={"output": {"base_dir": output_dir}}, + cli_overrides={"output_storage": {"base_dir": output_dir}}, ) assert_graphrag_configs(actual, expected) # Need to reset cwd after test diff --git a/tests/unit/config/test_model_config.py b/tests/unit/config/test_model_config.py index 67de71bf7f..0419cc14bc 100644 --- a/tests/unit/config/test_model_config.py +++ b/tests/unit/config/test_model_config.py @@ -37,18 +37,7 @@ def test_litellm_provider_validation() -> None: with pytest.raises( ValueError, - match="azure_deployment_name should not be specified for non-Azure model providers\\.", - ): - _ = ModelConfig( - type=LLMProviderType.LiteLLM, - model_provider="openai", - model="gpt-4o", - azure_deployment_name="some-deployment", - ) - - with pytest.raises( - ValueError, - 
match="azure_deployment_name and api_base must be specified with the 'azure' model provider\\.", + match="api_base must be specified with the 'azure' model provider\\.", ): _ = ModelConfig( type=LLMProviderType.LiteLLM, @@ -58,7 +47,7 @@ def test_litellm_provider_validation() -> None: with pytest.raises( ValueError, - match="azure_deployment_name and api_base must be specified with the 'azure' model provider\\.", + match="api_base must be specified with the 'azure' model provider\\.", ): _ = ModelConfig( type=LLMProviderType.LiteLLM, diff --git a/tests/unit/config/utils.py b/tests/unit/config/utils.py index 911c2b62dd..ae898d4568 100644 --- a/tests/unit/config/utils.py +++ b/tests/unit/config/utils.py @@ -351,7 +351,7 @@ def assert_graphrag_configs(actual: GraphRagConfig, expected: GraphRagConfig) -> assert_vector_store_configs(actual.vector_store, expected.vector_store) assert_reporting_configs(actual.reporting, expected.reporting) - assert_storage_config(actual.output, expected.output) + assert_storage_config(actual.output_storage, expected.output_storage) assert_storage_config(actual.input_storage, expected.input_storage) assert_storage_config(actual.update_output_storage, expected.update_output_storage) diff --git a/unified-search-app/app/app_logic.py b/unified-search-app/app/app_logic.py index dc64e0e77c..a573b9daa5 100644 --- a/unified-search-app/app/app_logic.py +++ b/unified-search-app/app/app_logic.py @@ -7,7 +7,6 @@ import logging from typing import TYPE_CHECKING -import graphrag.api as api import streamlit as st from knowledge_loader.data_sources.loader import ( create_datasource, @@ -18,6 +17,8 @@ from state.session_variables import SessionVariables from ui.search import display_search_result +import graphrag.api as api + if TYPE_CHECKING: import pandas as pd