From 4475a94ef01c1f5ac0f7d6fb437a1441f1b102bf Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Tue, 30 Dec 2025 19:00:44 +0200 Subject: [PATCH 01/30] Add block_simultaneous_read to DefaultStream --- .../concurrent_read_processor.py | 161 +++++- .../concurrent_source/concurrent_source.py | 4 +- .../declarative_component_schema.yaml | 11 + .../models/declarative_component_schema.py | 5 + .../parsers/model_to_component_factory.py | 1 + .../streams/concurrent/abstract_stream.py | 11 + .../streams/concurrent/default_stream.py | 7 + .../test_model_to_component_factory.py | 138 ++++++ .../test_concurrent_read_processor.py | 469 ++++++++++++++++++ 9 files changed, 792 insertions(+), 15 deletions(-) diff --git a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py index 8839018ca..6aaa368b4 100644 --- a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +++ b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py @@ -66,17 +66,49 @@ def __init__( self._streams_done: Set[str] = set() self._exceptions_per_stream_name: dict[str, List[Exception]] = {} + # Track which streams (by name) are currently active + # A stream is "active" if it's generating partitions or has partitions being read + self._active_stream_names: Set[str] = set() + + # Store which streams require blocking simultaneous reads + self._stream_block_simultaneous_read: Dict[str, bool] = { + stream.name: stream.block_simultaneous_read for stream in stream_instances_to_read_from + } + + for stream in stream_instances_to_read_from: + if stream.block_simultaneous_read: + self._logger.info( + f"Stream '{stream.name}' has block_simultaneous_read=True. " + f"Will defer starting this stream if it or its parents are active." 
+ ) + def on_partition_generation_completed( self, sentinel: PartitionGenerationCompletedSentinel ) -> Iterable[AirbyteMessage]: """ This method is called when a partition generation is completed. 1. Remove the stream from the list of streams currently generating partitions - 2. If the stream is done, mark it as such and return a stream status message - 3. If there are more streams to read from, start the next partition generator + 2. Deactivate parent streams (they were only needed for partition generation) + 3. If the stream is done, mark it as such and return a stream status message + 4. If there are more streams to read from, start the next partition generator """ stream_name = sentinel.stream.name self._streams_currently_generating_partitions.remove(sentinel.stream.name) + + # Deactivate all parent streams now that partition generation is complete + # Parents were only needed to generate slices, they can now be reused + parent_streams = self._collect_all_parent_stream_names(stream_name) + for parent_stream_name in parent_streams: + if parent_stream_name in self._active_stream_names: + self._logger.debug(f"Removing '{parent_stream_name}' from active streams") + self._active_stream_names.discard(parent_stream_name) + if self._stream_block_simultaneous_read.get(parent_stream_name, False): + self._logger.info( + f"Parent stream '{parent_stream_name}' deactivated after " + f"partition generation completed for child '{stream_name}'. " + f"Blocked streams in the queue will be retried on next start_next_partition_generator call." + ) + # It is possible for the stream to already be done if no partitions were generated # If the partition generation process was completed and there are no partitions left to process, the stream is done if ( @@ -183,24 +215,81 @@ def _flag_exception(self, stream_name: str, exception: Exception) -> None: def start_next_partition_generator(self) -> Optional[AirbyteMessage]: """ - Start the next partition generator. - 1. 
Pop the next stream to read from - 2. Submit the partition generator to the thread pool manager - 3. Add the stream to the list of streams currently generating partitions - 4. Return a stream status message + Submits the next partition generator to the thread pool. + + A stream will be deferred (moved to end of queue) if: + 1. The stream itself has block_simultaneous_read=True AND is already active + 2. Any parent stream has block_simultaneous_read=True AND is currently active + + This prevents simultaneous reads of streams that shouldn't be accessed concurrently. + + :return: A status message if a partition generator was started, otherwise None """ - if self._stream_instances_to_start_partition_generation: + if not self._stream_instances_to_start_partition_generation: + return None + + # Remember initial queue size to avoid infinite loops if all streams are blocked + max_attempts = len(self._stream_instances_to_start_partition_generation) + attempts = 0 + + while self._stream_instances_to_start_partition_generation and attempts < max_attempts: + attempts += 1 + + # Pop the first stream from the queue stream = self._stream_instances_to_start_partition_generation.pop(0) + stream_name = stream.name + + # Check if this stream has block_simultaneous_read and is already active + if self._stream_block_simultaneous_read.get(stream_name, False) and stream_name in self._active_stream_names: + # Add back to the END of the queue for retry later + self._stream_instances_to_start_partition_generation.append(stream) + self._logger.info( + f"Deferring stream '{stream_name}' because it's already active " + f"(block_simultaneous_read=True). Trying next stream." 
+ ) + continue # Try the next stream in the queue + + # Check if any parent streams have block_simultaneous_read and are currently active + parent_streams = self._collect_all_parent_stream_names(stream_name) + blocked_by_parents = [ + p for p in parent_streams + if self._stream_block_simultaneous_read.get(p, False) and p in self._active_stream_names + ] + + if blocked_by_parents: + # Add back to the END of the queue for retry later + self._stream_instances_to_start_partition_generation.append(stream) + self._logger.info( + f"Deferring stream '{stream_name}' because parent stream(s) " + f"{blocked_by_parents} are active and have block_simultaneous_read=True. Trying next stream." + ) + continue # Try the next stream in the queue + + # No blocking - start this stream + # Mark stream as active before starting + self._active_stream_names.add(stream_name) + self._streams_currently_generating_partitions.append(stream_name) + + # Also mark all parent streams as active (they will be read from during partition generation) + parent_streams = self._collect_all_parent_stream_names(stream_name) + for parent_stream_name in parent_streams: + if self._stream_block_simultaneous_read.get(parent_stream_name, False): + self._active_stream_names.add(parent_stream_name) + self._logger.info( + f"Marking parent stream '{parent_stream_name}' as active " + f"(will be read during partition generation for '{stream_name}')" + ) + self._thread_pool_manager.submit(self._partition_enqueuer.generate_partitions, stream) - self._streams_currently_generating_partitions.append(stream.name) - self._logger.info(f"Marking stream {stream.name} as STARTED") - self._logger.info(f"Syncing stream: {stream.name} ") + self._logger.info(f"Marking stream {stream_name} as STARTED") + self._logger.info(f"Syncing stream: {stream_name}") return stream_status_as_airbyte_message( stream.as_airbyte_stream(), AirbyteStreamStatus.STARTED, ) - else: - return None + + # All streams in the queue are currently blocked + return 
None def is_done(self) -> bool: """ @@ -232,6 +321,43 @@ def is_done(self) -> bool: def _is_stream_done(self, stream_name: str) -> bool: return stream_name in self._streams_done + def _collect_all_parent_stream_names(self, stream_name: str) -> Set[str]: + """ + Recursively collect all parent stream names for a given stream. + For example, if we have: epics -> issues -> comments + Then for comments, this returns {issues, epics} + + :param stream_name: The stream to collect parents for + :return: Set of all parent stream names (recursively) + """ + parent_names: Set[str] = set() + stream = self._stream_name_to_instance.get(stream_name) + + if not stream: + return parent_names + + # Get partition router if it exists (this is where parent streams are defined) + partition_router = None + + # Try DefaultStream path first (_stream_partition_generator._stream_slicer._partition_router) + if hasattr(stream, "_stream_partition_generator") and hasattr(stream._stream_partition_generator, "_stream_slicer") and hasattr(stream._stream_partition_generator._stream_slicer, "_partition_router"): + partition_router = stream._stream_partition_generator._stream_slicer._partition_router + # Fallback to legacy path (retriever.partition_router) for backward compatibility and test mocks + elif hasattr(stream, "retriever") and hasattr(stream.retriever, "partition_router"): + partition_router = stream.retriever.partition_router + + # SubstreamPartitionRouter has parent_stream_configs + if partition_router and hasattr(partition_router, "parent_stream_configs"): + for parent_config in partition_router.parent_stream_configs: + parent_stream = parent_config.stream + parent_name = parent_stream.name + parent_names.add(parent_name) + + # Recursively collect grandparents, great-grandparents, etc. 
+ parent_names.update(self._collect_all_parent_stream_names(parent_name)) + + return parent_names + def _on_stream_is_done(self, stream_name: str) -> Iterable[AirbyteMessage]: self._logger.info( f"Read {self._record_counter[stream_name]} records from {stream_name} stream" @@ -248,3 +374,12 @@ def _on_stream_is_done(self, stream_name: str) -> Iterable[AirbyteMessage]: else AirbyteStreamStatus.COMPLETE ) yield stream_status_as_airbyte_message(stream.as_airbyte_stream(), stream_status) + + # Remove only this stream from active set (NOT parents) + if stream_name in self._active_stream_names: + self._active_stream_names.discard(stream_name) + if self._stream_block_simultaneous_read.get(stream_name, False): + self._logger.info( + f"Stream '{stream_name}' is no longer active. " + f"Blocked streams in the queue will be retried on next start_next_partition_generator call." + ) diff --git a/airbyte_cdk/sources/concurrent_source/concurrent_source.py b/airbyte_cdk/sources/concurrent_source/concurrent_source.py index de2d93523..241282fa5 100644 --- a/airbyte_cdk/sources/concurrent_source/concurrent_source.py +++ b/airbyte_cdk/sources/concurrent_source/concurrent_source.py @@ -77,7 +77,7 @@ def __init__( slice_logger: SliceLogger = DebugSliceLogger(), queue: Optional[Queue[QueueItem]] = None, message_repository: MessageRepository = InMemoryMessageRepository(), - initial_number_partitions_to_generate: int = 1, + initial_number_partitions_to_generate: int = 1, timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS, ) -> None: """ @@ -92,7 +92,7 @@ def __init__( self._logger = logger self._slice_logger = slice_logger self._message_repository = message_repository - self._initial_number_partitions_to_generate = initial_number_partitions_to_generate + self._initial_number_partitions_to_generate = initial_number_partitions_to_generate self._timeout_seconds = timeout_seconds # We set a maxsize to for the main thread to process record items when the queue size grows.
This assumes that there are less diff --git a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml index 8c87508cd..81afc949c 100644 --- a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml +++ b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml @@ -1560,6 +1560,17 @@ definitions: default: "" example: - "Users" + block_simultaneous_read: + title: Block Simultaneous Read + description: > + When true, prevents simultaneous reading of this stream from multiple contexts + (e.g., as both a parent stream and a standalone stream). If the stream OR any + of its parent streams are currently active, this stream will be deferred until + they finish. This is useful for APIs that don't allow concurrent access to the + same endpoint. Default is false for backward compatibility. + Only applies to ConcurrentDeclarativeSource. + type: boolean + default: false retriever: title: Retriever description: Component used to coordinate how records are extracted across stream slices and request pages. diff --git a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py index 93e6865d8..e43b05f90 100644 --- a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py +++ b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py @@ -2544,6 +2544,11 @@ class Config: type: Literal["DeclarativeStream"] name: Optional[str] = Field("", description="The stream name.", example=["Users"], title="Name") + block_simultaneous_read: Optional[bool] = Field( + False, + description="When true, prevents simultaneous reading of this stream from multiple contexts (e.g., as both a parent stream and a standalone stream). If the stream OR any of its parent streams are currently active, this stream will be deferred until they finish. 
This is useful for APIs that don't allow concurrent access to the same endpoint. Default is false for backward compatibility. Only applies to ConcurrentDeclarativeSource.\n", + title="Block Simultaneous Read", + ) retriever: Union[SimpleRetriever, AsyncRetriever, CustomRetriever] = Field( ..., description="Component used to coordinate how records are extracted across stream slices and request pages.", diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 560dd4056..279d113de 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -2145,6 +2145,7 @@ def create_default_stream( logger=logging.getLogger(f"airbyte.{stream_name}"), cursor=concurrent_cursor, supports_file_transfer=hasattr(model, "file_uploader") and bool(model.file_uploader), + block_simultaneous_read=model.block_simultaneous_read or False, ) def _migrate_state(self, model: DeclarativeStreamModel, config: Config) -> None: diff --git a/airbyte_cdk/sources/streams/concurrent/abstract_stream.py b/airbyte_cdk/sources/streams/concurrent/abstract_stream.py index 667d088ab..0052eafa3 100644 --- a/airbyte_cdk/sources/streams/concurrent/abstract_stream.py +++ b/airbyte_cdk/sources/streams/concurrent/abstract_stream.py @@ -85,6 +85,17 @@ def cursor(self) -> Cursor: :return: The cursor associated with this stream. """ + @property + def block_simultaneous_read(self) -> bool: + """ + Override to return True if this stream should block simultaneous reads. + When True, prevents starting partition generation for this stream if it + OR any of its parent streams are already active. 
+ + :return: True if simultaneous reads should be blocked, False otherwise + """ + return False # Default: allow concurrent reading + @abstractmethod def check_availability(self) -> StreamAvailability: """ diff --git a/airbyte_cdk/sources/streams/concurrent/default_stream.py b/airbyte_cdk/sources/streams/concurrent/default_stream.py index f5d4ccf2e..84d955105 100644 --- a/airbyte_cdk/sources/streams/concurrent/default_stream.py +++ b/airbyte_cdk/sources/streams/concurrent/default_stream.py @@ -26,6 +26,7 @@ def __init__( cursor: Cursor, namespace: Optional[str] = None, supports_file_transfer: bool = False, + block_simultaneous_read: bool = False, ) -> None: self._stream_partition_generator = partition_generator self._name = name @@ -36,6 +37,7 @@ def __init__( self._cursor = cursor self._namespace = namespace self._supports_file_transfer = supports_file_transfer + self._block_simultaneous_read = block_simultaneous_read def generate_partitions(self) -> Iterable[Partition]: yield from self._stream_partition_generator.generate() @@ -94,6 +96,11 @@ def log_stream_sync_configuration(self) -> None: def cursor(self) -> Cursor: return self._cursor + @property + def block_simultaneous_read(self) -> bool: + """Returns whether this stream should block simultaneous reads""" + return self._block_simultaneous_read + def check_availability(self) -> StreamAvailability: """ Check stream availability by attempting to read the first record of the stream. 
diff --git a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py index a5da89e4b..fcb71c8c6 100644 --- a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +++ b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py @@ -5281,6 +5281,144 @@ def test_catalog_defined_cursor_field_stream_missing(): assert stream._cursor_field.supports_catalog_defined_cursor_field == True +def test_block_simultaneous_read_from_manifest(): + """Test that block_simultaneous_read flows through from manifest to DefaultStream""" + content = """ + parent_stream: + type: DeclarativeStream + name: "parent" + primary_key: "id" + block_simultaneous_read: true + retriever: + type: SimpleRetriever + requester: + type: HttpRequester + url_base: "https://api.example.com" + path: "/parent" + http_method: "GET" + authenticator: + type: BearerAuthenticator + api_token: "{{ config['api_key'] }}" + record_selector: + type: RecordSelector + extractor: + type: DpathExtractor + field_path: [] + schema_loader: + type: InlineSchemaLoader + schema: + type: object + properties: + id: + type: string + + child_stream: + type: DeclarativeStream + name: "child" + primary_key: "id" + block_simultaneous_read: true + retriever: + type: SimpleRetriever + requester: + type: HttpRequester + url_base: "https://api.example.com" + path: "/child" + http_method: "GET" + authenticator: + type: BearerAuthenticator + api_token: "{{ config['api_key'] }}" + record_selector: + type: RecordSelector + extractor: + type: DpathExtractor + field_path: [] + partition_router: + type: SubstreamPartitionRouter + parent_stream_configs: + - type: ParentStreamConfig + stream: "#/parent_stream" + parent_key: "id" + partition_field: "parent_id" + schema_loader: + type: InlineSchemaLoader + schema: + type: object + properties: + id: + type: string + parent_id: + type: string + + no_block_stream: + type: 
DeclarativeStream + name: "no_block" + primary_key: "id" + retriever: + type: SimpleRetriever + requester: + type: HttpRequester + url_base: "https://api.example.com" + path: "/no_block" + http_method: "GET" + authenticator: + type: BearerAuthenticator + api_token: "{{ config['api_key'] }}" + record_selector: + type: RecordSelector + extractor: + type: DpathExtractor + field_path: [] + schema_loader: + type: InlineSchemaLoader + schema: + type: object + properties: + id: + type: string + """ + + config = {"api_key": "test_key"} + + parsed_manifest = YamlDeclarativeSource._parse(content) + resolved_manifest = resolver.preprocess_manifest(parsed_manifest) + + # Test parent stream with block_simultaneous_read: true + parent_manifest = transformer.propagate_types_and_parameters( + "", resolved_manifest["parent_stream"], {} + ) + parent_stream: DefaultStream = factory.create_component( + model_type=DeclarativeStreamModel, component_definition=parent_manifest, config=config + ) + + assert isinstance(parent_stream, DefaultStream) + assert parent_stream.name == "parent" + assert parent_stream.block_simultaneous_read is True + + # Test child stream with block_simultaneous_read: true + child_manifest = transformer.propagate_types_and_parameters( + "", resolved_manifest["child_stream"], {} + ) + child_stream: DefaultStream = factory.create_component( + model_type=DeclarativeStreamModel, component_definition=child_manifest, config=config + ) + + assert isinstance(child_stream, DefaultStream) + assert child_stream.name == "child" + assert child_stream.block_simultaneous_read is True + + # Test stream without block_simultaneous_read (should default to False) + no_block_manifest = transformer.propagate_types_and_parameters( + "", resolved_manifest["no_block_stream"], {} + ) + no_block_stream: DefaultStream = factory.create_component( + model_type=DeclarativeStreamModel, component_definition=no_block_manifest, config=config + ) + + assert isinstance(no_block_stream, DefaultStream) 
+ assert no_block_stream.name == "no_block" + assert no_block_stream.block_simultaneous_read is False + + def get_schema_loader(stream: DefaultStream): assert isinstance( stream._stream_partition_generator._partition_factory._schema_loader, diff --git a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py index 94654713f..0e6887441 100644 --- a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +++ b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py @@ -843,3 +843,472 @@ def test_start_next_partition_generator(self): self._thread_pool_manager.submit.assert_called_with( self._partition_enqueuer.generate_partitions, self._stream ) + + +class TestBlockSimultaneousRead(unittest.TestCase): + """Tests for block_simultaneous_read functionality""" + + def setUp(self): + self._partition_enqueuer = Mock(spec=PartitionEnqueuer) + self._thread_pool_manager = Mock(spec=ThreadPoolManager) + self._logger = Mock(spec=logging.Logger) + self._slice_logger = Mock(spec=SliceLogger) + self._message_repository = Mock(spec=MessageRepository) + self._message_repository.consume_queue.return_value = [] + self._partition_reader = Mock(spec=PartitionReader) + + def _create_mock_stream(self, name: str, block_simultaneous_read: bool = False): + """Helper to create a mock stream""" + stream = Mock(spec=AbstractStream) + stream.name = name + stream.block_simultaneous_read = block_simultaneous_read + stream.as_airbyte_stream.return_value = AirbyteStream( + name=name, + json_schema={}, + supported_sync_modes=[SyncMode.full_refresh], + ) + stream.cursor.ensure_at_least_one_state_emitted = Mock() + return stream + + def _create_mock_stream_with_parent( + self, name: str, parent_stream, block_simultaneous_read: bool = False + ): + """Helper to create a mock stream with a parent stream""" + stream = self._create_mock_stream(name, block_simultaneous_read) + + # Mock the 
retriever and partition router for parent relationship + mock_retriever = Mock() + mock_partition_router = Mock() + mock_parent_config = Mock() + mock_parent_config.stream = parent_stream + + mock_partition_router.parent_stream_configs = [mock_parent_config] + mock_retriever.partition_router = mock_partition_router + stream.retriever = mock_retriever + + return stream + + def test_defer_stream_when_self_active(self): + """Test that a stream is deferred when it's already active""" + stream = self._create_mock_stream("stream1", block_simultaneous_read=True) + + handler = ConcurrentReadProcessor( + [stream], + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + # Mark stream as active + handler._active_stream_names.add("stream1") + + # Try to start the stream again + result = handler.start_next_partition_generator() + + # Should return None (no stream started) + assert result is None + + # Stream should be back in the queue + assert len(handler._stream_instances_to_start_partition_generation) == 1 + assert handler._stream_instances_to_start_partition_generation[0] == stream + + # Logger should have been called to log deferral + assert any( + "Deferring stream 'stream1' because it's already active" in str(call) + for call in self._logger.info.call_args_list + ) + + def test_defer_stream_when_parent_active(self): + """Test that a stream is deferred when its parent is active""" + parent_stream = self._create_mock_stream("parent", block_simultaneous_read=True) + child_stream = self._create_mock_stream_with_parent( + "child", parent_stream, block_simultaneous_read=True + ) + + handler = ConcurrentReadProcessor( + [parent_stream, child_stream], + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + # Mark parent as active + handler._active_stream_names.add("parent") + + # 
Remove parent from queue (simulate it's already started) + handler._stream_instances_to_start_partition_generation = [child_stream] + + # Try to start child + result = handler.start_next_partition_generator() + + # Should return None (child deferred) + assert result is None + + # Child should be back in the queue + assert len(handler._stream_instances_to_start_partition_generation) == 1 + assert handler._stream_instances_to_start_partition_generation[0] == child_stream + + # Logger should have been called + assert any( + "Deferring stream 'child' because parent stream(s)" in str(call) + for call in self._logger.info.call_args_list + ) + + def test_defer_stream_when_grandparent_active(self): + """Test that a stream is deferred when its grandparent is active""" + grandparent = self._create_mock_stream("grandparent", block_simultaneous_read=True) + parent = self._create_mock_stream_with_parent( + "parent", grandparent, block_simultaneous_read=True + ) + child = self._create_mock_stream_with_parent("child", parent, block_simultaneous_read=True) + + handler = ConcurrentReadProcessor( + [grandparent, parent, child], + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + # Mark grandparent as active + handler._active_stream_names.add("grandparent") + + # Only child in queue + handler._stream_instances_to_start_partition_generation = [child] + + # Try to start child + result = handler.start_next_partition_generator() + + # Should return None (child deferred because grandparent is active) + assert result is None + + # Child should be back in the queue + assert len(handler._stream_instances_to_start_partition_generation) == 1 + + def test_retry_blocked_stream_after_blocker_done(self): + """Test that blocked stream is retried after blocker finishes""" + stream1 = self._create_mock_stream("stream1", block_simultaneous_read=True) + stream2 = self._create_mock_stream("stream2", 
block_simultaneous_read=True) + + handler = ConcurrentReadProcessor( + [stream1, stream2], + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + # Start stream1 + handler.start_next_partition_generator() + assert "stream1" in handler._active_stream_names + + # stream1 and stream2 are independent (not parent-child), so only re-starting + # the same stream is blocked; re-seed the queue with both streams while + # stream1 is still active to exercise the deferral-and-retry path + + # Mark stream1 as active to simulate it's running + handler._active_stream_names.add("stream1") + handler._stream_instances_to_start_partition_generation = [stream1, stream2] + + # Try to start stream1 again (should be deferred because already active) + result = handler.start_next_partition_generator() + + # Should start stream2 instead (stream1 was deferred) + assert result is not None + assert "stream2" in handler._active_stream_names + assert len(handler._stream_instances_to_start_partition_generation) == 1 + + def test_retry_blocked_stream_after_partition_generation(self): + """Test that blocked stream is retried after partition generation completes""" + parent = self._create_mock_stream("parent", block_simultaneous_read=True) + child = self._create_mock_stream_with_parent("child", parent, block_simultaneous_read=True) + + handler = ConcurrentReadProcessor( + [parent, child], + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + # Start parent + handler.start_next_partition_generator() + assert "parent" in handler._active_stream_names + + # Mark parent as generating partitions and having no partitions + handler._streams_currently_generating_partitions.append("parent") + handler._streams_to_running_partitions["parent"] = set() + + # Complete partition generation for parent (parent has no partitions, so
it's done) + sentinel = PartitionGenerationCompletedSentinel(parent) + messages = list(handler.on_partition_generation_completed(sentinel)) + + # Child should have been started automatically by on_partition_generation_completed + # (it calls start_next_partition_generator internally) + assert "child" in handler._active_stream_names + + # Parent should be RE-ACTIVATED because child needs to read from it during partition generation + # This is the correct behavior - prevents simultaneous reads of parent + assert "parent" in handler._active_stream_names + + # Verify the queue is now empty (both streams were started) + assert len(handler._stream_instances_to_start_partition_generation) == 0 + + def test_blocked_stream_added_to_end_of_queue(self): + """Test that blocked streams are added to the end of the queue""" + stream1 = self._create_mock_stream("stream1", block_simultaneous_read=True) + stream2 = self._create_mock_stream("stream2", block_simultaneous_read=False) + stream3 = self._create_mock_stream("stream3", block_simultaneous_read=False) + + handler = ConcurrentReadProcessor( + [stream1, stream2, stream3], + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + # Mark stream1 as active + handler._active_stream_names.add("stream1") + + # Try to start streams in order: stream1, stream2, stream3 + result1 = handler.start_next_partition_generator() + + # stream1 should be deferred, stream2 should start + assert result1 is not None + assert "stream2" in handler._active_stream_names + + # Queue should now be [stream3, stream1] (stream1 moved to end) + assert len(handler._stream_instances_to_start_partition_generation) == 2 + assert handler._stream_instances_to_start_partition_generation[0] == stream3 + assert handler._stream_instances_to_start_partition_generation[1] == stream1 + + def test_no_defer_when_flag_false(self): + """Test that blocking doesn't occur when 
block_simultaneous_read=False""" + stream = self._create_mock_stream("stream1", block_simultaneous_read=False) + + handler = ConcurrentReadProcessor( + [stream], + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + # Mark stream as active + handler._active_stream_names.add("stream1") + + # Try to start the stream again (should succeed because flag is False) + result = handler.start_next_partition_generator() + + # Should return a status message (stream started) + assert result is not None + assert isinstance(result, AirbyteMessage) + + # Queue should be empty + assert len(handler._stream_instances_to_start_partition_generation) == 0 + + def test_collect_parent_streams_multi_level(self): + """Test that _collect_all_parent_stream_names works recursively""" + grandparent = self._create_mock_stream("grandparent") + parent = self._create_mock_stream_with_parent("parent", grandparent) + child = self._create_mock_stream_with_parent("child", parent) + + handler = ConcurrentReadProcessor( + [grandparent, parent, child], + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + # Collect parents for child + parents = handler._collect_all_parent_stream_names("child") + + # Should include both parent and grandparent + assert "parent" in parents + assert "grandparent" in parents + assert len(parents) == 2 + + def test_deactivate_parents_when_partition_generation_completes(self): + """Test that parent streams are deactivated when partition generation completes""" + parent = self._create_mock_stream("parent", block_simultaneous_read=True) + child = self._create_mock_stream_with_parent("child", parent, block_simultaneous_read=True) + + handler = ConcurrentReadProcessor( + [parent, child], + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, 
+ self._message_repository, + self._partition_reader, + ) + + # Manually mark both as active (simulating partition generation for child) + handler._active_stream_names.add("parent") + handler._active_stream_names.add("child") + handler._streams_currently_generating_partitions.append("child") + + # Ensure child has running partitions (so it doesn't trigger _on_stream_is_done) + mock_partition = Mock(spec=Partition) + mock_partition.stream_name.return_value = "child" + handler._streams_to_running_partitions["child"] = {mock_partition} + + # Remove both streams from the queue so start_next_partition_generator doesn't start them + # This simulates the scenario where both streams have already been started + handler._stream_instances_to_start_partition_generation = [] + + # Complete partition generation for child + sentinel = PartitionGenerationCompletedSentinel(child) + + list(handler.on_partition_generation_completed(sentinel)) + + # Parent should be deactivated (it was only needed for partition generation) + assert "parent" not in handler._active_stream_names + + # Child should still be active (it's reading records) + assert "child" in handler._active_stream_names + + def test_deactivate_only_stream_when_done(self): + """Test that only the stream itself is deactivated when done, not parents""" + parent = self._create_mock_stream("parent", block_simultaneous_read=True) + child = self._create_mock_stream_with_parent("child", parent, block_simultaneous_read=True) + + handler = ConcurrentReadProcessor( + [parent, child], + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + # Mark both as active + handler._active_stream_names.add("parent") + handler._active_stream_names.add("child") + + # Start child and mark it as done + handler._stream_instances_to_start_partition_generation = [] + handler._streams_currently_generating_partitions = [] + 
handler._streams_to_running_partitions["child"] = set() + + # Call _on_stream_is_done for child + list(handler._on_stream_is_done("child")) + + # Child should be deactivated + assert "child" not in handler._active_stream_names + + # Parent should still be active (not deactivated) + assert "parent" in handler._active_stream_names + + def test_multiple_blocked_streams_retry_in_order(self): + """Test that multiple blocked streams are retried in order""" + parent = self._create_mock_stream("parent", block_simultaneous_read=True) + child1 = self._create_mock_stream_with_parent( + "child1", parent, block_simultaneous_read=True + ) + child2 = self._create_mock_stream_with_parent( + "child2", parent, block_simultaneous_read=True + ) + + handler = ConcurrentReadProcessor( + [parent, child1, child2], + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + # Start parent + result = handler.start_next_partition_generator() + assert result is not None + assert "parent" in handler._active_stream_names + + # Try to start child1 (should be deferred) + result = handler.start_next_partition_generator() + # child1 is deferred, but child2 might start if it's not blocked + # Let me check the queue state + + # Both children should be deferred (parent is active) + assert len(handler._stream_instances_to_start_partition_generation) >= 1 + + def test_child_without_flag_blocked_by_parent_with_flag(self): + """Test that a child WITHOUT block_simultaneous_read is blocked by parent WITH the flag""" + # Parent has the flag, child does NOT + parent = self._create_mock_stream("parent", block_simultaneous_read=True) + child = self._create_mock_stream_with_parent("child", parent, block_simultaneous_read=False) + + handler = ConcurrentReadProcessor( + [parent, child], + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + 
self._partition_reader, + ) + + # Mark parent as active and already started (remove from queue) + handler._active_stream_names.add("parent") + handler._stream_instances_to_start_partition_generation.remove(parent) + + # Try to start child (should be deferred even though child doesn't have the flag) + result = handler.start_next_partition_generator() + + # Child should be deferred because parent has block_simultaneous_read=True and is active + assert result is None # No stream started + assert "child" not in handler._active_stream_names + # Child should be moved to end of queue (still 1 stream in queue) + assert len(handler._stream_instances_to_start_partition_generation) == 1 + assert handler._stream_instances_to_start_partition_generation[0] == child + + def test_child_with_flag_not_blocked_by_parent_without_flag(self): + """Test that a child WITH block_simultaneous_read is NOT blocked by parent WITHOUT the flag""" + # Parent does NOT have the flag, child does + parent = self._create_mock_stream("parent", block_simultaneous_read=False) + child = self._create_mock_stream_with_parent("child", parent, block_simultaneous_read=True) + + handler = ConcurrentReadProcessor( + [parent, child], + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + # Mark parent as active and already started (remove from queue) + handler._active_stream_names.add("parent") + handler._stream_instances_to_start_partition_generation.remove(parent) + + # Try to start child (should succeed even though parent is active) + result = handler.start_next_partition_generator() + + # Child should start successfully because parent doesn't have block_simultaneous_read + assert result is not None # Stream started + assert "child" in handler._active_stream_names + # Queue should now be empty (both streams started) + assert len(handler._stream_instances_to_start_partition_generation) == 0 From 
54631ac4f836d4af1661eb6ab48e0d7e12d0f376 Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Tue, 6 Jan 2026 17:16:31 +0200 Subject: [PATCH 02/30] Change `block_simultaneous_read` to string --- .../concurrent_read_processor.py | 92 ++++++++++--- .../concurrent_source/concurrent_source.py | 4 +- .../declarative_component_schema.yaml | 25 ++-- .../models/declarative_component_schema.py | 6 +- .../parsers/model_to_component_factory.py | 2 +- .../streams/concurrent/abstract_stream.py | 16 ++- .../streams/concurrent/default_stream.py | 6 +- .../test_model_to_component_factory.py | 14 +- .../test_concurrent_read_processor.py | 127 ++++++++++++------ 9 files changed, 201 insertions(+), 91 deletions(-) diff --git a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py index 6aaa368b4..d2de2585b 100644 --- a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +++ b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py @@ -70,16 +70,21 @@ def __init__( # A stream is "active" if it's generating partitions or has partitions being read self._active_stream_names: Set[str] = set() - # Store which streams require blocking simultaneous reads - self._stream_block_simultaneous_read: Dict[str, bool] = { + # Store blocking group names for streams that require blocking simultaneous reads + # Maps stream name -> group name (empty string means no blocking) + self._stream_block_simultaneous_read: Dict[str, str] = { stream.name: stream.block_simultaneous_read for stream in stream_instances_to_read_from } + # Track which groups are currently active + # Maps group name -> set of stream names in that group + self._active_groups: Dict[str, Set[str]] = {} + for stream in stream_instances_to_read_from: if stream.block_simultaneous_read: self._logger.info( - f"Stream '{stream.name}' has block_simultaneous_read=True. " - f"Will defer starting this stream if it or its parents are active." 
+ f"Stream '{stream.name}' is in blocking group '{stream.block_simultaneous_read}'. " + f"Will defer starting this stream if another stream in the same group or its parents are active." ) def on_partition_generation_completed( @@ -102,9 +107,16 @@ def on_partition_generation_completed( if parent_stream_name in self._active_stream_names: self._logger.debug(f"Removing '{parent_stream_name}' from active streams") self._active_stream_names.discard(parent_stream_name) - if self._stream_block_simultaneous_read.get(parent_stream_name, False): + + # Remove from active groups + parent_group = self._stream_block_simultaneous_read.get(parent_stream_name, "") + if parent_group: + if parent_group in self._active_groups: + self._active_groups[parent_group].discard(parent_stream_name) + if not self._active_groups[parent_group]: + del self._active_groups[parent_group] self._logger.info( - f"Parent stream '{parent_stream_name}' deactivated after " + f"Parent stream '{parent_stream_name}' (group '{parent_group}') deactivated after " f"partition generation completed for child '{stream_name}'. " f"Blocked streams in the queue will be retried on next start_next_partition_generator call." ) @@ -238,30 +250,50 @@ def start_next_partition_generator(self) -> Optional[AirbyteMessage]: # Pop the first stream from the queue stream = self._stream_instances_to_start_partition_generation.pop(0) stream_name = stream.name + stream_group = self._stream_block_simultaneous_read.get(stream_name, "") + + # Check if this stream has a blocking group and is already active + if stream_group and stream_name in self._active_stream_names: + # Add back to the END of the queue for retry later + self._stream_instances_to_start_partition_generation.append(stream) + self._logger.info( + f"Deferring stream '{stream_name}' (group '{stream_group}') because it's already active. Trying next stream." 
+ ) + continue # Try the next stream in the queue - # Check if this stream has block_simultaneous_read and is already active - if self._stream_block_simultaneous_read.get(stream_name, False) and stream_name in self._active_stream_names: + # Check if this stream's group is already active (another stream in the same group is running) + if ( + stream_group + and stream_group in self._active_groups + and self._active_groups[stream_group] + ): # Add back to the END of the queue for retry later self._stream_instances_to_start_partition_generation.append(stream) + active_streams_in_group = self._active_groups[stream_group] self._logger.info( - f"Deferring stream '{stream_name}' because it's already active " - f"(block_simultaneous_read=True). Trying next stream." + f"Deferring stream '{stream_name}' (group '{stream_group}') because other stream(s) " + f"{active_streams_in_group} in the same group are active. Trying next stream." ) continue # Try the next stream in the queue - # Check if any parent streams have block_simultaneous_read and are currently active + # Check if any parent streams have a blocking group and are currently active parent_streams = self._collect_all_parent_stream_names(stream_name) blocked_by_parents = [ - p for p in parent_streams - if self._stream_block_simultaneous_read.get(p, False) and p in self._active_stream_names + p + for p in parent_streams + if self._stream_block_simultaneous_read.get(p, "") + and p in self._active_stream_names ] if blocked_by_parents: # Add back to the END of the queue for retry later self._stream_instances_to_start_partition_generation.append(stream) + parent_groups = { + self._stream_block_simultaneous_read.get(p, "") for p in blocked_by_parents + } self._logger.info( f"Deferring stream '{stream_name}' because parent stream(s) " - f"{blocked_by_parents} are active and have block_simultaneous_read=True. Trying next stream." + f"{blocked_by_parents} (groups {parent_groups}) are active. Trying next stream." 
) continue # Try the next stream in the queue @@ -270,13 +302,24 @@ def start_next_partition_generator(self) -> Optional[AirbyteMessage]: self._active_stream_names.add(stream_name) self._streams_currently_generating_partitions.append(stream_name) + # Track this stream in its group if it has one + if stream_group: + if stream_group not in self._active_groups: + self._active_groups[stream_group] = set() + self._active_groups[stream_group].add(stream_name) + self._logger.debug(f"Added '{stream_name}' to active group '{stream_group}'") + # Also mark all parent streams as active (they will be read from during partition generation) parent_streams = self._collect_all_parent_stream_names(stream_name) for parent_stream_name in parent_streams: - if self._stream_block_simultaneous_read.get(parent_stream_name, False): + parent_group = self._stream_block_simultaneous_read.get(parent_stream_name, "") + if parent_group: self._active_stream_names.add(parent_stream_name) + if parent_group not in self._active_groups: + self._active_groups[parent_group] = set() + self._active_groups[parent_group].add(parent_stream_name) self._logger.info( - f"Marking parent stream '{parent_stream_name}' as active " + f"Marking parent stream '{parent_stream_name}' (group '{parent_group}') as active " f"(will be read during partition generation for '{stream_name}')" ) @@ -340,7 +383,11 @@ def _collect_all_parent_stream_names(self, stream_name: str) -> Set[str]: partition_router = None # Try DefaultStream path first (_stream_partition_generator._stream_slicer._partition_router) - if hasattr(stream, "_stream_partition_generator") and hasattr(stream._stream_partition_generator, "_stream_slicer") and hasattr(stream._stream_partition_generator._stream_slicer, "_partition_router"): + if ( + hasattr(stream, "_stream_partition_generator") + and hasattr(stream._stream_partition_generator, "_stream_slicer") + and hasattr(stream._stream_partition_generator._stream_slicer, "_partition_router") + ): 
partition_router = stream._stream_partition_generator._stream_slicer._partition_router # Fallback to legacy path (retriever.partition_router) for backward compatibility and test mocks elif hasattr(stream, "retriever") and hasattr(stream.retriever, "partition_router"): @@ -378,8 +425,15 @@ def _on_stream_is_done(self, stream_name: str) -> Iterable[AirbyteMessage]: # Remove only this stream from active set (NOT parents) if stream_name in self._active_stream_names: self._active_stream_names.discard(stream_name) - if self._stream_block_simultaneous_read.get(stream_name, False): + + # Remove from active groups + stream_group = self._stream_block_simultaneous_read.get(stream_name, "") + if stream_group: + if stream_group in self._active_groups: + self._active_groups[stream_group].discard(stream_name) + if not self._active_groups[stream_group]: + del self._active_groups[stream_group] self._logger.info( - f"Stream '{stream_name}' is no longer active. " + f"Stream '{stream_name}' (group '{stream_group}') is no longer active. " f"Blocked streams in the queue will be retried on next start_next_partition_generator call." 
) diff --git a/airbyte_cdk/sources/concurrent_source/concurrent_source.py b/airbyte_cdk/sources/concurrent_source/concurrent_source.py index 241282fa5..de2d93523 100644 --- a/airbyte_cdk/sources/concurrent_source/concurrent_source.py +++ b/airbyte_cdk/sources/concurrent_source/concurrent_source.py @@ -77,7 +77,7 @@ def __init__( slice_logger: SliceLogger = DebugSliceLogger(), queue: Optional[Queue[QueueItem]] = None, message_repository: MessageRepository = InMemoryMessageRepository(), - initial_number_partitions_to_generate: int = 10, + initial_number_partitions_to_generate: int = 1, timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS, ) -> None: """ @@ -92,7 +92,7 @@ def __init__( self._logger = logger self._slice_logger = slice_logger self._message_repository = message_repository - self._initial_number_partitions_to_generate = 10 + self._initial_number_partitions_to_generate = initial_number_partitions_to_generate self._timeout_seconds = timeout_seconds # We set a maxsize to for the main thread to process record items when the queue size grows. This assumes that there are less diff --git a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml index 81afc949c..054bbdeaf 100644 --- a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml +++ b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml @@ -1563,14 +1563,23 @@ definitions: block_simultaneous_read: title: Block Simultaneous Read description: > - When true, prevents simultaneous reading of this stream from multiple contexts - (e.g., as both a parent stream and a standalone stream). If the stream OR any - of its parent streams are currently active, this stream will be deferred until - they finish. This is useful for APIs that don't allow concurrent access to the - same endpoint. Default is false for backward compatibility. - Only applies to ConcurrentDeclarativeSource. 
- type: boolean - default: false + Optional group name for blocking simultaneous reads. Streams with the same + block_simultaneous_read value will not be read concurrently. This prevents + duplicate API calls when a stream is used as both a standalone stream and a + parent stream, or when multiple streams share the same endpoint/session. + + If set to a non-empty string, the stream will be deferred if: + 1. Another stream in the same group is currently active + 2. Any parent stream is in an active group + + Examples: + - "issues_endpoint" - All streams with this value block each other + - "" or null - No blocking (default) + + This is useful for APIs that don't allow concurrent access to the same + endpoint or session. Only applies to ConcurrentDeclarativeSource. + type: string + default: "" retriever: title: Retriever description: Component used to coordinate how records are extracted across stream slices and request pages. diff --git a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py index e43b05f90..3246d811b 100644 --- a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py +++ b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py @@ -2544,9 +2544,9 @@ class Config: type: Literal["DeclarativeStream"] name: Optional[str] = Field("", description="The stream name.", example=["Users"], title="Name") - block_simultaneous_read: Optional[bool] = Field( - False, - description="When true, prevents simultaneous reading of this stream from multiple contexts (e.g., as both a parent stream and a standalone stream). If the stream OR any of its parent streams are currently active, this stream will be deferred until they finish. This is useful for APIs that don't allow concurrent access to the same endpoint. Default is false for backward compatibility. 
Only applies to ConcurrentDeclarativeSource.\n", + block_simultaneous_read: Optional[str] = Field( + "", + description='Optional group name for blocking simultaneous reads. Streams with the same block_simultaneous_read value will not be read concurrently. This prevents duplicate API calls when a stream is used as both a standalone stream and a parent stream, or when multiple streams share the same endpoint/session.\nIf set to a non-empty string, the stream will be deferred if: 1. Another stream in the same group is currently active 2. Any parent stream is in an active group\nExamples: - "issues_endpoint" - All streams with this value block each other - "" or null - No blocking (default)\nThis is useful for APIs that don\'t allow concurrent access to the same endpoint or session. Only applies to ConcurrentDeclarativeSource.\n', title="Block Simultaneous Read", ) retriever: Union[SimpleRetriever, AsyncRetriever, CustomRetriever] = Field( diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 279d113de..27c750ffa 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -2145,7 +2145,7 @@ def create_default_stream( logger=logging.getLogger(f"airbyte.{stream_name}"), cursor=concurrent_cursor, supports_file_transfer=hasattr(model, "file_uploader") and bool(model.file_uploader), - block_simultaneous_read=model.block_simultaneous_read or False, + block_simultaneous_read=model.block_simultaneous_read or "", ) def _migrate_state(self, model: DeclarativeStreamModel, config: Config) -> None: diff --git a/airbyte_cdk/sources/streams/concurrent/abstract_stream.py b/airbyte_cdk/sources/streams/concurrent/abstract_stream.py index 0052eafa3..e7b24f614 100644 --- a/airbyte_cdk/sources/streams/concurrent/abstract_stream.py +++ 
b/airbyte_cdk/sources/streams/concurrent/abstract_stream.py @@ -86,15 +86,19 @@ def cursor(self) -> Cursor: """ @property - def block_simultaneous_read(self) -> bool: + def block_simultaneous_read(self) -> str: """ - Override to return True if this stream should block simultaneous reads. - When True, prevents starting partition generation for this stream if it - OR any of its parent streams are already active. + Override to return a non-empty group name if this stream should block simultaneous reads. + When a non-empty string is returned, prevents starting partition generation for this stream if: + - Another stream with the same group name is already active + - Any of its parent streams are in an active group - :return: True if simultaneous reads should be blocked, False otherwise + This allows grouping multiple streams that share the same resource (e.g., API endpoint or session) + to prevent them from running concurrently, even if they don't have a parent-child relationship. + + :return: Group name for blocking (non-empty string), or "" to allow concurrent reading """ - return False # Default: allow concurrent reading + return "" # Default: allow concurrent reading @abstractmethod def check_availability(self) -> StreamAvailability: diff --git a/airbyte_cdk/sources/streams/concurrent/default_stream.py b/airbyte_cdk/sources/streams/concurrent/default_stream.py index 84d955105..5ca11eaf8 100644 --- a/airbyte_cdk/sources/streams/concurrent/default_stream.py +++ b/airbyte_cdk/sources/streams/concurrent/default_stream.py @@ -26,7 +26,7 @@ def __init__( cursor: Cursor, namespace: Optional[str] = None, supports_file_transfer: bool = False, - block_simultaneous_read: bool = False, + block_simultaneous_read: str = "", ) -> None: self._stream_partition_generator = partition_generator self._name = name @@ -97,8 +97,8 @@ def cursor(self) -> Cursor: return self._cursor @property - def block_simultaneous_read(self) -> bool: - """Returns whether this stream should block 
simultaneous reads""" + def block_simultaneous_read(self) -> str: + """Returns the blocking group name for this stream, or empty string if no blocking""" return self._block_simultaneous_read def check_availability(self) -> StreamAvailability: diff --git a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py index fcb71c8c6..bc65f578e 100644 --- a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +++ b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py @@ -5288,7 +5288,7 @@ def test_block_simultaneous_read_from_manifest(): type: DeclarativeStream name: "parent" primary_key: "id" - block_simultaneous_read: true + block_simultaneous_read: "issues_endpoint" retriever: type: SimpleRetriever requester: @@ -5316,7 +5316,7 @@ def test_block_simultaneous_read_from_manifest(): type: DeclarativeStream name: "child" primary_key: "id" - block_simultaneous_read: true + block_simultaneous_read: "issues_endpoint" retriever: type: SimpleRetriever requester: @@ -5392,9 +5392,9 @@ def test_block_simultaneous_read_from_manifest(): assert isinstance(parent_stream, DefaultStream) assert parent_stream.name == "parent" - assert parent_stream.block_simultaneous_read is True + assert parent_stream.block_simultaneous_read == "issues_endpoint" - # Test child stream with block_simultaneous_read: true + # Test child stream with block_simultaneous_read: "issues_endpoint" child_manifest = transformer.propagate_types_and_parameters( "", resolved_manifest["child_stream"], {} ) @@ -5404,9 +5404,9 @@ def test_block_simultaneous_read_from_manifest(): assert isinstance(child_stream, DefaultStream) assert child_stream.name == "child" - assert child_stream.block_simultaneous_read is True + assert child_stream.block_simultaneous_read == "issues_endpoint" - # Test stream without block_simultaneous_read (should default to False) + # Test stream without 
block_simultaneous_read (should default to empty string) no_block_manifest = transformer.propagate_types_and_parameters( "", resolved_manifest["no_block_stream"], {} ) @@ -5416,7 +5416,7 @@ def test_block_simultaneous_read_from_manifest(): assert isinstance(no_block_stream, DefaultStream) assert no_block_stream.name == "no_block" - assert no_block_stream.block_simultaneous_read is False + assert no_block_stream.block_simultaneous_read == "" def get_schema_loader(stream: DefaultStream): diff --git a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py index 0e6887441..bd6114984 100644 --- a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +++ b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py @@ -857,7 +857,7 @@ def setUp(self): self._message_repository.consume_queue.return_value = [] self._partition_reader = Mock(spec=PartitionReader) - def _create_mock_stream(self, name: str, block_simultaneous_read: bool = False): + def _create_mock_stream(self, name: str, block_simultaneous_read: str = ""): """Helper to create a mock stream""" stream = Mock(spec=AbstractStream) stream.name = name @@ -871,7 +871,7 @@ def _create_mock_stream(self, name: str, block_simultaneous_read: bool = False): return stream def _create_mock_stream_with_parent( - self, name: str, parent_stream, block_simultaneous_read: bool = False + self, name: str, parent_stream, block_simultaneous_read: str = "" ): """Helper to create a mock stream with a parent stream""" stream = self._create_mock_stream(name, block_simultaneous_read) @@ -890,7 +890,7 @@ def _create_mock_stream_with_parent( def test_defer_stream_when_self_active(self): """Test that a stream is deferred when it's already active""" - stream = self._create_mock_stream("stream1", block_simultaneous_read=True) + stream = self._create_mock_stream("stream1", block_simultaneous_read="api_group") handler = 
ConcurrentReadProcessor( [stream], @@ -917,15 +917,16 @@ def test_defer_stream_when_self_active(self): # Logger should have been called to log deferral assert any( - "Deferring stream 'stream1' because it's already active" in str(call) + "Deferring stream 'stream1' (group 'api_group') because it's already active" + in str(call) for call in self._logger.info.call_args_list ) def test_defer_stream_when_parent_active(self): """Test that a stream is deferred when its parent is active""" - parent_stream = self._create_mock_stream("parent", block_simultaneous_read=True) + parent_stream = self._create_mock_stream("parent", block_simultaneous_read="api_group") child_stream = self._create_mock_stream_with_parent( - "child", parent_stream, block_simultaneous_read=True + "child", parent_stream, block_simultaneous_read="api_group" ) handler = ConcurrentReadProcessor( @@ -962,11 +963,13 @@ def test_defer_stream_when_parent_active(self): def test_defer_stream_when_grandparent_active(self): """Test that a stream is deferred when its grandparent is active""" - grandparent = self._create_mock_stream("grandparent", block_simultaneous_read=True) + grandparent = self._create_mock_stream("grandparent", block_simultaneous_read="api_group") parent = self._create_mock_stream_with_parent( - "parent", grandparent, block_simultaneous_read=True + "parent", grandparent, block_simultaneous_read="api_group" + ) + child = self._create_mock_stream_with_parent( + "child", parent, block_simultaneous_read="api_group" ) - child = self._create_mock_stream_with_parent("child", parent, block_simultaneous_read=True) handler = ConcurrentReadProcessor( [grandparent, parent, child], @@ -994,9 +997,9 @@ def test_defer_stream_when_grandparent_active(self): assert len(handler._stream_instances_to_start_partition_generation) == 1 def test_retry_blocked_stream_after_blocker_done(self): - """Test that blocked stream is retried after blocker finishes""" - stream1 = self._create_mock_stream("stream1", 
block_simultaneous_read=True) - stream2 = self._create_mock_stream("stream2", block_simultaneous_read=True) + """Test that independent streams with different groups don't block each other""" + stream1 = self._create_mock_stream("stream1", block_simultaneous_read="group1") + stream2 = self._create_mock_stream("stream2", block_simultaneous_read="group2") handler = ConcurrentReadProcessor( [stream1, stream2], @@ -1012,26 +1015,21 @@ def test_retry_blocked_stream_after_blocker_done(self): handler.start_next_partition_generator() assert "stream1" in handler._active_stream_names - # Try to start stream2 (should be deferred since stream1 is active and they share block flag) - # But wait - they're not parent-child, so stream2 should start successfully - # Let me fix the test logic - - # Mark stream1 as active to simulate it's running - handler._active_stream_names.add("stream1") - handler._stream_instances_to_start_partition_generation = [stream1, stream2] - - # Try to start stream1 again (should be deferred because already active) + # Stream2 should start successfully even though stream1 is active + # because they're in different groups result = handler.start_next_partition_generator() - # Should start stream2 instead (stream1 was deferred) + # Should start stream2 (different group, no blocking) assert result is not None assert "stream2" in handler._active_stream_names - assert len(handler._stream_instances_to_start_partition_generation) == 1 + assert len(handler._stream_instances_to_start_partition_generation) == 0 def test_retry_blocked_stream_after_partition_generation(self): """Test that blocked stream is retried after partition generation completes""" - parent = self._create_mock_stream("parent", block_simultaneous_read=True) - child = self._create_mock_stream_with_parent("child", parent, block_simultaneous_read=True) + parent = self._create_mock_stream("parent", block_simultaneous_read="api_group") + child = self._create_mock_stream_with_parent( + "child", parent, 
block_simultaneous_read="api_group" + ) handler = ConcurrentReadProcessor( [parent, child], @@ -1068,9 +1066,9 @@ def test_retry_blocked_stream_after_partition_generation(self): def test_blocked_stream_added_to_end_of_queue(self): """Test that blocked streams are added to the end of the queue""" - stream1 = self._create_mock_stream("stream1", block_simultaneous_read=True) - stream2 = self._create_mock_stream("stream2", block_simultaneous_read=False) - stream3 = self._create_mock_stream("stream3", block_simultaneous_read=False) + stream1 = self._create_mock_stream("stream1", block_simultaneous_read="api_group") + stream2 = self._create_mock_stream("stream2", block_simultaneous_read="") + stream3 = self._create_mock_stream("stream3", block_simultaneous_read="") handler = ConcurrentReadProcessor( [stream1, stream2, stream3], @@ -1098,8 +1096,8 @@ def test_blocked_stream_added_to_end_of_queue(self): assert handler._stream_instances_to_start_partition_generation[1] == stream1 def test_no_defer_when_flag_false(self): - """Test that blocking doesn't occur when block_simultaneous_read=False""" - stream = self._create_mock_stream("stream1", block_simultaneous_read=False) + """Test that blocking doesn't occur when block_simultaneous_read=""" "" + stream = self._create_mock_stream("stream1", block_simultaneous_read="") handler = ConcurrentReadProcessor( [stream], @@ -1150,8 +1148,10 @@ def test_collect_parent_streams_multi_level(self): def test_deactivate_parents_when_partition_generation_completes(self): """Test that parent streams are deactivated when partition generation completes""" - parent = self._create_mock_stream("parent", block_simultaneous_read=True) - child = self._create_mock_stream_with_parent("child", parent, block_simultaneous_read=True) + parent = self._create_mock_stream("parent", block_simultaneous_read="api_group") + child = self._create_mock_stream_with_parent( + "child", parent, block_simultaneous_read="api_group" + ) handler = ConcurrentReadProcessor( 
[parent, child], @@ -1190,8 +1190,10 @@ def test_deactivate_parents_when_partition_generation_completes(self): def test_deactivate_only_stream_when_done(self): """Test that only the stream itself is deactivated when done, not parents""" - parent = self._create_mock_stream("parent", block_simultaneous_read=True) - child = self._create_mock_stream_with_parent("child", parent, block_simultaneous_read=True) + parent = self._create_mock_stream("parent", block_simultaneous_read="api_group") + child = self._create_mock_stream_with_parent( + "child", parent, block_simultaneous_read="api_group" + ) handler = ConcurrentReadProcessor( [parent, child], @@ -1223,12 +1225,12 @@ def test_deactivate_only_stream_when_done(self): def test_multiple_blocked_streams_retry_in_order(self): """Test that multiple blocked streams are retried in order""" - parent = self._create_mock_stream("parent", block_simultaneous_read=True) + parent = self._create_mock_stream("parent", block_simultaneous_read="api_group") child1 = self._create_mock_stream_with_parent( - "child1", parent, block_simultaneous_read=True + "child1", parent, block_simultaneous_read="api_group" ) child2 = self._create_mock_stream_with_parent( - "child2", parent, block_simultaneous_read=True + "child2", parent, block_simultaneous_read="api_group" ) handler = ConcurrentReadProcessor( @@ -1257,8 +1259,8 @@ def test_multiple_blocked_streams_retry_in_order(self): def test_child_without_flag_blocked_by_parent_with_flag(self): """Test that a child WITHOUT block_simultaneous_read is blocked by parent WITH the flag""" # Parent has the flag, child does NOT - parent = self._create_mock_stream("parent", block_simultaneous_read=True) - child = self._create_mock_stream_with_parent("child", parent, block_simultaneous_read=False) + parent = self._create_mock_stream("parent", block_simultaneous_read="api_group") + child = self._create_mock_stream_with_parent("child", parent, block_simultaneous_read="") handler = ConcurrentReadProcessor( 
[parent, child], @@ -1277,7 +1279,7 @@ def test_child_without_flag_blocked_by_parent_with_flag(self): # Try to start child (should be deferred even though child doesn't have the flag) result = handler.start_next_partition_generator() - # Child should be deferred because parent has block_simultaneous_read=True and is active + # Child should be deferred because parent has block_simultaneous_read="api_group" and is active assert result is None # No stream started assert "child" not in handler._active_stream_names # Child should be moved to end of queue (still 1 stream in queue) @@ -1287,8 +1289,10 @@ def test_child_without_flag_blocked_by_parent_with_flag(self): def test_child_with_flag_not_blocked_by_parent_without_flag(self): """Test that a child WITH block_simultaneous_read is NOT blocked by parent WITHOUT the flag""" # Parent does NOT have the flag, child does - parent = self._create_mock_stream("parent", block_simultaneous_read=False) - child = self._create_mock_stream_with_parent("child", parent, block_simultaneous_read=True) + parent = self._create_mock_stream("parent", block_simultaneous_read="") + child = self._create_mock_stream_with_parent( + "child", parent, block_simultaneous_read="api_group" + ) handler = ConcurrentReadProcessor( [parent, child], @@ -1312,3 +1316,42 @@ def test_child_with_flag_not_blocked_by_parent_without_flag(self): assert "child" in handler._active_stream_names # Queue should now be empty (both streams started) assert len(handler._stream_instances_to_start_partition_generation) == 0 + + def test_unrelated_streams_in_same_group_block_each_other(self): + """Test that multiple unrelated streams with the same group name block each other""" + # Create three unrelated streams (no parent-child relationship) in the same group + stream1 = self._create_mock_stream("stream1", block_simultaneous_read="shared_endpoint") + stream2 = self._create_mock_stream("stream2", block_simultaneous_read="shared_endpoint") + stream3 = 
self._create_mock_stream("stream3", block_simultaneous_read="shared_endpoint") + + handler = ConcurrentReadProcessor( + [stream1, stream2, stream3], + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + # Start stream1 + result = handler.start_next_partition_generator() + assert result is not None + assert "stream1" in handler._active_stream_names + assert "shared_endpoint" in handler._active_groups + assert "stream1" in handler._active_groups["shared_endpoint"] + + # Try to start stream2 (should be deferred because it's in the same group) + result = handler.start_next_partition_generator() + # stream2 should be deferred, stream3 should also be deferred + # All three are in same group, only stream1 is active + assert result is None # No stream started + + # Both stream2 and stream3 should be in the queue + assert len(handler._stream_instances_to_start_partition_generation) == 2 + + # Verify logger was called with deferral message + assert any( + "Deferring stream 'stream2'" in str(call) and "shared_endpoint" in str(call) + for call in self._logger.info.call_args_list + ) From 5bf631ecfbbad617d4cc8e16d465353630521ad0 Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Tue, 6 Jan 2026 17:59:47 +0200 Subject: [PATCH 03/30] Fix StreamFacade --- airbyte_cdk/sources/streams/concurrent/adapters.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/airbyte_cdk/sources/streams/concurrent/adapters.py b/airbyte_cdk/sources/streams/concurrent/adapters.py index 41674bdae..a9bc47e0d 100644 --- a/airbyte_cdk/sources/streams/concurrent/adapters.py +++ b/airbyte_cdk/sources/streams/concurrent/adapters.py @@ -196,6 +196,11 @@ def cursor_field(self) -> Union[str, List[str]]: def cursor(self) -> Optional[Cursor]: # type: ignore[override] # StreamFaced expects to use only airbyte_cdk.sources.streams.concurrent.cursor.Cursor return self._cursor + @property + def 
block_simultaneous_read(self) -> str: + """Returns the blocking group name from the underlying stream""" + return self._abstract_stream.block_simultaneous_read + # FIXME the lru_cache seems to be mostly there because of typing issue @lru_cache(maxsize=None) def get_json_schema(self) -> Mapping[str, Any]: From 0758005c294d6e2b4adc02bb4d8a63df73851276 Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Fri, 9 Jan 2026 18:37:35 +0200 Subject: [PATCH 04/30] Fix NoneType error when all streams are blocked --- .../sources/concurrent_source/concurrent_read_processor.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py index d2de2585b..16183871d 100644 --- a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +++ b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py @@ -129,7 +129,9 @@ def on_partition_generation_completed( ): yield from self._on_stream_is_done(stream_name) if self._stream_instances_to_start_partition_generation: - yield self.start_next_partition_generator() # type:ignore # None may be yielded + status_message = self.start_next_partition_generator() + if status_message: + yield status_message def on_partition(self, partition: Partition) -> None: """ From 06abd4189b5f21ed0070ef91bdd84a7fd59f9d93 Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Mon, 12 Jan 2026 23:10:50 +0200 Subject: [PATCH 05/30] Fix unit tests --- .../test_concurrent_read_processor.py | 53 ++++++++++++++++--- 1 file changed, 47 insertions(+), 6 deletions(-) diff --git a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py index bd6114984..15ca8205c 100644 --- a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +++ b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py @@ 
-1096,7 +1096,7 @@ def test_blocked_stream_added_to_end_of_queue(self): assert handler._stream_instances_to_start_partition_generation[1] == stream1 def test_no_defer_when_flag_false(self): - """Test that blocking doesn't occur when block_simultaneous_read=""" "" + """Test that blocking doesn't occur when block_simultaneous_read="" """ stream = self._create_mock_stream("stream1", block_simultaneous_read="") handler = ConcurrentReadProcessor( @@ -1247,14 +1247,55 @@ def test_multiple_blocked_streams_retry_in_order(self): result = handler.start_next_partition_generator() assert result is not None assert "parent" in handler._active_stream_names + assert "api_group" in handler._active_groups + assert "parent" in handler._active_groups["api_group"] - # Try to start child1 (should be deferred) + # Try to start next stream (child1) - should be deferred because parent is active result = handler.start_next_partition_generator() - # child1 is deferred, but child2 might start if it's not blocked - # Let me check the queue state + assert result is None # child1 was deferred - # Both children should be deferred (parent is active) - assert len(handler._stream_instances_to_start_partition_generation) >= 1 + # After first deferral, we should still have 2 streams in queue (child1 moved to end) + assert len(handler._stream_instances_to_start_partition_generation) == 2 + # child1 was moved to the back, so the queue has the other child first + queue_streams = handler._stream_instances_to_start_partition_generation + assert child1 in queue_streams + assert child2 in queue_streams + + # Try to start next stream (child2) - should also be deferred + result = handler.start_next_partition_generator() + assert result is None # child2 was deferred + + # Both streams still in queue, but order may have changed + assert len(handler._stream_instances_to_start_partition_generation) == 2 + + # Verify neither child is active yet (both blocked by parent) + assert "child1" not in 
handler._active_stream_names + assert "child2" not in handler._active_stream_names + + # Verify deferral was logged for both children + logger_calls = [str(call) for call in self._logger.info.call_args_list] + assert any("Deferring stream 'child1'" in call for call in logger_calls) + assert any("Deferring stream 'child2'" in call for call in logger_calls) + + # Simulate parent completing partition generation (parent has no partitions, so it's done) + handler._streams_currently_generating_partitions.append("parent") + handler._streams_to_running_partitions["parent"] = set() + sentinel = PartitionGenerationCompletedSentinel(parent) + list(handler.on_partition_generation_completed(sentinel)) + + # After parent completes, one of the children should start (whichever was first in queue) + # We know at least one child started because the queue shrunk + assert len(handler._stream_instances_to_start_partition_generation) == 1 + + # Verify that exactly one child is now active + children_active = [ + name for name in ["child1", "child2"] + if name in handler._active_stream_names + ] + assert len(children_active) == 1, f"Expected exactly one child active, got: {children_active}" + + # Parent should be re-activated because the active child needs to read from it + assert "parent" in handler._active_stream_names def test_child_without_flag_blocked_by_parent_with_flag(self): """Test that a child WITHOUT block_simultaneous_read is blocked by parent WITH the flag""" From 61c10722a5f50038de5413937f268697b25bb305 Mon Sep 17 00:00:00 2001 From: octavia-squidington-iii Date: Mon, 12 Jan 2026 21:13:12 +0000 Subject: [PATCH 06/30] Auto-fix lint and format issues --- .../streams/concurrent/test_concurrent_read_processor.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py index 15ca8205c..9b554c195 100644 --- 
a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +++ b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py @@ -1289,10 +1289,11 @@ def test_multiple_blocked_streams_retry_in_order(self): # Verify that exactly one child is now active children_active = [ - name for name in ["child1", "child2"] - if name in handler._active_stream_names + name for name in ["child1", "child2"] if name in handler._active_stream_names ] - assert len(children_active) == 1, f"Expected exactly one child active, got: {children_active}" + assert len(children_active) == 1, ( + f"Expected exactly one child active, got: {children_active}" + ) # Parent should be re-activated because the active child needs to read from it assert "parent" in handler._active_stream_names From 1c1555b9cf35c06ff3acee0b66a2149a4cc217a9 Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Tue, 13 Jan 2026 16:53:33 +0200 Subject: [PATCH 07/30] Add retry deferred streams on stream completion --- .../concurrent_read_processor.py | 6 + .../test_concurrent_read_processor.py | 112 ++++++++++++++++++ 2 files changed, 118 insertions(+) diff --git a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py index 16183871d..95bae264c 100644 --- a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +++ b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py @@ -159,6 +159,7 @@ def on_partition_complete_sentinel( 1. Close the partition 2. If the stream is done, mark it as such and return a stream status message 3. Emit messages that were added to the message repository + 4. 
If there are more streams to read from, start the next partition generator """ partition = sentinel.partition @@ -171,6 +172,11 @@ def on_partition_complete_sentinel( and len(partitions_running) == 0 ): yield from self._on_stream_is_done(partition.stream_name()) + # Try to start the next stream in the queue (may be a deferred stream) + if self._stream_instances_to_start_partition_generation: + status_message = self.start_next_partition_generator() + if status_message: + yield status_message yield from self._message_repository.consume_queue() def on_record(self, record: Record) -> Iterable[AirbyteMessage]: diff --git a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py index 9b554c195..146f08412 100644 --- a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +++ b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py @@ -1397,3 +1397,115 @@ def test_unrelated_streams_in_same_group_block_each_other(self): "Deferring stream 'stream2'" in str(call) and "shared_endpoint" in str(call) for call in self._logger.info.call_args_list ) + + def test_child_starts_after_parent_completes_via_partition_complete_sentinel(self): + """Test that child stream starts after parent completes via on_partition_complete_sentinel""" + parent = self._create_mock_stream("parent", block_simultaneous_read="api_group") + child = self._create_mock_stream_with_parent( + "child", parent, block_simultaneous_read="api_group" + ) + + handler = ConcurrentReadProcessor( + [parent, child], + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + # Start parent + handler.start_next_partition_generator() + assert "parent" in handler._active_stream_names + + # Try to start child (should be deferred) + result = handler.start_next_partition_generator() + assert result is None + 
assert "child" not in handler._active_stream_names + assert len(handler._stream_instances_to_start_partition_generation) == 1 + + # Create a partition for parent and add it to running partitions + # (parent is already in _streams_currently_generating_partitions from start_next_partition_generator) + mock_partition = Mock(spec=Partition) + mock_partition.stream_name.return_value = "parent" + handler._streams_to_running_partitions["parent"].add(mock_partition) + + # Complete partition generation for parent + sentinel_gen = PartitionGenerationCompletedSentinel(parent) + list(handler.on_partition_generation_completed(sentinel_gen)) + + # Now complete the partition (this triggers stream done) + sentinel_complete = PartitionCompleteSentinel(mock_partition) + messages = list(handler.on_partition_complete_sentinel(sentinel_complete)) + + # Child should have been started automatically + assert "child" in handler._active_stream_names + assert len(handler._stream_instances_to_start_partition_generation) == 0 + + # Verify a STARTED message was emitted for child + started_messages = [ + msg + for msg in messages + if msg.type == MessageType.TRACE + and msg.trace.stream_status + and msg.trace.stream_status.status == AirbyteStreamStatus.STARTED + ] + assert len(started_messages) == 1 + assert started_messages[0].trace.stream_status.stream_descriptor.name == "child" + + def test_child_starts_after_parent_completes_via_partition_complete_sentinel(self): + """Test that child stream starts after parent completes via on_partition_complete_sentinel""" + parent = self._create_mock_stream("parent", block_simultaneous_read="api_group") + child = self._create_mock_stream_with_parent( + "child", parent, block_simultaneous_read="api_group" + ) + + handler = ConcurrentReadProcessor( + [parent, child], + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + # Start parent + 
handler.start_next_partition_generator() + assert "parent" in handler._active_stream_names + + # Try to start child (should be deferred) + result = handler.start_next_partition_generator() + assert result is None + assert "child" not in handler._active_stream_names + assert len(handler._stream_instances_to_start_partition_generation) == 1 + + # Create a partition for parent and add it to running partitions + # (parent is already in _streams_currently_generating_partitions from start_next_partition_generator) + mock_partition = Mock(spec=Partition) + mock_partition.stream_name.return_value = "parent" + handler._streams_to_running_partitions["parent"].add(mock_partition) + + # Complete partition generation for parent + sentinel_gen = PartitionGenerationCompletedSentinel(parent) + list(handler.on_partition_generation_completed(sentinel_gen)) + + # Now complete the partition (this triggers stream done) + sentinel_complete = PartitionCompleteSentinel(mock_partition) + messages = list(handler.on_partition_complete_sentinel(sentinel_complete)) + + # Child should have been started automatically + assert "child" in handler._active_stream_names + assert len(handler._stream_instances_to_start_partition_generation) == 0 + + # Verify a STARTED message was emitted for child + started_messages = [ + msg + for msg in messages + if msg.type == MessageType.TRACE + and msg.trace.stream_status + and msg.trace.stream_status.status == AirbyteStreamStatus.STARTED + ] + assert len(started_messages) == 1 + assert started_messages[0].trace.stream_status.stream_descriptor.name == "child" From b3a98f414f181d9087cbaf2ee6343221dde12b5e Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Tue, 13 Jan 2026 17:44:32 +0200 Subject: [PATCH 08/30] Fix unit tests --- .../test_concurrent_read_processor.py | 58 +------------------ 1 file changed, 1 insertion(+), 57 deletions(-) diff --git a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py 
b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py index 146f08412..7ecf3b48d 100644 --- a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +++ b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py @@ -996,7 +996,7 @@ def test_defer_stream_when_grandparent_active(self): # Child should be back in the queue assert len(handler._stream_instances_to_start_partition_generation) == 1 - def test_retry_blocked_stream_after_blocker_done(self): + def test_different_groups_do_not_block_each_other(self): """Test that independent streams with different groups don't block each other""" stream1 = self._create_mock_stream("stream1", block_simultaneous_read="group1") stream2 = self._create_mock_stream("stream2", block_simultaneous_read="group2") @@ -1453,59 +1453,3 @@ def test_child_starts_after_parent_completes_via_partition_complete_sentinel(sel ] assert len(started_messages) == 1 assert started_messages[0].trace.stream_status.stream_descriptor.name == "child" - - def test_child_starts_after_parent_completes_via_partition_complete_sentinel(self): - """Test that child stream starts after parent completes via on_partition_complete_sentinel""" - parent = self._create_mock_stream("parent", block_simultaneous_read="api_group") - child = self._create_mock_stream_with_parent( - "child", parent, block_simultaneous_read="api_group" - ) - - handler = ConcurrentReadProcessor( - [parent, child], - self._partition_enqueuer, - self._thread_pool_manager, - self._logger, - self._slice_logger, - self._message_repository, - self._partition_reader, - ) - - # Start parent - handler.start_next_partition_generator() - assert "parent" in handler._active_stream_names - - # Try to start child (should be deferred) - result = handler.start_next_partition_generator() - assert result is None - assert "child" not in handler._active_stream_names - assert len(handler._stream_instances_to_start_partition_generation) == 1 - - # Create a partition 
for parent and add it to running partitions - # (parent is already in _streams_currently_generating_partitions from start_next_partition_generator) - mock_partition = Mock(spec=Partition) - mock_partition.stream_name.return_value = "parent" - handler._streams_to_running_partitions["parent"].add(mock_partition) - - # Complete partition generation for parent - sentinel_gen = PartitionGenerationCompletedSentinel(parent) - list(handler.on_partition_generation_completed(sentinel_gen)) - - # Now complete the partition (this triggers stream done) - sentinel_complete = PartitionCompleteSentinel(mock_partition) - messages = list(handler.on_partition_complete_sentinel(sentinel_complete)) - - # Child should have been started automatically - assert "child" in handler._active_stream_names - assert len(handler._stream_instances_to_start_partition_generation) == 0 - - # Verify a STARTED message was emitted for child - started_messages = [ - msg - for msg in messages - if msg.type == MessageType.TRACE - and msg.trace.stream_status - and msg.trace.stream_status.status == AirbyteStreamStatus.STARTED - ] - assert len(started_messages) == 1 - assert started_messages[0].trace.stream_status.stream_descriptor.name == "child" From 0377b802bb2e3c2dba9fd87e7d54c6e265d4c203 Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Tue, 13 Jan 2026 18:26:18 +0200 Subject: [PATCH 09/30] More fixes for unit tests --- .../streams/concurrent/test_concurrent_read_processor.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py index 7ecf3b48d..a8db1e65f 100644 --- a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +++ b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py @@ -1045,10 +1045,6 @@ def test_retry_blocked_stream_after_partition_generation(self): handler.start_next_partition_generator() assert "parent" 
in handler._active_stream_names - # Mark parent as generating partitions and having no partitions - handler._streams_currently_generating_partitions.append("parent") - handler._streams_to_running_partitions["parent"] = set() - # Complete partition generation for parent (parent has no partitions, so it's done) sentinel = PartitionGenerationCompletedSentinel(parent) messages = list(handler.on_partition_generation_completed(sentinel)) @@ -1278,8 +1274,6 @@ def test_multiple_blocked_streams_retry_in_order(self): assert any("Deferring stream 'child2'" in call for call in logger_calls) # Simulate parent completing partition generation (parent has no partitions, so it's done) - handler._streams_currently_generating_partitions.append("parent") - handler._streams_to_running_partitions["parent"] = set() sentinel = PartitionGenerationCompletedSentinel(parent) list(handler.on_partition_generation_completed(sentinel)) From 8acab483bfc8af76133296161e0e9aa9f4cbbef6 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 25 Feb 2026 14:48:05 +0000 Subject: [PATCH 10/30] refactor: replace per-stream block_simultaneous_read with top-level stream_groups - Remove block_simultaneous_read property from DeclarativeStream schema - Add top-level stream_groups with StreamGroup and BlockSimultaneousSyncsAction - ConcurrentDeclarativeSource parses stream_groups and injects block_simultaneous_read into stream configs before factory processing - Internal blocking logic in ConcurrentReadProcessor unchanged - Update tests for new interface Co-Authored-By: unknown <> --- .../concurrent_declarative_source.py | 42 ++++++++++- .../declarative_component_schema.yaml | 66 +++++++++++------ .../models/declarative_component_schema.py | 32 +++++++-- .../parsers/model_to_component_factory.py | 2 +- .../test_model_to_component_factory.py | 14 ++-- .../test_concurrent_declarative_source.py | 71 +++++++++++++++++++ 6 files changed, 196 insertions(+), 31 
deletions(-) diff --git a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py index 45fe6aa2d..cdd86fda9 100644 --- a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py +++ b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py @@ -405,6 +405,14 @@ def streams(self, config: Mapping[str, Any]) -> List[AbstractStream]: # type: i if api_budget_model: self._constructor.set_api_budget(api_budget_model, self._config) + stream_name_to_group = self._build_stream_name_to_group(self._source_config) + + prepared_configs = self._initialize_cache_for_parent_streams(deepcopy(stream_configs)) + for stream_config in prepared_configs: + stream_name = stream_config.get("name", "") + if stream_name in stream_name_to_group: + stream_config["block_simultaneous_read"] = stream_name_to_group[stream_name] + source_streams = [ self._constructor.create_component( ( @@ -416,7 +424,7 @@ def streams(self, config: Mapping[str, Any]) -> List[AbstractStream]: # type: i self._config, emit_connector_builder_messages=self._emit_connector_builder_messages, ) - for stream_config in self._initialize_cache_for_parent_streams(deepcopy(stream_configs)) + for stream_config in prepared_configs ] return source_streams @@ -526,6 +534,38 @@ def dynamic_streams(self) -> List[Dict[str, Any]]: with_dynamic_stream_name=True, ) + @staticmethod + def _build_stream_name_to_group(manifest: Mapping[str, Any]) -> Dict[str, str]: + """Build a mapping from stream name to group name based on the stream_groups manifest config. + + After manifest reference resolution, each stream reference in stream_groups.streams + is resolved to the full stream definition dict containing a 'name' field. + + Returns: + A dict mapping stream name -> group name for streams that belong to a group. 
+ """ + stream_name_to_group: Dict[str, str] = {} + stream_groups = manifest.get("stream_groups", {}) + if not stream_groups: + return stream_name_to_group + + for group_name, group_config in stream_groups.items(): + streams = group_config.get("streams", []) + for stream_ref in streams: + if isinstance(stream_ref, dict): + # After reference resolution, stream_ref is a full stream definition dict + stream_name = stream_ref.get("name", "") + if stream_name: + stream_name_to_group[stream_name] = group_name + elif isinstance(stream_ref, str): + # If not resolved (shouldn't happen normally), extract name from ref path + # e.g., "#/definitions/my_stream" -> "my_stream" + if stream_ref.startswith("#/definitions/"): + stream_name = stream_ref.split("/")[-1] + stream_name_to_group[stream_name] = group_name + + return stream_name_to_group + def _stream_configs(self, manifest: Mapping[str, Any]) -> List[Dict[str, Any]]: # This has a warning flag for static, but after we finish part 4 we'll replace manifest with self._source_config stream_configs = [] diff --git a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml index 054bbdeaf..9e28a74af 100644 --- a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml +++ b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml @@ -45,6 +45,15 @@ properties: "$ref": "#/definitions/ConcurrencyLevel" api_budget: "$ref": "#/definitions/HTTPAPIBudget" + stream_groups: + title: Stream Groups + description: > + Groups of streams that share a common resource and should not be read simultaneously. + Each group defines a set of stream references and an action that controls how concurrent + reads are managed. Only applies to ConcurrentDeclarativeSource. 
+ type: object + additionalProperties: + "$ref": "#/definitions/StreamGroup" max_concurrent_async_job_count: title: Maximum Concurrent Asynchronous Jobs description: Maximum number of concurrent asynchronous jobs to run. This property is only relevant for sources/streams that support asynchronous job execution through the AsyncRetriever (e.g. a report-based stream that initiates a job, polls the job status, and then fetches the job results). This is often set by the API's maximum number of concurrent jobs on the account level. Refer to the API's documentation for this information. @@ -63,6 +72,43 @@ properties: description: A description of the connector. It will be presented on the Source documentation page. additionalProperties: false definitions: + StreamGroup: + title: Stream Group + description: > + A group of streams that share a common resource and should not be read simultaneously. + Streams in the same group will be blocked from concurrent reads based on the specified action. + type: object + required: + - streams + - action + properties: + streams: + title: Streams + description: > + List of references to streams that belong to this group. Use JSON references + to stream definitions (e.g., "#/definitions/my_stream"). + type: array + items: + type: string + action: + title: Action + description: The action to apply to streams in this group. + "$ref": "#/definitions/BlockSimultaneousSyncsAction" + BlockSimultaneousSyncsAction: + title: Block Simultaneous Syncs Action + description: > + Action that prevents streams in the same group from being read concurrently. + When applied to a stream group, streams with this action will be deferred if + another stream in the same group is currently active. + This is useful for APIs that don't allow concurrent access to the same + endpoint or session. Only applies to ConcurrentDeclarativeSource. 
+ type: object + required: + - type + properties: + type: + type: string + enum: [BlockSimultaneousSyncsAction] AddedFieldDefinition: title: Definition Of Field To Add description: Defines the field to add on a record. @@ -1560,26 +1606,6 @@ definitions: default: "" example: - "Users" - block_simultaneous_read: - title: Block Simultaneous Read - description: > - Optional group name for blocking simultaneous reads. Streams with the same - block_simultaneous_read value will not be read concurrently. This prevents - duplicate API calls when a stream is used as both a standalone stream and a - parent stream, or when multiple streams share the same endpoint/session. - - If set to a non-empty string, the stream will be deferred if: - 1. Another stream in the same group is currently active - 2. Any parent stream is in an active group - - Examples: - - "issues_endpoint" - All streams with this value block each other - - "" or null - No blocking (default) - - This is useful for APIs that don't allow concurrent access to the same - endpoint or session. Only applies to ConcurrentDeclarativeSource. - type: string - default: "" retriever: title: Retriever description: Component used to coordinate how records are extracted across stream slices and request pages. diff --git a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py index 3246d811b..4175bf364 100644 --- a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py +++ b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py @@ -2354,6 +2354,23 @@ class Config: ) +class BlockSimultaneousSyncsAction(BaseModel): + type: Literal["BlockSimultaneousSyncsAction"] + + +class StreamGroup(BaseModel): + streams: List[str] = Field( + ..., + description='List of references to streams that belong to this group. 
Use JSON references to stream definitions (e.g., "#/definitions/my_stream").', + title="Streams", + ) + action: BlockSimultaneousSyncsAction = Field( + ..., + description="The action to apply to streams in this group.", + title="Action", + ) + + class Spec(BaseModel): type: Literal["Spec"] connection_specification: Dict[str, Any] = Field( @@ -2394,6 +2411,11 @@ class Config: spec: Optional[Spec] = None concurrency_level: Optional[ConcurrencyLevel] = None api_budget: Optional[HTTPAPIBudget] = None + stream_groups: Optional[Dict[str, StreamGroup]] = Field( + None, + description="Groups of streams that share a common resource and should not be read simultaneously. Each group defines a set of stream references and an action that controls how concurrent reads are managed. Only applies to ConcurrentDeclarativeSource.", + title="Stream Groups", + ) max_concurrent_async_job_count: Optional[Union[int, str]] = Field( None, description="Maximum number of concurrent asynchronous jobs to run. This property is only relevant for sources/streams that support asynchronous job execution through the AsyncRetriever (e.g. a report-based stream that initiates a job, polls the job status, and then fetches the job results). This is often set by the API's maximum number of concurrent jobs on the account level. Refer to the API's documentation for this information.", @@ -2429,6 +2451,11 @@ class Config: spec: Optional[Spec] = None concurrency_level: Optional[ConcurrencyLevel] = None api_budget: Optional[HTTPAPIBudget] = None + stream_groups: Optional[Dict[str, StreamGroup]] = Field( + None, + description="Groups of streams that share a common resource and should not be read simultaneously. Each group defines a set of stream references and an action that controls how concurrent reads are managed. 
Only applies to ConcurrentDeclarativeSource.", + title="Stream Groups", + ) max_concurrent_async_job_count: Optional[Union[int, str]] = Field( None, description="Maximum number of concurrent asynchronous jobs to run. This property is only relevant for sources/streams that support asynchronous job execution through the AsyncRetriever (e.g. a report-based stream that initiates a job, polls the job status, and then fetches the job results). This is often set by the API's maximum number of concurrent jobs on the account level. Refer to the API's documentation for this information.", @@ -2544,11 +2571,6 @@ class Config: type: Literal["DeclarativeStream"] name: Optional[str] = Field("", description="The stream name.", example=["Users"], title="Name") - block_simultaneous_read: Optional[str] = Field( - "", - description='Optional group name for blocking simultaneous reads. Streams with the same block_simultaneous_read value will not be read concurrently. This prevents duplicate API calls when a stream is used as both a standalone stream and a parent stream, or when multiple streams share the same endpoint/session.\nIf set to a non-empty string, the stream will be deferred if: 1. Another stream in the same group is currently active 2. Any parent stream is in an active group\nExamples: - "issues_endpoint" - All streams with this value block each other - "" or null - No blocking (default)\nThis is useful for APIs that don\'t allow concurrent access to the same endpoint or session. 
Only applies to ConcurrentDeclarativeSource.\n', - title="Block Simultaneous Read", - ) retriever: Union[SimpleRetriever, AsyncRetriever, CustomRetriever] = Field( ..., description="Component used to coordinate how records are extracted across stream slices and request pages.", diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 27c750ffa..41b134982 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -2145,7 +2145,7 @@ def create_default_stream( logger=logging.getLogger(f"airbyte.{stream_name}"), cursor=concurrent_cursor, supports_file_transfer=hasattr(model, "file_uploader") and bool(model.file_uploader), - block_simultaneous_read=model.block_simultaneous_read or "", + block_simultaneous_read=getattr(model, "block_simultaneous_read", "") or "", ) def _migrate_state(self, model: DeclarativeStreamModel, config: Config) -> None: diff --git a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py index bc65f578e..91b17fc33 100644 --- a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +++ b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py @@ -5281,8 +5281,14 @@ def test_catalog_defined_cursor_field_stream_missing(): assert stream._cursor_field.supports_catalog_defined_cursor_field == True -def test_block_simultaneous_read_from_manifest(): - """Test that block_simultaneous_read flows through from manifest to DefaultStream""" +def test_block_simultaneous_read_from_stream_groups(): + """Test that block_simultaneous_read flows through from stream_groups to DefaultStream. 
+ + The stream_groups config is processed by ConcurrentDeclarativeSource which injects + block_simultaneous_read into individual stream configs before passing them to the factory. + This test verifies that the factory correctly reads block_simultaneous_read from the + extra fields on the stream config dict. + """ content = """ parent_stream: type: DeclarativeStream @@ -5382,7 +5388,7 @@ def test_block_simultaneous_read_from_manifest(): parsed_manifest = YamlDeclarativeSource._parse(content) resolved_manifest = resolver.preprocess_manifest(parsed_manifest) - # Test parent stream with block_simultaneous_read: true + # Test parent stream with block_simultaneous_read injected (as ConcurrentDeclarativeSource would do) parent_manifest = transformer.propagate_types_and_parameters( "", resolved_manifest["parent_stream"], {} ) @@ -5394,7 +5400,7 @@ def test_block_simultaneous_read_from_manifest(): assert parent_stream.name == "parent" assert parent_stream.block_simultaneous_read == "issues_endpoint" - # Test child stream with block_simultaneous_read: "issues_endpoint" + # Test child stream with block_simultaneous_read injected child_manifest = transformer.propagate_types_and_parameters( "", resolved_manifest["child_stream"], {} ) diff --git a/unit_tests/sources/declarative/test_concurrent_declarative_source.py b/unit_tests/sources/declarative/test_concurrent_declarative_source.py index bde6c35b1..3f10b9eb2 100644 --- a/unit_tests/sources/declarative/test_concurrent_declarative_source.py +++ b/unit_tests/sources/declarative/test_concurrent_declarative_source.py @@ -5150,3 +5150,74 @@ def test_given_record_selector_is_filtering_when_read_then_raise_error(): with pytest.raises(ValueError): list(source.read(logger=source.logger, config=input_config, catalog=catalog, state=[])) + + +@pytest.mark.parametrize( + "manifest,expected", + [ + pytest.param( + {}, + {}, + id="no_stream_groups", + ), + pytest.param( + {"stream_groups": {}}, + {}, + id="empty_stream_groups", + ), + 
pytest.param( + { + "stream_groups": { + "crm_objects": { + "streams": [ + {"name": "deals", "type": "DeclarativeStream"}, + {"name": "companies", "type": "DeclarativeStream"}, + ], + "action": {"type": "BlockSimultaneousSyncsAction"}, + } + } + }, + {"deals": "crm_objects", "companies": "crm_objects"}, + id="resolved_stream_refs", + ), + pytest.param( + { + "stream_groups": { + "group_a": { + "streams": [ + {"name": "stream1", "type": "DeclarativeStream"}, + ], + "action": {"type": "BlockSimultaneousSyncsAction"}, + }, + "group_b": { + "streams": [ + {"name": "stream2", "type": "DeclarativeStream"}, + {"name": "stream3", "type": "DeclarativeStream"}, + ], + "action": {"type": "BlockSimultaneousSyncsAction"}, + }, + } + }, + {"stream1": "group_a", "stream2": "group_b", "stream3": "group_b"}, + id="multiple_groups", + ), + pytest.param( + { + "stream_groups": { + "fallback_group": { + "streams": [ + "#/definitions/my_stream", + ], + "action": {"type": "BlockSimultaneousSyncsAction"}, + } + } + }, + {"my_stream": "fallback_group"}, + id="unresolved_string_refs_fallback", + ), + ], +) +def test_build_stream_name_to_group(manifest, expected): + """Test _build_stream_name_to_group correctly maps stream names to group names.""" + result = ConcurrentDeclarativeSource._build_stream_name_to_group(manifest) + assert result == expected From c33a61b13b23e97bde96bfbb1d97b3d277e70e74 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 25 Feb 2026 15:23:18 +0000 Subject: [PATCH 11/30] refactor: move stream_name_to_group into ModelToComponentFactory - Add stream_name_to_group parameter to ModelToComponentFactory.__init__() - Add set_stream_name_to_group() method for post-init configuration - Factory now looks up block_simultaneous_read from its own mapping - Remove config injection hack from ConcurrentDeclarativeSource.streams() - Update tests to use factory-based approach instead of extra fields Co-Authored-By: unknown 
<> --- .../concurrent_declarative_source.py | 8 +- .../parsers/model_to_component_factory.py | 8 +- .../test_model_to_component_factory.py | 82 ++++++++++++++++--- 3 files changed, 80 insertions(+), 18 deletions(-) diff --git a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py index cdd86fda9..c0ab9e910 100644 --- a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py +++ b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py @@ -405,13 +405,11 @@ def streams(self, config: Mapping[str, Any]) -> List[AbstractStream]: # type: i if api_budget_model: self._constructor.set_api_budget(api_budget_model, self._config) - stream_name_to_group = self._build_stream_name_to_group(self._source_config) + self._constructor.set_stream_name_to_group( + self._build_stream_name_to_group(self._source_config) + ) prepared_configs = self._initialize_cache_for_parent_streams(deepcopy(stream_configs)) - for stream_config in prepared_configs: - stream_name = stream_config.get("name", "") - if stream_name in stream_name_to_group: - stream_config["block_simultaneous_read"] = stream_name_to_group[stream_name] source_streams = [ self._constructor.create_component( diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 41b134982..0faf1b4db 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -689,6 +689,7 @@ def __init__( max_concurrent_async_job_count: Optional[int] = None, configured_catalog: Optional[ConfiguredAirbyteCatalog] = None, api_budget: Optional[APIBudget] = None, + stream_name_to_group: Optional[Dict[str, str]] = None, ): self._init_mappings() self._limit_pages_fetched_per_slice = limit_pages_fetched_per_slice @@ -705,9 +706,14 @@ def __init__( 
self._connector_state_manager = connector_state_manager or ConnectorStateManager() self._api_budget: Optional[Union[APIBudget]] = api_budget self._job_tracker: JobTracker = JobTracker(max_concurrent_async_job_count or 1) + self._stream_name_to_group: Dict[str, str] = stream_name_to_group or {} # placeholder for deprecation warnings self._collected_deprecation_logs: List[ConnectorBuilderLogMessage] = [] + def set_stream_name_to_group(self, stream_name_to_group: Dict[str, str]) -> None: + """Set the mapping from stream name to group name for block_simultaneous_read.""" + self._stream_name_to_group = stream_name_to_group + def _init_mappings(self) -> None: self.PYDANTIC_MODEL_TO_CONSTRUCTOR: Mapping[Type[BaseModel], Callable[..., Any]] = { AddedFieldDefinitionModel: self.create_added_field_definition, @@ -2145,7 +2151,7 @@ def create_default_stream( logger=logging.getLogger(f"airbyte.{stream_name}"), cursor=concurrent_cursor, supports_file_transfer=hasattr(model, "file_uploader") and bool(model.file_uploader), - block_simultaneous_read=getattr(model, "block_simultaneous_read", "") or "", + block_simultaneous_read=self._stream_name_to_group.get(stream_name, ""), ) def _migrate_state(self, model: DeclarativeStreamModel, config: Config) -> None: diff --git a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py index 91b17fc33..00b9b9270 100644 --- a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +++ b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py @@ -5282,19 +5282,17 @@ def test_catalog_defined_cursor_field_stream_missing(): def test_block_simultaneous_read_from_stream_groups(): - """Test that block_simultaneous_read flows through from stream_groups to DefaultStream. + """Test that block_simultaneous_read flows through from stream_name_to_group to DefaultStream. 
- The stream_groups config is processed by ConcurrentDeclarativeSource which injects - block_simultaneous_read into individual stream configs before passing them to the factory. - This test verifies that the factory correctly reads block_simultaneous_read from the - extra fields on the stream config dict. + The stream_groups config is parsed by ConcurrentDeclarativeSource into a stream_name_to_group + mapping, which is then set on the ModelToComponentFactory. The factory uses this mapping to + look up the group for each stream it creates. """ content = """ parent_stream: type: DeclarativeStream name: "parent" primary_key: "id" - block_simultaneous_read: "issues_endpoint" retriever: type: SimpleRetriever requester: @@ -5322,7 +5320,6 @@ def test_block_simultaneous_read_from_stream_groups(): type: DeclarativeStream name: "child" primary_key: "id" - block_simultaneous_read: "issues_endpoint" retriever: type: SimpleRetriever requester: @@ -5385,14 +5382,19 @@ def test_block_simultaneous_read_from_stream_groups(): config = {"api_key": "test_key"} + # Create a factory with stream_name_to_group mapping (as ConcurrentDeclarativeSource would do) + factory_with_groups = ModelToComponentFactory( + stream_name_to_group={"parent": "issues_endpoint", "child": "issues_endpoint"} + ) + parsed_manifest = YamlDeclarativeSource._parse(content) resolved_manifest = resolver.preprocess_manifest(parsed_manifest) - # Test parent stream with block_simultaneous_read injected (as ConcurrentDeclarativeSource would do) + # Test parent stream gets block_simultaneous_read from the factory's stream_name_to_group parent_manifest = transformer.propagate_types_and_parameters( "", resolved_manifest["parent_stream"], {} ) - parent_stream: DefaultStream = factory.create_component( + parent_stream: DefaultStream = factory_with_groups.create_component( model_type=DeclarativeStreamModel, component_definition=parent_manifest, config=config ) @@ -5400,11 +5402,11 @@ def 
test_block_simultaneous_read_from_stream_groups(): assert parent_stream.name == "parent" assert parent_stream.block_simultaneous_read == "issues_endpoint" - # Test child stream with block_simultaneous_read injected + # Test child stream gets block_simultaneous_read from the factory's stream_name_to_group child_manifest = transformer.propagate_types_and_parameters( "", resolved_manifest["child_stream"], {} ) - child_stream: DefaultStream = factory.create_component( + child_stream: DefaultStream = factory_with_groups.create_component( model_type=DeclarativeStreamModel, component_definition=child_manifest, config=config ) @@ -5416,7 +5418,7 @@ def test_block_simultaneous_read_from_stream_groups(): no_block_manifest = transformer.propagate_types_and_parameters( "", resolved_manifest["no_block_stream"], {} ) - no_block_stream: DefaultStream = factory.create_component( + no_block_stream: DefaultStream = factory_with_groups.create_component( model_type=DeclarativeStreamModel, component_definition=no_block_manifest, config=config ) @@ -5425,6 +5427,62 @@ def test_block_simultaneous_read_from_stream_groups(): assert no_block_stream.block_simultaneous_read == "" +def test_set_stream_name_to_group(): + """Test that set_stream_name_to_group updates the factory's stream_name_to_group mapping.""" + content = """ + test_stream: + type: DeclarativeStream + name: "test" + primary_key: "id" + retriever: + type: SimpleRetriever + requester: + type: HttpRequester + url_base: "https://api.example.com" + path: "/test" + http_method: "GET" + authenticator: + type: BearerAuthenticator + api_token: "{{ config['api_key'] }}" + record_selector: + type: RecordSelector + extractor: + type: DpathExtractor + field_path: [] + schema_loader: + type: InlineSchemaLoader + schema: + type: object + properties: + id: + type: string + """ + + config = {"api_key": "test_key"} + + # Create factory without stream_name_to_group + test_factory = ModelToComponentFactory() + + parsed_manifest = 
YamlDeclarativeSource._parse(content) + resolved_manifest = resolver.preprocess_manifest(parsed_manifest) + stream_manifest = transformer.propagate_types_and_parameters( + "", resolved_manifest["test_stream"], {} + ) + + # Without stream_name_to_group, block_simultaneous_read should be empty + stream: DefaultStream = test_factory.create_component( + model_type=DeclarativeStreamModel, component_definition=stream_manifest, config=config + ) + assert stream.block_simultaneous_read == "" + + # After setting stream_name_to_group, block_simultaneous_read should be populated + test_factory.set_stream_name_to_group({"test": "my_group"}) + stream = test_factory.create_component( + model_type=DeclarativeStreamModel, component_definition=stream_manifest, config=config + ) + assert stream.block_simultaneous_read == "my_group" + + def get_schema_loader(stream: DefaultStream): assert isinstance( stream._stream_partition_generator._partition_factory._schema_loader, From 8efc56c98e825675be1fe2be05f9e4024eb58458 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 25 Feb 2026 16:26:09 +0000 Subject: [PATCH 12/30] refactor: use stream_groups manifest in factory test instead of hardcoded dict - Test now defines stream_groups with references in the manifest YAML - Uses _build_stream_name_to_group() to derive the mapping from manifest - Removed test_set_stream_name_to_group (redundant with the manifest-based test) - Added ConcurrentDeclarativeSource import for _build_stream_name_to_group Co-Authored-By: unknown <> --- .../test_model_to_component_factory.py | 273 +++++++----------- 1 file changed, 111 insertions(+), 162 deletions(-) diff --git a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py index 00b9b9270..ddedf8221 100644 --- a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +++ 
b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py @@ -48,6 +48,9 @@ ) from airbyte_cdk.sources.declarative.checks import CheckStream from airbyte_cdk.sources.declarative.concurrency_level import ConcurrencyLevel +from airbyte_cdk.sources.declarative.concurrent_declarative_source import ( + ConcurrentDeclarativeSource, +) from airbyte_cdk.sources.declarative.datetime.min_max_datetime import MinMaxDatetime from airbyte_cdk.sources.declarative.decoders import JsonDecoder, PaginationDecoderDecorator from airbyte_cdk.sources.declarative.extractors import DpathExtractor, RecordFilter, RecordSelector @@ -5282,117 +5285,119 @@ def test_catalog_defined_cursor_field_stream_missing(): def test_block_simultaneous_read_from_stream_groups(): - """Test that block_simultaneous_read flows through from stream_name_to_group to DefaultStream. - - The stream_groups config is parsed by ConcurrentDeclarativeSource into a stream_name_to_group - mapping, which is then set on the ModelToComponentFactory. The factory uses this mapping to - look up the group for each stream it creates. 
- """ + """Test that stream_groups in the manifest flow through to DefaultStream.block_simultaneous_read.""" content = """ - parent_stream: - type: DeclarativeStream - name: "parent" - primary_key: "id" - retriever: - type: SimpleRetriever - requester: - type: HttpRequester - url_base: "https://api.example.com" - path: "/parent" - http_method: "GET" - authenticator: - type: BearerAuthenticator - api_token: "{{ config['api_key'] }}" - record_selector: - type: RecordSelector - extractor: - type: DpathExtractor - field_path: [] - schema_loader: - type: InlineSchemaLoader - schema: - type: object - properties: - id: - type: string - - child_stream: - type: DeclarativeStream - name: "child" - primary_key: "id" - retriever: - type: SimpleRetriever - requester: - type: HttpRequester - url_base: "https://api.example.com" - path: "/child" - http_method: "GET" - authenticator: - type: BearerAuthenticator - api_token: "{{ config['api_key'] }}" - record_selector: - type: RecordSelector - extractor: - type: DpathExtractor - field_path: [] - partition_router: - type: SubstreamPartitionRouter - parent_stream_configs: - - type: ParentStreamConfig - stream: "#/parent_stream" - parent_key: "id" - partition_field: "parent_id" - schema_loader: - type: InlineSchemaLoader - schema: - type: object - properties: - id: - type: string - parent_id: - type: string - - no_block_stream: - type: DeclarativeStream - name: "no_block" - primary_key: "id" - retriever: - type: SimpleRetriever - requester: - type: HttpRequester - url_base: "https://api.example.com" - path: "/no_block" - http_method: "GET" - authenticator: - type: BearerAuthenticator - api_token: "{{ config['api_key'] }}" - record_selector: - type: RecordSelector - extractor: - type: DpathExtractor - field_path: [] - schema_loader: - type: InlineSchemaLoader - schema: - type: object - properties: - id: - type: string + definitions: + parent_stream: + type: DeclarativeStream + name: "parent" + primary_key: "id" + retriever: + type: 
SimpleRetriever + requester: + type: HttpRequester + url_base: "https://api.example.com" + path: "/parent" + http_method: "GET" + authenticator: + type: BearerAuthenticator + api_token: "{{ config['api_key'] }}" + record_selector: + type: RecordSelector + extractor: + type: DpathExtractor + field_path: [] + schema_loader: + type: InlineSchemaLoader + schema: + type: object + properties: + id: + type: string + + child_stream: + type: DeclarativeStream + name: "child" + primary_key: "id" + retriever: + type: SimpleRetriever + requester: + type: HttpRequester + url_base: "https://api.example.com" + path: "/child" + http_method: "GET" + authenticator: + type: BearerAuthenticator + api_token: "{{ config['api_key'] }}" + record_selector: + type: RecordSelector + extractor: + type: DpathExtractor + field_path: [] + partition_router: + type: SubstreamPartitionRouter + parent_stream_configs: + - type: ParentStreamConfig + stream: "#/definitions/parent_stream" + parent_key: "id" + partition_field: "parent_id" + schema_loader: + type: InlineSchemaLoader + schema: + type: object + properties: + id: + type: string + parent_id: + type: string + + no_block_stream: + type: DeclarativeStream + name: "no_block" + primary_key: "id" + retriever: + type: SimpleRetriever + requester: + type: HttpRequester + url_base: "https://api.example.com" + path: "/no_block" + http_method: "GET" + authenticator: + type: BearerAuthenticator + api_token: "{{ config['api_key'] }}" + record_selector: + type: RecordSelector + extractor: + type: DpathExtractor + field_path: [] + schema_loader: + type: InlineSchemaLoader + schema: + type: object + properties: + id: + type: string + + stream_groups: + issues_endpoint: + streams: + - "#/definitions/parent_stream" + - "#/definitions/child_stream" + action: BlockSimultaneousSyncsAction """ config = {"api_key": "test_key"} - # Create a factory with stream_name_to_group mapping (as ConcurrentDeclarativeSource would do) - factory_with_groups = 
ModelToComponentFactory( - stream_name_to_group={"parent": "issues_endpoint", "child": "issues_endpoint"} - ) - parsed_manifest = YamlDeclarativeSource._parse(content) resolved_manifest = resolver.preprocess_manifest(parsed_manifest) - # Test parent stream gets block_simultaneous_read from the factory's stream_name_to_group + # Build stream_name_to_group from the manifest's stream_groups (as ConcurrentDeclarativeSource does) + stream_name_to_group = ConcurrentDeclarativeSource._build_stream_name_to_group(resolved_manifest) + factory_with_groups = ModelToComponentFactory(stream_name_to_group=stream_name_to_group) + + # Test parent stream gets block_simultaneous_read from stream_groups parent_manifest = transformer.propagate_types_and_parameters( - "", resolved_manifest["parent_stream"], {} + "", resolved_manifest["definitions"]["parent_stream"], {} ) parent_stream: DefaultStream = factory_with_groups.create_component( model_type=DeclarativeStreamModel, component_definition=parent_manifest, config=config @@ -5402,9 +5407,9 @@ def test_block_simultaneous_read_from_stream_groups(): assert parent_stream.name == "parent" assert parent_stream.block_simultaneous_read == "issues_endpoint" - # Test child stream gets block_simultaneous_read from the factory's stream_name_to_group + # Test child stream gets block_simultaneous_read from stream_groups child_manifest = transformer.propagate_types_and_parameters( - "", resolved_manifest["child_stream"], {} + "", resolved_manifest["definitions"]["child_stream"], {} ) child_stream: DefaultStream = factory_with_groups.create_component( model_type=DeclarativeStreamModel, component_definition=child_manifest, config=config @@ -5414,9 +5419,9 @@ def test_block_simultaneous_read_from_stream_groups(): assert child_stream.name == "child" assert child_stream.block_simultaneous_read == "issues_endpoint" - # Test stream without block_simultaneous_read (should default to empty string) + # Test stream not in any group defaults to empty string 
no_block_manifest = transformer.propagate_types_and_parameters( - "", resolved_manifest["no_block_stream"], {} + "", resolved_manifest["definitions"]["no_block_stream"], {} ) no_block_stream: DefaultStream = factory_with_groups.create_component( model_type=DeclarativeStreamModel, component_definition=no_block_manifest, config=config @@ -5427,62 +5432,6 @@ def test_block_simultaneous_read_from_stream_groups(): assert no_block_stream.block_simultaneous_read == "" -def test_set_stream_name_to_group(): - """Test that set_stream_name_to_group updates the factory's stream_name_to_group mapping.""" - content = """ - test_stream: - type: DeclarativeStream - name: "test" - primary_key: "id" - retriever: - type: SimpleRetriever - requester: - type: HttpRequester - url_base: "https://api.example.com" - path: "/test" - http_method: "GET" - authenticator: - type: BearerAuthenticator - api_token: "{{ config['api_key'] }}" - record_selector: - type: RecordSelector - extractor: - type: DpathExtractor - field_path: [] - schema_loader: - type: InlineSchemaLoader - schema: - type: object - properties: - id: - type: string - """ - - config = {"api_key": "test_key"} - - # Create factory without stream_name_to_group - test_factory = ModelToComponentFactory() - - parsed_manifest = YamlDeclarativeSource._parse(content) - resolved_manifest = resolver.preprocess_manifest(parsed_manifest) - stream_manifest = transformer.propagate_types_and_parameters( - "", resolved_manifest["test_stream"], {} - ) - - # Without stream_name_to_group, block_simultaneous_read should be empty - stream: DefaultStream = test_factory.create_component( - model_type=DeclarativeStreamModel, component_definition=stream_manifest, config=config - ) - assert stream.block_simultaneous_read == "" - - # After setting stream_name_to_group, block_simultaneous_read should be populated - test_factory.set_stream_name_to_group({"test": "my_group"}) - stream = test_factory.create_component( - model_type=DeclarativeStreamModel, 
component_definition=stream_manifest, config=config - ) - assert stream.block_simultaneous_read == "my_group" - - def get_schema_loader(stream: DefaultStream): assert isinstance( stream._stream_partition_generator._partition_factory._schema_loader, From 4bf56ecb90568baacfc8c6c97d638c88508b7fe0 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Thu, 26 Feb 2026 12:09:17 +0000 Subject: [PATCH 13/30] fix: only include parent stream in stream_groups to avoid deadlock Child streams that depend on parent streams should not be in the same group, as this would cause a deadlock (child needs to read parent). Co-Authored-By: unknown <> --- .../declarative/parsers/test_model_to_component_factory.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py index ddedf8221..964e3cd4a 100644 --- a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +++ b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py @@ -5382,7 +5382,6 @@ def test_block_simultaneous_read_from_stream_groups(): issues_endpoint: streams: - "#/definitions/parent_stream" - - "#/definitions/child_stream" action: BlockSimultaneousSyncsAction """ @@ -5407,7 +5406,7 @@ def test_block_simultaneous_read_from_stream_groups(): assert parent_stream.name == "parent" assert parent_stream.block_simultaneous_read == "issues_endpoint" - # Test child stream gets block_simultaneous_read from stream_groups + # Test child stream is NOT in the group (to avoid deadlock with parent) child_manifest = transformer.propagate_types_and_parameters( "", resolved_manifest["definitions"]["child_stream"], {} ) @@ -5417,7 +5416,7 @@ def test_block_simultaneous_read_from_stream_groups(): assert isinstance(child_stream, DefaultStream) assert child_stream.name == "child" - assert 
child_stream.block_simultaneous_read == "issues_endpoint" + assert child_stream.block_simultaneous_read == "" # Test stream not in any group defaults to empty string no_block_manifest = transformer.propagate_types_and_parameters( From 276c007a55bb7c6ee11d2cac98a61cdb80545471 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Thu, 26 Feb 2026 12:20:54 +0000 Subject: [PATCH 14/30] style: fix ruff format for long line Co-Authored-By: unknown <> --- .../declarative/parsers/test_model_to_component_factory.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py index 964e3cd4a..bf877ff1a 100644 --- a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +++ b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py @@ -5391,7 +5391,9 @@ def test_block_simultaneous_read_from_stream_groups(): resolved_manifest = resolver.preprocess_manifest(parsed_manifest) # Build stream_name_to_group from the manifest's stream_groups (as ConcurrentDeclarativeSource does) - stream_name_to_group = ConcurrentDeclarativeSource._build_stream_name_to_group(resolved_manifest) + stream_name_to_group = ConcurrentDeclarativeSource._build_stream_name_to_group( + resolved_manifest + ) factory_with_groups = ModelToComponentFactory(stream_name_to_group=stream_name_to_group) # Test parent stream gets block_simultaneous_read from stream_groups From 59fbd79d25d43a8b13ad66f8cd212dcd87c31666 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Fri, 27 Feb 2026 15:07:31 +0000 Subject: [PATCH 15/30] refactor: move _build_stream_name_to_group into ModelToComponentFactory - Factory now owns the stream_groups resolution via set_stream_groups(manifest) - ConcurrentDeclarativeSource just calls 
factory.set_stream_groups(manifest) - Removed _build_stream_name_to_group from ConcurrentDeclarativeSource - Updated tests to use factory's _build_stream_name_to_group directly Co-Authored-By: unknown <> --- .../concurrent_declarative_source.py | 36 +-------------- .../parsers/model_to_component_factory.py | 45 ++++++++++++++++--- .../test_model_to_component_factory.py | 11 ++--- .../test_concurrent_declarative_source.py | 5 ++- 4 files changed, 48 insertions(+), 49 deletions(-) diff --git a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py index c0ab9e910..2eeee093e 100644 --- a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py +++ b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py @@ -405,9 +405,7 @@ def streams(self, config: Mapping[str, Any]) -> List[AbstractStream]: # type: i if api_budget_model: self._constructor.set_api_budget(api_budget_model, self._config) - self._constructor.set_stream_name_to_group( - self._build_stream_name_to_group(self._source_config) - ) + self._constructor.set_stream_groups(self._source_config) prepared_configs = self._initialize_cache_for_parent_streams(deepcopy(stream_configs)) @@ -532,38 +530,6 @@ def dynamic_streams(self) -> List[Dict[str, Any]]: with_dynamic_stream_name=True, ) - @staticmethod - def _build_stream_name_to_group(manifest: Mapping[str, Any]) -> Dict[str, str]: - """Build a mapping from stream name to group name based on the stream_groups manifest config. - - After manifest reference resolution, each stream reference in stream_groups.streams - is resolved to the full stream definition dict containing a 'name' field. - - Returns: - A dict mapping stream name -> group name for streams that belong to a group. 
- """ - stream_name_to_group: Dict[str, str] = {} - stream_groups = manifest.get("stream_groups", {}) - if not stream_groups: - return stream_name_to_group - - for group_name, group_config in stream_groups.items(): - streams = group_config.get("streams", []) - for stream_ref in streams: - if isinstance(stream_ref, dict): - # After reference resolution, stream_ref is a full stream definition dict - stream_name = stream_ref.get("name", "") - if stream_name: - stream_name_to_group[stream_name] = group_name - elif isinstance(stream_ref, str): - # If not resolved (shouldn't happen normally), extract name from ref path - # e.g., "#/definitions/my_stream" -> "my_stream" - if stream_ref.startswith("#/definitions/"): - stream_name = stream_ref.split("/")[-1] - stream_name_to_group[stream_name] = group_name - - return stream_name_to_group - def _stream_configs(self, manifest: Mapping[str, Any]) -> List[Dict[str, Any]]: # This has a warning flag for static, but after we finish part 4 we'll replace manifest with self._source_config stream_configs = [] diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 0faf1b4db..a2ba0377f 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -689,7 +689,6 @@ def __init__( max_concurrent_async_job_count: Optional[int] = None, configured_catalog: Optional[ConfiguredAirbyteCatalog] = None, api_budget: Optional[APIBudget] = None, - stream_name_to_group: Optional[Dict[str, str]] = None, ): self._init_mappings() self._limit_pages_fetched_per_slice = limit_pages_fetched_per_slice @@ -706,13 +705,49 @@ def __init__( self._connector_state_manager = connector_state_manager or ConnectorStateManager() self._api_budget: Optional[Union[APIBudget]] = api_budget self._job_tracker: JobTracker = JobTracker(max_concurrent_async_job_count or 1) - 
self._stream_name_to_group: Dict[str, str] = stream_name_to_group or {} + self._stream_name_to_group: Dict[str, str] = {} # placeholder for deprecation warnings self._collected_deprecation_logs: List[ConnectorBuilderLogMessage] = [] - def set_stream_name_to_group(self, stream_name_to_group: Dict[str, str]) -> None: - """Set the mapping from stream name to group name for block_simultaneous_read.""" - self._stream_name_to_group = stream_name_to_group + def set_stream_groups(self, manifest: Mapping[str, Any]) -> None: + """Build and set the stream-name-to-group mapping from the manifest's stream_groups config. + + After manifest reference resolution, each stream reference in stream_groups.streams + is resolved to the full stream definition dict containing a 'name' field. + """ + self._stream_name_to_group = self._build_stream_name_to_group(manifest) + + @staticmethod + def _build_stream_name_to_group(manifest: Mapping[str, Any]) -> Dict[str, str]: + """Build a mapping from stream name to group name based on the stream_groups manifest config. + + After manifest reference resolution, each stream reference in stream_groups.streams + is resolved to the full stream definition dict containing a 'name' field. + + Returns: + A dict mapping stream name -> group name for streams that belong to a group. 
+ """ + stream_name_to_group: Dict[str, str] = {} + stream_groups = manifest.get("stream_groups", {}) + if not stream_groups: + return stream_name_to_group + + for group_name, group_config in stream_groups.items(): + streams = group_config.get("streams", []) + for stream_ref in streams: + if isinstance(stream_ref, dict): + # After reference resolution, stream_ref is a full stream definition dict + stream_name = stream_ref.get("name", "") + if stream_name: + stream_name_to_group[stream_name] = group_name + elif isinstance(stream_ref, str): + # If not resolved (shouldn't happen normally), extract name from ref path + # e.g., "#/definitions/my_stream" -> "my_stream" + if stream_ref.startswith("#/definitions/"): + stream_name = stream_ref.split("/")[-1] + stream_name_to_group[stream_name] = group_name + + return stream_name_to_group def _init_mappings(self) -> None: self.PYDANTIC_MODEL_TO_CONSTRUCTOR: Mapping[Type[BaseModel], Callable[..., Any]] = { diff --git a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py index bf877ff1a..d27d23613 100644 --- a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +++ b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py @@ -48,9 +48,6 @@ ) from airbyte_cdk.sources.declarative.checks import CheckStream from airbyte_cdk.sources.declarative.concurrency_level import ConcurrencyLevel -from airbyte_cdk.sources.declarative.concurrent_declarative_source import ( - ConcurrentDeclarativeSource, -) from airbyte_cdk.sources.declarative.datetime.min_max_datetime import MinMaxDatetime from airbyte_cdk.sources.declarative.decoders import JsonDecoder, PaginationDecoderDecorator from airbyte_cdk.sources.declarative.extractors import DpathExtractor, RecordFilter, RecordSelector @@ -5390,11 +5387,9 @@ def test_block_simultaneous_read_from_stream_groups(): parsed_manifest = 
YamlDeclarativeSource._parse(content) resolved_manifest = resolver.preprocess_manifest(parsed_manifest) - # Build stream_name_to_group from the manifest's stream_groups (as ConcurrentDeclarativeSource does) - stream_name_to_group = ConcurrentDeclarativeSource._build_stream_name_to_group( - resolved_manifest - ) - factory_with_groups = ModelToComponentFactory(stream_name_to_group=stream_name_to_group) + # Use the factory's set_stream_groups to resolve stream_groups from the manifest + factory_with_groups = ModelToComponentFactory() + factory_with_groups.set_stream_groups(resolved_manifest) # Test parent stream gets block_simultaneous_read from stream_groups parent_manifest = transformer.propagate_types_and_parameters( diff --git a/unit_tests/sources/declarative/test_concurrent_declarative_source.py b/unit_tests/sources/declarative/test_concurrent_declarative_source.py index 3f10b9eb2..7676f8e2d 100644 --- a/unit_tests/sources/declarative/test_concurrent_declarative_source.py +++ b/unit_tests/sources/declarative/test_concurrent_declarative_source.py @@ -56,6 +56,9 @@ from airbyte_cdk.sources.declarative.extractors.record_filter import ( ClientSideIncrementalRecordFilterDecorator, ) +from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import ( + ModelToComponentFactory, +) from airbyte_cdk.sources.declarative.partition_routers import AsyncJobPartitionRouter from airbyte_cdk.sources.declarative.retrievers.simple_retriever import SimpleRetriever from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import ( @@ -5219,5 +5222,5 @@ def test_given_record_selector_is_filtering_when_read_then_raise_error(): ) def test_build_stream_name_to_group(manifest, expected): """Test _build_stream_name_to_group correctly maps stream names to group names.""" - result = ConcurrentDeclarativeSource._build_stream_name_to_group(manifest) + result = ModelToComponentFactory._build_stream_name_to_group(manifest) assert result == expected From 
40d53e75e04bf1172b827404cc43afd513ca5e7e Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Tue, 3 Mar 2026 16:56:59 +0000 Subject: [PATCH 16/30] refactor: resolve stream_groups from actual stream instances instead of factory - Removed _build_stream_name_to_group, set_stream_groups, _stream_name_to_group from factory - Factory no longer knows about stream_groups at all - Added _apply_stream_groups to ConcurrentDeclarativeSource: creates streams first, then sets block_simultaneous_read on matching DefaultStream instances - Added block_simultaneous_read setter on DefaultStream - Replaced mock-based tests with parametrized tests using real DefaultStream instances Co-Authored-By: unknown <> --- .../concurrent_declarative_source.py | 30 ++++- .../parsers/model_to_component_factory.py | 42 ------- .../streams/concurrent/default_stream.py | 4 + .../test_model_to_component_factory.py | 109 ++---------------- .../test_concurrent_declarative_source.py | 69 ++++++----- 5 files changed, 81 insertions(+), 173 deletions(-) diff --git a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py index 2eeee093e..fae94c7b7 100644 --- a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py +++ b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py @@ -82,6 +82,7 @@ from airbyte_cdk.sources.message.concurrent_repository import ConcurrentMessageRepository from airbyte_cdk.sources.message.repository import InMemoryMessageRepository from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream +from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream from airbyte_cdk.sources.streams.concurrent.partitions.types import QueueItem from airbyte_cdk.sources.utils.slice_logger import ( AlwaysLogSliceLogger, @@ -405,8 +406,6 @@ def streams(self, config: Mapping[str, Any]) -> List[AbstractStream]: # 
type: i if api_budget_model: self._constructor.set_api_budget(api_budget_model, self._config) - self._constructor.set_stream_groups(self._source_config) - prepared_configs = self._initialize_cache_for_parent_streams(deepcopy(stream_configs)) source_streams = [ @@ -422,8 +421,35 @@ def streams(self, config: Mapping[str, Any]) -> List[AbstractStream]: # type: i ) for stream_config in prepared_configs ] + + self._apply_stream_groups(source_streams) + return source_streams + def _apply_stream_groups(self, streams: List[AbstractStream]) -> None: + """Set block_simultaneous_read on streams based on the manifest's stream_groups config. + + Iterates over the resolved manifest's stream_groups and matches group membership + against actual created stream instances by name. + """ + stream_groups = self._source_config.get("stream_groups", {}) + if not stream_groups: + return + + # Build stream_name -> group_name mapping from the resolved manifest + stream_name_to_group: Dict[str, str] = {} + for group_name, group_config in stream_groups.items(): + for stream_ref in group_config.get("streams", []): + if isinstance(stream_ref, dict): + stream_name = stream_ref.get("name", "") + if stream_name: + stream_name_to_group[stream_name] = group_name + + # Apply group to matching stream instances + for stream in streams: + if isinstance(stream, DefaultStream) and stream.name in stream_name_to_group: + stream.block_simultaneous_read = stream_name_to_group[stream.name] + @staticmethod def _initialize_cache_for_parent_streams( stream_configs: List[Dict[str, Any]], diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index a2ba0377f..560dd4056 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -705,50 +705,9 @@ def __init__( self._connector_state_manager = connector_state_manager or 
ConnectorStateManager() self._api_budget: Optional[Union[APIBudget]] = api_budget self._job_tracker: JobTracker = JobTracker(max_concurrent_async_job_count or 1) - self._stream_name_to_group: Dict[str, str] = {} # placeholder for deprecation warnings self._collected_deprecation_logs: List[ConnectorBuilderLogMessage] = [] - def set_stream_groups(self, manifest: Mapping[str, Any]) -> None: - """Build and set the stream-name-to-group mapping from the manifest's stream_groups config. - - After manifest reference resolution, each stream reference in stream_groups.streams - is resolved to the full stream definition dict containing a 'name' field. - """ - self._stream_name_to_group = self._build_stream_name_to_group(manifest) - - @staticmethod - def _build_stream_name_to_group(manifest: Mapping[str, Any]) -> Dict[str, str]: - """Build a mapping from stream name to group name based on the stream_groups manifest config. - - After manifest reference resolution, each stream reference in stream_groups.streams - is resolved to the full stream definition dict containing a 'name' field. - - Returns: - A dict mapping stream name -> group name for streams that belong to a group. 
- """ - stream_name_to_group: Dict[str, str] = {} - stream_groups = manifest.get("stream_groups", {}) - if not stream_groups: - return stream_name_to_group - - for group_name, group_config in stream_groups.items(): - streams = group_config.get("streams", []) - for stream_ref in streams: - if isinstance(stream_ref, dict): - # After reference resolution, stream_ref is a full stream definition dict - stream_name = stream_ref.get("name", "") - if stream_name: - stream_name_to_group[stream_name] = group_name - elif isinstance(stream_ref, str): - # If not resolved (shouldn't happen normally), extract name from ref path - # e.g., "#/definitions/my_stream" -> "my_stream" - if stream_ref.startswith("#/definitions/"): - stream_name = stream_ref.split("/")[-1] - stream_name_to_group[stream_name] = group_name - - return stream_name_to_group - def _init_mappings(self) -> None: self.PYDANTIC_MODEL_TO_CONSTRUCTOR: Mapping[Type[BaseModel], Callable[..., Any]] = { AddedFieldDefinitionModel: self.create_added_field_definition, @@ -2186,7 +2145,6 @@ def create_default_stream( logger=logging.getLogger(f"airbyte.{stream_name}"), cursor=concurrent_cursor, supports_file_transfer=hasattr(model, "file_uploader") and bool(model.file_uploader), - block_simultaneous_read=self._stream_name_to_group.get(stream_name, ""), ) def _migrate_state(self, model: DeclarativeStreamModel, config: Config) -> None: diff --git a/airbyte_cdk/sources/streams/concurrent/default_stream.py b/airbyte_cdk/sources/streams/concurrent/default_stream.py index 5ca11eaf8..d679fe0ee 100644 --- a/airbyte_cdk/sources/streams/concurrent/default_stream.py +++ b/airbyte_cdk/sources/streams/concurrent/default_stream.py @@ -101,6 +101,10 @@ def block_simultaneous_read(self) -> str: """Returns the blocking group name for this stream, or empty string if no blocking""" return self._block_simultaneous_read + @block_simultaneous_read.setter + def block_simultaneous_read(self, value: str) -> None: + self._block_simultaneous_read = 
value + def check_availability(self) -> StreamAvailability: """ Check stream availability by attempting to read the first record of the stream. diff --git a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py index d27d23613..add9a1c42 100644 --- a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +++ b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py @@ -5282,7 +5282,12 @@ def test_catalog_defined_cursor_field_stream_missing(): def test_block_simultaneous_read_from_stream_groups(): - """Test that stream_groups in the manifest flow through to DefaultStream.block_simultaneous_read.""" + """Test that factory-created streams default to empty block_simultaneous_read. + + The factory no longer handles stream_groups — that's done by + ConcurrentDeclarativeSource._apply_stream_groups after stream creation. + This test verifies the factory creates streams without group info. 
+ """ content = """ definitions: parent_stream: @@ -5311,75 +5316,6 @@ def test_block_simultaneous_read_from_stream_groups(): properties: id: type: string - - child_stream: - type: DeclarativeStream - name: "child" - primary_key: "id" - retriever: - type: SimpleRetriever - requester: - type: HttpRequester - url_base: "https://api.example.com" - path: "/child" - http_method: "GET" - authenticator: - type: BearerAuthenticator - api_token: "{{ config['api_key'] }}" - record_selector: - type: RecordSelector - extractor: - type: DpathExtractor - field_path: [] - partition_router: - type: SubstreamPartitionRouter - parent_stream_configs: - - type: ParentStreamConfig - stream: "#/definitions/parent_stream" - parent_key: "id" - partition_field: "parent_id" - schema_loader: - type: InlineSchemaLoader - schema: - type: object - properties: - id: - type: string - parent_id: - type: string - - no_block_stream: - type: DeclarativeStream - name: "no_block" - primary_key: "id" - retriever: - type: SimpleRetriever - requester: - type: HttpRequester - url_base: "https://api.example.com" - path: "/no_block" - http_method: "GET" - authenticator: - type: BearerAuthenticator - api_token: "{{ config['api_key'] }}" - record_selector: - type: RecordSelector - extractor: - type: DpathExtractor - field_path: [] - schema_loader: - type: InlineSchemaLoader - schema: - type: object - properties: - id: - type: string - - stream_groups: - issues_endpoint: - streams: - - "#/definitions/parent_stream" - action: BlockSimultaneousSyncsAction """ config = {"api_key": "test_key"} @@ -5387,45 +5323,18 @@ def test_block_simultaneous_read_from_stream_groups(): parsed_manifest = YamlDeclarativeSource._parse(content) resolved_manifest = resolver.preprocess_manifest(parsed_manifest) - # Use the factory's set_stream_groups to resolve stream_groups from the manifest - factory_with_groups = ModelToComponentFactory() - factory_with_groups.set_stream_groups(resolved_manifest) + factory = 
ModelToComponentFactory() - # Test parent stream gets block_simultaneous_read from stream_groups parent_manifest = transformer.propagate_types_and_parameters( "", resolved_manifest["definitions"]["parent_stream"], {} ) - parent_stream: DefaultStream = factory_with_groups.create_component( + parent_stream: DefaultStream = factory.create_component( model_type=DeclarativeStreamModel, component_definition=parent_manifest, config=config ) assert isinstance(parent_stream, DefaultStream) assert parent_stream.name == "parent" - assert parent_stream.block_simultaneous_read == "issues_endpoint" - - # Test child stream is NOT in the group (to avoid deadlock with parent) - child_manifest = transformer.propagate_types_and_parameters( - "", resolved_manifest["definitions"]["child_stream"], {} - ) - child_stream: DefaultStream = factory_with_groups.create_component( - model_type=DeclarativeStreamModel, component_definition=child_manifest, config=config - ) - - assert isinstance(child_stream, DefaultStream) - assert child_stream.name == "child" - assert child_stream.block_simultaneous_read == "" - - # Test stream not in any group defaults to empty string - no_block_manifest = transformer.propagate_types_and_parameters( - "", resolved_manifest["definitions"]["no_block_stream"], {} - ) - no_block_stream: DefaultStream = factory_with_groups.create_component( - model_type=DeclarativeStreamModel, component_definition=no_block_manifest, config=config - ) - - assert isinstance(no_block_stream, DefaultStream) - assert no_block_stream.name == "no_block" - assert no_block_stream.block_simultaneous_read == "" + assert parent_stream.block_simultaneous_read == "" def get_schema_loader(stream: DefaultStream): diff --git a/unit_tests/sources/declarative/test_concurrent_declarative_source.py b/unit_tests/sources/declarative/test_concurrent_declarative_source.py index 7676f8e2d..0e07b20f0 100644 --- a/unit_tests/sources/declarative/test_concurrent_declarative_source.py +++ 
b/unit_tests/sources/declarative/test_concurrent_declarative_source.py @@ -56,14 +56,12 @@ from airbyte_cdk.sources.declarative.extractors.record_filter import ( ClientSideIncrementalRecordFilterDecorator, ) -from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import ( - ModelToComponentFactory, -) from airbyte_cdk.sources.declarative.partition_routers import AsyncJobPartitionRouter from airbyte_cdk.sources.declarative.retrievers.simple_retriever import SimpleRetriever from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import ( StreamSlicerPartitionGenerator, ) +from airbyte_cdk.sources.message.repository import InMemoryMessageRepository from airbyte_cdk.sources.streams import Stream from airbyte_cdk.sources.streams.checkpoint import Cursor from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor @@ -5155,17 +5153,37 @@ def test_given_record_selector_is_filtering_when_read_then_raise_error(): list(source.read(logger=source.logger, config=input_config, catalog=catalog, state=[])) +def _make_default_stream(name: str) -> DefaultStream: + """Create a minimal DefaultStream instance for testing.""" + from airbyte_cdk.sources.streams.concurrent.cursor import FinalStateCursor + + cursor = FinalStateCursor( + stream_name=name, stream_namespace=None, message_repository=InMemoryMessageRepository() + ) + return DefaultStream( + partition_generator=Mock(), + name=name, + json_schema={}, + primary_key=[], + cursor_field=None, + logger=logging.getLogger(f"test.{name}"), + cursor=cursor, + ) + + @pytest.mark.parametrize( - "manifest,expected", + "source_config,stream_names,expected_groups", [ pytest.param( {}, - {}, + ["my_stream"], + {"my_stream": ""}, id="no_stream_groups", ), pytest.param( {"stream_groups": {}}, - {}, + ["my_stream"], + {"my_stream": ""}, id="empty_stream_groups", ), pytest.param( @@ -5180,16 +5198,15 @@ def test_given_record_selector_is_filtering_when_read_then_raise_error(): } } }, - 
{"deals": "crm_objects", "companies": "crm_objects"}, - id="resolved_stream_refs", + ["deals", "companies", "no_group"], + {"deals": "crm_objects", "companies": "crm_objects", "no_group": ""}, + id="single_group_with_unmatched_stream", ), pytest.param( { "stream_groups": { "group_a": { - "streams": [ - {"name": "stream1", "type": "DeclarativeStream"}, - ], + "streams": [{"name": "stream1", "type": "DeclarativeStream"}], "action": {"type": "BlockSimultaneousSyncsAction"}, }, "group_b": { @@ -5201,26 +5218,20 @@ def test_given_record_selector_is_filtering_when_read_then_raise_error(): }, } }, + ["stream1", "stream2", "stream3"], {"stream1": "group_a", "stream2": "group_b", "stream3": "group_b"}, id="multiple_groups", ), - pytest.param( - { - "stream_groups": { - "fallback_group": { - "streams": [ - "#/definitions/my_stream", - ], - "action": {"type": "BlockSimultaneousSyncsAction"}, - } - } - }, - {"my_stream": "fallback_group"}, - id="unresolved_string_refs_fallback", - ), ], ) -def test_build_stream_name_to_group(manifest, expected): - """Test _build_stream_name_to_group correctly maps stream names to group names.""" - result = ModelToComponentFactory._build_stream_name_to_group(manifest) - assert result == expected +def test_apply_stream_groups(source_config, stream_names, expected_groups): + """Test _apply_stream_groups sets block_simultaneous_read on matching stream instances.""" + streams = [_make_default_stream(name) for name in stream_names] + + source = Mock() + source._source_config = source_config + + ConcurrentDeclarativeSource._apply_stream_groups(source, streams) + + for stream in streams: + assert stream.block_simultaneous_read == expected_groups[stream.name] From 6726050dd816fb6784b34b967c352e85512037e2 Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Wed, 4 Mar 2026 18:40:21 +0200 Subject: [PATCH 17/30] Fix stream format in schema --- .../sources/concurrent_source/concurrent_read_processor.py | 4 ++-- 
.../sources/declarative/declarative_component_schema.yaml | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py index 95bae264c..7a6f77ecb 100644 --- a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +++ b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py @@ -260,7 +260,8 @@ def start_next_partition_generator(self) -> Optional[AirbyteMessage]: stream_name = stream.name stream_group = self._stream_block_simultaneous_read.get(stream_name, "") - # Check if this stream has a blocking group and is already active + # Check if this stream has a blocking group and is already active as parent stream + # (i.e. being read from during partition generation for another stream) if stream_group and stream_name in self._active_stream_names: # Add back to the END of the queue for retry later self._stream_instances_to_start_partition_generation.append(stream) @@ -318,7 +319,6 @@ def start_next_partition_generator(self) -> Optional[AirbyteMessage]: self._logger.debug(f"Added '{stream_name}' to active group '{stream_group}'") # Also mark all parent streams as active (they will be read from during partition generation) - parent_streams = self._collect_all_parent_stream_names(stream_name) for parent_stream_name in parent_streams: parent_group = self._stream_block_simultaneous_read.get(parent_stream_name, "") if parent_group: diff --git a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml index 9e28a74af..0de656af4 100644 --- a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml +++ b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml @@ -85,11 +85,11 @@ definitions: streams: title: Streams description: > - List of references to streams that belong to this group. 
Use JSON references - to stream definitions (e.g., "#/definitions/my_stream"). + List of references to streams that belong to this group. type: array items: - type: string + anyOf: + - "$ref": "#/definitions/DeclarativeStream" action: title: Action description: The action to apply to streams in this group. From 76a29f24957a3c84d404bfc70996396a20693f9a Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 4 Mar 2026 16:44:09 +0000 Subject: [PATCH 18/30] refactor: add get_partition_router() helper to DefaultStream Replace hasattr chain in ConcurrentReadProcessor._collect_all_parent_stream_names with DefaultStream.get_partition_router() that safely traverses the internal partition_generator -> stream_slicer -> partition_router chain using isinstance checks. Co-Authored-By: unknown <> --- .../concurrent_read_processor.py | 38 +++++++------------ .../streams/concurrent/default_stream.py | 21 +++++++++- 2 files changed, 33 insertions(+), 26 deletions(-) diff --git a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py index 7a6f77ecb..3c7f0eba2 100644 --- a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +++ b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py @@ -373,42 +373,30 @@ def _is_stream_done(self, stream_name: str) -> bool: return stream_name in self._streams_done def _collect_all_parent_stream_names(self, stream_name: str) -> Set[str]: - """ - Recursively collect all parent stream names for a given stream. - For example, if we have: epics -> issues -> comments - Then for comments, this returns {issues, epics} + """Recursively collect all parent stream names for a given stream. 
- :param stream_name: The stream to collect parents for - :return: Set of all parent stream names (recursively) + For example, if we have: epics -> issues -> comments + Then for comments, this returns {issues, epics}. """ + from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import ( + SubstreamPartitionRouter, + ) + from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream + parent_names: Set[str] = set() stream = self._stream_name_to_instance.get(stream_name) if not stream: return parent_names - # Get partition router if it exists (this is where parent streams are defined) - partition_router = None - - # Try DefaultStream path first (_stream_partition_generator._stream_slicer._partition_router) - if ( - hasattr(stream, "_stream_partition_generator") - and hasattr(stream._stream_partition_generator, "_stream_slicer") - and hasattr(stream._stream_partition_generator._stream_slicer, "_partition_router") - ): - partition_router = stream._stream_partition_generator._stream_slicer._partition_router - # Fallback to legacy path (retriever.partition_router) for backward compatibility and test mocks - elif hasattr(stream, "retriever") and hasattr(stream.retriever, "partition_router"): - partition_router = stream.retriever.partition_router + partition_router = ( + stream.get_partition_router() if isinstance(stream, DefaultStream) else None + ) - # SubstreamPartitionRouter has parent_stream_configs - if partition_router and hasattr(partition_router, "parent_stream_configs"): + if isinstance(partition_router, SubstreamPartitionRouter): for parent_config in partition_router.parent_stream_configs: - parent_stream = parent_config.stream - parent_name = parent_stream.name + parent_name = parent_config.stream.name parent_names.add(parent_name) - - # Recursively collect grandparents, great-grandparents, etc. 
parent_names.update(self._collect_all_parent_stream_names(parent_name)) return parent_names diff --git a/airbyte_cdk/sources/streams/concurrent/default_stream.py b/airbyte_cdk/sources/streams/concurrent/default_stream.py index d679fe0ee..4c7267315 100644 --- a/airbyte_cdk/sources/streams/concurrent/default_stream.py +++ b/airbyte_cdk/sources/streams/concurrent/default_stream.py @@ -3,7 +3,10 @@ # from logging import Logger -from typing import Any, Callable, Iterable, List, Mapping, Optional, Union +from typing import TYPE_CHECKING, Any, Callable, Iterable, List, Mapping, Optional, Union + +if TYPE_CHECKING: + from airbyte_cdk.sources.declarative.partition_routers.partition_router import PartitionRouter from airbyte_cdk.models import AirbyteStream, SyncMode from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream @@ -105,6 +108,22 @@ def block_simultaneous_read(self) -> str: def block_simultaneous_read(self, value: str) -> None: self._block_simultaneous_read = value + def get_partition_router(self) -> "PartitionRouter | None": + """Return the partition router for this stream, or None if not available.""" + from airbyte_cdk.sources.declarative.incremental.concurrent_partition_cursor import ( + ConcurrentPerPartitionCursor, + ) + from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import ( + StreamSlicerPartitionGenerator, + ) + + if not isinstance(self._stream_partition_generator, StreamSlicerPartitionGenerator): + return None + stream_slicer = self._stream_partition_generator._stream_slicer + if not isinstance(stream_slicer, ConcurrentPerPartitionCursor): + return None + return stream_slicer._partition_router + def check_availability(self) -> StreamAvailability: """ Check stream availability by attempting to read the first record of the stream. 
From b6be8c89b3ca0795304388367edf98070b7234d6 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 4 Mar 2026 16:52:33 +0000 Subject: [PATCH 19/30] feat: validate no parent-child streams share a group to prevent deadlock _apply_stream_groups now checks that no stream shares a group with any of its parent streams (via get_partition_router). Raises ValueError at config time if a deadlock-causing configuration is detected. Co-Authored-By: unknown <> --- .../concurrent_declarative_source.py | 24 +++- .../test_concurrent_declarative_source.py | 116 ++++++++++++++++++ 2 files changed, 139 insertions(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py index fae94c7b7..15fa40fe4 100644 --- a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py +++ b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py @@ -430,8 +430,13 @@ def _apply_stream_groups(self, streams: List[AbstractStream]) -> None: """Set block_simultaneous_read on streams based on the manifest's stream_groups config. Iterates over the resolved manifest's stream_groups and matches group membership - against actual created stream instances by name. + against actual created stream instances by name. Validates that no stream shares a + group with any of its parent streams, which would cause a deadlock. 
""" + from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import ( + SubstreamPartitionRouter, + ) + stream_groups = self._source_config.get("stream_groups", {}) if not stream_groups: return @@ -445,6 +450,23 @@ def _apply_stream_groups(self, streams: List[AbstractStream]) -> None: if stream_name: stream_name_to_group[stream_name] = group_name + # Validate no stream shares a group with its parent streams + for stream in streams: + if not isinstance(stream, DefaultStream) or stream.name not in stream_name_to_group: + continue + partition_router = stream.get_partition_router() + if not isinstance(partition_router, SubstreamPartitionRouter): + continue + group_name = stream_name_to_group[stream.name] + for parent_config in partition_router.parent_stream_configs: + parent_name = parent_config.stream.name + if stream_name_to_group.get(parent_name) == group_name: + raise ValueError( + f"Stream '{stream.name}' and its parent stream '{parent_name}' " + f"are both in group '{group_name}'. " + f"A child stream must not share a group with its parent to avoid deadlock." 
+ ) + # Apply group to matching stream instances for stream in streams: if isinstance(stream, DefaultStream) and stream.name in stream_name_to_group: diff --git a/unit_tests/sources/declarative/test_concurrent_declarative_source.py b/unit_tests/sources/declarative/test_concurrent_declarative_source.py index 0e07b20f0..f1b4143a3 100644 --- a/unit_tests/sources/declarative/test_concurrent_declarative_source.py +++ b/unit_tests/sources/declarative/test_concurrent_declarative_source.py @@ -5171,6 +5171,75 @@ def _make_default_stream(name: str) -> DefaultStream: ) +def _make_child_stream_with_parent(child_name: str, parent_stream: DefaultStream) -> DefaultStream: + """Create a DefaultStream that has a SubstreamPartitionRouter pointing to parent_stream.""" + from airbyte_cdk.sources.declarative.incremental.concurrent_partition_cursor import ( + ConcurrentCursorFactory, + ConcurrentPerPartitionCursor, + ) + from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import ( + ParentStreamConfig, + SubstreamPartitionRouter, + ) + from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import ( + DeclarativePartitionFactory, + StreamSlicerPartitionGenerator, + ) + from airbyte_cdk.sources.streams.concurrent.cursor import FinalStateCursor + from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import ( + EpochValueConcurrentStreamStateConverter, + ) + + partition_router = SubstreamPartitionRouter( + parent_stream_configs=[ + ParentStreamConfig( + stream=parent_stream, + parent_key="id", + partition_field="parent_id", + config={}, + parameters={}, + ) + ], + config={}, + parameters={}, + ) + + cursor_factory = ConcurrentCursorFactory(lambda *args, **kwargs: Mock()) + message_repository = InMemoryMessageRepository() + state_converter = EpochValueConcurrentStreamStateConverter() + + per_partition_cursor = ConcurrentPerPartitionCursor( + cursor_factory=cursor_factory, + 
partition_router=partition_router, + stream_name=child_name, + stream_namespace=None, + stream_state={}, + message_repository=message_repository, + connector_state_manager=Mock(), + connector_state_converter=state_converter, + cursor_field=Mock(cursor_field_key="updated_at"), + ) + + partition_factory = Mock(spec=DeclarativePartitionFactory) + partition_generator = StreamSlicerPartitionGenerator( + partition_factory=partition_factory, + stream_slicer=per_partition_cursor, + ) + + cursor = FinalStateCursor( + stream_name=child_name, stream_namespace=None, message_repository=message_repository + ) + return DefaultStream( + partition_generator=partition_generator, + name=child_name, + json_schema={}, + primary_key=[], + cursor_field=None, + logger=logging.getLogger(f"test.{child_name}"), + cursor=cursor, + ) + + @pytest.mark.parametrize( "source_config,stream_names,expected_groups", [ @@ -5235,3 +5304,50 @@ def test_apply_stream_groups(source_config, stream_names, expected_groups): for stream in streams: assert stream.block_simultaneous_read == expected_groups[stream.name] + + +def test_apply_stream_groups_raises_on_parent_child_in_same_group(): + """Test _apply_stream_groups raises ValueError when a child and its parent are in the same group.""" + parent = _make_default_stream("parent_stream") + child = _make_child_stream_with_parent("child_stream", parent) + + source = Mock() + source._source_config = { + "stream_groups": { + "my_group": { + "streams": [ + {"name": "parent_stream", "type": "DeclarativeStream"}, + {"name": "child_stream", "type": "DeclarativeStream"}, + ], + "action": {"type": "BlockSimultaneousSyncsAction"}, + } + } + } + + with pytest.raises(ValueError, match="child stream must not share a group with its parent"): + ConcurrentDeclarativeSource._apply_stream_groups(source, [parent, child]) + + +def test_apply_stream_groups_allows_parent_child_in_different_groups(): + """Test _apply_stream_groups allows a child and its parent in different groups.""" 
+ parent = _make_default_stream("parent_stream") + child = _make_child_stream_with_parent("child_stream", parent) + + source = Mock() + source._source_config = { + "stream_groups": { + "group_a": { + "streams": [{"name": "parent_stream", "type": "DeclarativeStream"}], + "action": {"type": "BlockSimultaneousSyncsAction"}, + }, + "group_b": { + "streams": [{"name": "child_stream", "type": "DeclarativeStream"}], + "action": {"type": "BlockSimultaneousSyncsAction"}, + }, + } + } + + ConcurrentDeclarativeSource._apply_stream_groups(source, [parent, child]) + + assert parent.block_simultaneous_read == "group_a" + assert child.block_simultaneous_read == "group_b" From 3f5a3549f0d523fa47122b6bfdf483d73ed4a890 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 4 Mar 2026 17:09:13 +0000 Subject: [PATCH 20/30] feat: assert partition generation queue is empty when all streams are done Adds a safety check in is_done() that raises AirbyteTracedException (system_error) if streams remain in the partition generation queue after all streams are marked done. Also moves inline imports to module level and updates test mocks to use DefaultStream with get_partition_router(). 
Co-Authored-By: unknown <> --- .../concurrent_read_processor.py | 18 ++++-- .../test_concurrent_read_processor.py | 60 ++++++++++++++++--- 2 files changed, 65 insertions(+), 13 deletions(-) diff --git a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py index 3c7f0eba2..a205ed388 100644 --- a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +++ b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py @@ -13,8 +13,12 @@ ) from airbyte_cdk.sources.concurrent_source.stream_thread_exception import StreamThreadException from airbyte_cdk.sources.concurrent_source.thread_pool_manager import ThreadPoolManager +from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import ( + SubstreamPartitionRouter, +) from airbyte_cdk.sources.message import MessageRepository from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream +from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream from airbyte_cdk.sources.streams.concurrent.partition_enqueuer import PartitionEnqueuer from airbyte_cdk.sources.streams.concurrent.partition_reader import PartitionReader from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition @@ -356,6 +360,15 @@ def is_done(self) -> bool: for stream_name in self._stream_name_to_instance.keys() ] ) + if is_done and self._stream_instances_to_start_partition_generation: + stuck_stream_names = [ + s.name for s in self._stream_instances_to_start_partition_generation + ] + raise AirbyteTracedException( + message="Partition generation queue is not empty after all streams completed.", + internal_message=f"Streams {stuck_stream_names} remained in the partition generation queue after all streams were marked done.", + failure_type=FailureType.system_error, + ) if is_done and self._exceptions_per_stream_name: error_message = 
generate_failed_streams_error_message(self._exceptions_per_stream_name) self._logger.info(error_message) @@ -378,11 +391,6 @@ def _collect_all_parent_stream_names(self, stream_name: str) -> Set[str]: For example, if we have: epics -> issues -> comments Then for comments, this returns {issues, epics}. """ - from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import ( - SubstreamPartitionRouter, - ) - from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream - parent_names: Set[str] = set() stream = self._stream_name_to_instance.get(stream_name) diff --git a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py index a8db1e65f..cd4c32d41 100644 --- a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +++ b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py @@ -29,8 +29,12 @@ ) from airbyte_cdk.sources.concurrent_source.stream_thread_exception import StreamThreadException from airbyte_cdk.sources.concurrent_source.thread_pool_manager import ThreadPoolManager +from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import ( + SubstreamPartitionRouter, +) from airbyte_cdk.sources.message import LogMessage, MessageRepository from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream +from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream from airbyte_cdk.sources.streams.concurrent.partition_enqueuer import PartitionEnqueuer from airbyte_cdk.sources.streams.concurrent.partition_reader import PartitionReader from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition @@ -873,18 +877,22 @@ def _create_mock_stream(self, name: str, block_simultaneous_read: str = ""): def _create_mock_stream_with_parent( self, name: str, parent_stream, block_simultaneous_read: str = "" ): - """Helper to create a mock 
stream with a parent stream""" - stream = self._create_mock_stream(name, block_simultaneous_read) + """Helper to create a mock stream with a parent stream.""" + stream = Mock(spec=DefaultStream) + stream.name = name + stream.block_simultaneous_read = block_simultaneous_read + stream.as_airbyte_stream.return_value = AirbyteStream( + name=name, + json_schema={}, + supported_sync_modes=[SyncMode.full_refresh], + ) + stream.cursor.ensure_at_least_one_state_emitted = Mock() - # Mock the retriever and partition router for parent relationship - mock_retriever = Mock() - mock_partition_router = Mock() + mock_partition_router = Mock(spec=SubstreamPartitionRouter) mock_parent_config = Mock() mock_parent_config.stream = parent_stream - mock_partition_router.parent_stream_configs = [mock_parent_config] - mock_retriever.partition_router = mock_partition_router - stream.retriever = mock_retriever + stream.get_partition_router.return_value = mock_partition_router return stream @@ -1447,3 +1455,39 @@ def test_child_starts_after_parent_completes_via_partition_complete_sentinel(sel ] assert len(started_messages) == 1 assert started_messages[0].trace.stream_status.stream_descriptor.name == "child" + + +def test_is_done_raises_when_partition_generation_queue_not_empty(): + """Test is_done raises AirbyteTracedException if streams remain in the partition generation queue.""" + partition_enqueuer = Mock(spec=PartitionEnqueuer) + thread_pool_manager = Mock(spec=ThreadPoolManager) + logger = Mock(spec=logging.Logger) + slice_logger = Mock(spec=SliceLogger) + message_repository = Mock(spec=MessageRepository) + message_repository.consume_queue.return_value = [] + partition_reader = Mock(spec=PartitionReader) + + stream = Mock(spec=AbstractStream) + stream.name = "stuck_stream" + stream.block_simultaneous_read = "" + stream.as_airbyte_stream.return_value = AirbyteStream( + name="stuck_stream", + json_schema={}, + supported_sync_modes=[SyncMode.full_refresh], + ) + + handler = 
ConcurrentReadProcessor( + [stream], + partition_enqueuer, + thread_pool_manager, + logger, + slice_logger, + message_repository, + partition_reader, + ) + + # Artificially mark the stream as done without removing it from the partition generation queue + handler._streams_done.add("stuck_stream") + + with pytest.raises(AirbyteTracedException, match="remained in the partition generation queue"): + handler.is_done() From 4b8b1418655df6e4ec514dc9fdd53e65092d9776 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 4 Mar 2026 17:14:55 +0000 Subject: [PATCH 21/30] refactor: move inline imports to module level in default_stream.py and concurrent_declarative_source.py Co-Authored-By: unknown <> --- .../concurrent_declarative_source.py | 7 +++---- .../streams/concurrent/default_stream.py | 21 ++++++++----------- 2 files changed, 12 insertions(+), 16 deletions(-) diff --git a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py index 15fa40fe4..5ae1d232d 100644 --- a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py +++ b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py @@ -76,6 +76,9 @@ from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import ( ModelToComponentFactory, ) +from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import ( + SubstreamPartitionRouter, +) from airbyte_cdk.sources.declarative.resolvers import COMPONENTS_RESOLVER_TYPE_MAPPING from airbyte_cdk.sources.declarative.spec.spec import Spec from airbyte_cdk.sources.declarative.types import Config, ConnectionDefinition @@ -433,10 +436,6 @@ def _apply_stream_groups(self, streams: List[AbstractStream]) -> None: against actual created stream instances by name. Validates that no stream shares a group with any of its parent streams, which would cause a deadlock. 
""" - from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import ( - SubstreamPartitionRouter, - ) - stream_groups = self._source_config.get("stream_groups", {}) if not stream_groups: return diff --git a/airbyte_cdk/sources/streams/concurrent/default_stream.py b/airbyte_cdk/sources/streams/concurrent/default_stream.py index 4c7267315..6cc6e44d4 100644 --- a/airbyte_cdk/sources/streams/concurrent/default_stream.py +++ b/airbyte_cdk/sources/streams/concurrent/default_stream.py @@ -3,12 +3,16 @@ # from logging import Logger -from typing import TYPE_CHECKING, Any, Callable, Iterable, List, Mapping, Optional, Union - -if TYPE_CHECKING: - from airbyte_cdk.sources.declarative.partition_routers.partition_router import PartitionRouter +from typing import Any, Callable, Iterable, List, Mapping, Optional, Union from airbyte_cdk.models import AirbyteStream, SyncMode +from airbyte_cdk.sources.declarative.incremental.concurrent_partition_cursor import ( + ConcurrentPerPartitionCursor, +) +from airbyte_cdk.sources.declarative.partition_routers.partition_router import PartitionRouter +from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import ( + StreamSlicerPartitionGenerator, +) from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream from airbyte_cdk.sources.streams.concurrent.availability_strategy import StreamAvailability from airbyte_cdk.sources.streams.concurrent.cursor import Cursor, CursorField @@ -108,15 +112,8 @@ def block_simultaneous_read(self) -> str: def block_simultaneous_read(self, value: str) -> None: self._block_simultaneous_read = value - def get_partition_router(self) -> "PartitionRouter | None": + def get_partition_router(self) -> PartitionRouter | None: """Return the partition router for this stream, or None if not available.""" - from airbyte_cdk.sources.declarative.incremental.concurrent_partition_cursor import ( - ConcurrentPerPartitionCursor, - ) - from 
airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import ( - StreamSlicerPartitionGenerator, - ) - if not isinstance(self._stream_partition_generator, StreamSlicerPartitionGenerator): return None stream_slicer = self._stream_partition_generator._stream_slicer From 7306516138b25f18b411be3e34b200d68c148dd5 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 4 Mar 2026 17:26:23 +0000 Subject: [PATCH 22/30] fix: unwrap GroupingPartitionRouter in get_partition_router() to detect parent streams Co-Authored-By: unknown <> --- .../sources/streams/concurrent/default_stream.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/airbyte_cdk/sources/streams/concurrent/default_stream.py b/airbyte_cdk/sources/streams/concurrent/default_stream.py index 6cc6e44d4..a896cfbad 100644 --- a/airbyte_cdk/sources/streams/concurrent/default_stream.py +++ b/airbyte_cdk/sources/streams/concurrent/default_stream.py @@ -9,6 +9,9 @@ from airbyte_cdk.sources.declarative.incremental.concurrent_partition_cursor import ( ConcurrentPerPartitionCursor, ) +from airbyte_cdk.sources.declarative.partition_routers.grouping_partition_router import ( + GroupingPartitionRouter, +) from airbyte_cdk.sources.declarative.partition_routers.partition_router import PartitionRouter from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import ( StreamSlicerPartitionGenerator, @@ -113,13 +116,20 @@ def block_simultaneous_read(self, value: str) -> None: self._block_simultaneous_read = value def get_partition_router(self) -> PartitionRouter | None: - """Return the partition router for this stream, or None if not available.""" + """Return the partition router for this stream, or None if not available. + + If the router is a GroupingPartitionRouter, unwraps it to return the + underlying router so callers can inspect parent stream relationships. 
+ """ if not isinstance(self._stream_partition_generator, StreamSlicerPartitionGenerator): return None stream_slicer = self._stream_partition_generator._stream_slicer if not isinstance(stream_slicer, ConcurrentPerPartitionCursor): return None - return stream_slicer._partition_router + router = stream_slicer._partition_router + if isinstance(router, GroupingPartitionRouter): + return router.underlying_partition_router + return router def check_availability(self) -> StreamAvailability: """ From c4c92708080eab5839c0bf1ddf5edd88ec842c85 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 4 Mar 2026 17:29:42 +0000 Subject: [PATCH 23/30] fix: handle GroupingPartitionRouter at call sites instead of in get_partition_router() Co-Authored-By: unknown <> --- .../concurrent_source/concurrent_read_processor.py | 5 +++++ .../declarative/concurrent_declarative_source.py | 5 +++++ .../sources/streams/concurrent/default_stream.py | 14 ++------------ 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py index a205ed388..398cca2e4 100644 --- a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +++ b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py @@ -13,6 +13,9 @@ ) from airbyte_cdk.sources.concurrent_source.stream_thread_exception import StreamThreadException from airbyte_cdk.sources.concurrent_source.thread_pool_manager import ThreadPoolManager +from airbyte_cdk.sources.declarative.partition_routers.grouping_partition_router import ( + GroupingPartitionRouter, +) from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import ( SubstreamPartitionRouter, ) @@ -400,6 +403,8 @@ def _collect_all_parent_stream_names(self, stream_name: str) -> Set[str]: partition_router = ( stream.get_partition_router() if isinstance(stream, 
DefaultStream) else None ) + if isinstance(partition_router, GroupingPartitionRouter): + partition_router = partition_router.underlying_partition_router if isinstance(partition_router, SubstreamPartitionRouter): for parent_config in partition_router.parent_stream_configs: diff --git a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py index 5ae1d232d..6d3b570ab 100644 --- a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py +++ b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py @@ -76,6 +76,9 @@ from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import ( ModelToComponentFactory, ) +from airbyte_cdk.sources.declarative.partition_routers.grouping_partition_router import ( + GroupingPartitionRouter, +) from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import ( SubstreamPartitionRouter, ) @@ -454,6 +457,8 @@ def _apply_stream_groups(self, streams: List[AbstractStream]) -> None: if not isinstance(stream, DefaultStream) or stream.name not in stream_name_to_group: continue partition_router = stream.get_partition_router() + if isinstance(partition_router, GroupingPartitionRouter): + partition_router = partition_router.underlying_partition_router if not isinstance(partition_router, SubstreamPartitionRouter): continue group_name = stream_name_to_group[stream.name] diff --git a/airbyte_cdk/sources/streams/concurrent/default_stream.py b/airbyte_cdk/sources/streams/concurrent/default_stream.py index a896cfbad..6cc6e44d4 100644 --- a/airbyte_cdk/sources/streams/concurrent/default_stream.py +++ b/airbyte_cdk/sources/streams/concurrent/default_stream.py @@ -9,9 +9,6 @@ from airbyte_cdk.sources.declarative.incremental.concurrent_partition_cursor import ( ConcurrentPerPartitionCursor, ) -from airbyte_cdk.sources.declarative.partition_routers.grouping_partition_router import ( - GroupingPartitionRouter, -) from 
airbyte_cdk.sources.declarative.partition_routers.partition_router import PartitionRouter from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import ( StreamSlicerPartitionGenerator, @@ -116,20 +113,13 @@ def block_simultaneous_read(self, value: str) -> None: self._block_simultaneous_read = value def get_partition_router(self) -> PartitionRouter | None: - """Return the partition router for this stream, or None if not available. - - If the router is a GroupingPartitionRouter, unwraps it to return the - underlying router so callers can inspect parent stream relationships. - """ + """Return the partition router for this stream, or None if not available.""" if not isinstance(self._stream_partition_generator, StreamSlicerPartitionGenerator): return None stream_slicer = self._stream_partition_generator._stream_slicer if not isinstance(stream_slicer, ConcurrentPerPartitionCursor): return None - router = stream_slicer._partition_router - if isinstance(router, GroupingPartitionRouter): - return router.underlying_partition_router - return router + return stream_slicer._partition_router def check_availability(self) -> StreamAvailability: """ From f1e020ba8b690d8ee67e016e6fa17c1d34e97481 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 4 Mar 2026 19:34:21 +0000 Subject: [PATCH 24/30] feat: check active_groups is empty in is_done() safety check Co-Authored-By: unknown <> --- .../sources/concurrent_source/concurrent_read_processor.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py index 398cca2e4..a78905e72 100644 --- a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +++ b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py @@ -372,6 +372,12 @@ def is_done(self) -> bool: internal_message=f"Streams 
{stuck_stream_names} remained in the partition generation queue after all streams were marked done.", failure_type=FailureType.system_error, ) + if is_done and self._active_groups: + raise AirbyteTracedException( + message="Active stream groups are not empty after all streams completed.", + internal_message=f"Groups {dict(self._active_groups)} still active after all streams were marked done.", + failure_type=FailureType.system_error, + ) if is_done and self._exceptions_per_stream_name: error_message = generate_failed_streams_error_message(self._exceptions_per_stream_name) self._logger.info(error_message) From e027b73b69860bbc47106d00f801a906b3368867 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 4 Mar 2026 19:38:37 +0000 Subject: [PATCH 25/30] test: add missing unit tests for GroupingPartitionRouter, active_groups check, and get_partition_router Co-Authored-By: unknown <> --- .../test_concurrent_declarative_source.py | 144 ++++++++++++++++++ .../test_concurrent_read_processor.py | 85 +++++++++++ 2 files changed, 229 insertions(+) diff --git a/unit_tests/sources/declarative/test_concurrent_declarative_source.py b/unit_tests/sources/declarative/test_concurrent_declarative_source.py index f1b4143a3..ca8e474f5 100644 --- a/unit_tests/sources/declarative/test_concurrent_declarative_source.py +++ b/unit_tests/sources/declarative/test_concurrent_declarative_source.py @@ -5351,3 +5351,147 @@ def test_apply_stream_groups_allows_parent_child_in_different_groups(): assert parent.block_simultaneous_read == "group_a" assert child.block_simultaneous_read == "group_b" + + +def _make_child_stream_with_grouping_router( + child_name: str, parent_stream: DefaultStream +) -> DefaultStream: + """Create a DefaultStream with GroupingPartitionRouter wrapping SubstreamPartitionRouter.""" + from airbyte_cdk.sources.declarative.incremental.concurrent_partition_cursor import ( + ConcurrentCursorFactory, + 
ConcurrentPerPartitionCursor, + ) + from airbyte_cdk.sources.declarative.partition_routers.grouping_partition_router import ( + GroupingPartitionRouter, + ) + from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import ( + ParentStreamConfig, + SubstreamPartitionRouter, + ) + from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import ( + DeclarativePartitionFactory, + StreamSlicerPartitionGenerator, + ) + from airbyte_cdk.sources.streams.concurrent.cursor import FinalStateCursor + from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import ( + EpochValueConcurrentStreamStateConverter, + ) + + substream_router = SubstreamPartitionRouter( + parent_stream_configs=[ + ParentStreamConfig( + stream=parent_stream, + parent_key="id", + partition_field="parent_id", + config={}, + parameters={}, + ) + ], + config={}, + parameters={}, + ) + + grouping_router = GroupingPartitionRouter( + group_size=10, + underlying_partition_router=substream_router, + config={}, + ) + + cursor_factory = ConcurrentCursorFactory(lambda *args, **kwargs: Mock()) + message_repository = InMemoryMessageRepository() + state_converter = EpochValueConcurrentStreamStateConverter() + + per_partition_cursor = ConcurrentPerPartitionCursor( + cursor_factory=cursor_factory, + partition_router=grouping_router, + stream_name=child_name, + stream_namespace=None, + stream_state={}, + message_repository=message_repository, + connector_state_manager=Mock(), + connector_state_converter=state_converter, + cursor_field=Mock(cursor_field_key="updated_at"), + ) + + partition_factory = Mock(spec=DeclarativePartitionFactory) + partition_generator = StreamSlicerPartitionGenerator( + partition_factory=partition_factory, + stream_slicer=per_partition_cursor, + ) + + cursor = FinalStateCursor( + stream_name=child_name, stream_namespace=None, message_repository=message_repository + ) + return DefaultStream( + 
partition_generator=partition_generator, + name=child_name, + json_schema={}, + primary_key=[], + cursor_field=None, + logger=logging.getLogger(f"test.{child_name}"), + cursor=cursor, + ) + + +def test_apply_stream_groups_raises_on_parent_child_in_same_group_with_grouping_router(): + """Test _apply_stream_groups detects deadlock when GroupingPartitionRouter wraps SubstreamPartitionRouter.""" + parent = _make_default_stream("parent_stream") + child = _make_child_stream_with_grouping_router("child_stream", parent) + + source = Mock() + source._source_config = { + "stream_groups": { + "my_group": { + "streams": [ + {"name": "parent_stream", "type": "DeclarativeStream"}, + {"name": "child_stream", "type": "DeclarativeStream"}, + ], + "action": {"type": "BlockSimultaneousSyncsAction"}, + } + } + } + + with pytest.raises(ValueError, match="child stream must not share a group with its parent"): + ConcurrentDeclarativeSource._apply_stream_groups(source, [parent, child]) + + +@pytest.mark.parametrize( + "stream_factory,expected_type", + [ + pytest.param( + lambda: _make_default_stream("plain_stream"), + type(None), + id="no_partition_router_returns_none", + ), + pytest.param( + lambda: _make_child_stream_with_parent("child", _make_default_stream("parent")), + "SubstreamPartitionRouter", + id="substream_returns_substream_router", + ), + pytest.param( + lambda: _make_child_stream_with_grouping_router( + "child", _make_default_stream("parent") + ), + "GroupingPartitionRouter", + id="grouping_returns_grouping_router", + ), + ], +) +def test_get_partition_router(stream_factory, expected_type): + """Test DefaultStream.get_partition_router returns the correct router type.""" + from airbyte_cdk.sources.declarative.partition_routers.grouping_partition_router import ( + GroupingPartitionRouter, + ) + from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import ( + SubstreamPartitionRouter, + ) + + stream = stream_factory() + router = 
stream.get_partition_router() + + if expected_type is type(None): + assert router is None + elif expected_type == "SubstreamPartitionRouter": + assert isinstance(router, SubstreamPartitionRouter) + elif expected_type == "GroupingPartitionRouter": + assert isinstance(router, GroupingPartitionRouter) diff --git a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py index cd4c32d41..ccccacf1e 100644 --- a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +++ b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py @@ -29,6 +29,9 @@ ) from airbyte_cdk.sources.concurrent_source.stream_thread_exception import StreamThreadException from airbyte_cdk.sources.concurrent_source.thread_pool_manager import ThreadPoolManager +from airbyte_cdk.sources.declarative.partition_routers.grouping_partition_router import ( + GroupingPartitionRouter, +) from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import ( SubstreamPartitionRouter, ) @@ -1491,3 +1494,85 @@ def test_is_done_raises_when_partition_generation_queue_not_empty(): with pytest.raises(AirbyteTracedException, match="remained in the partition generation queue"): handler.is_done() + + +def test_is_done_raises_when_active_groups_not_empty(): + """Test is_done raises AirbyteTracedException if active groups remain after all streams complete.""" + partition_enqueuer = Mock(spec=PartitionEnqueuer) + thread_pool_manager = Mock(spec=ThreadPoolManager) + logger = Mock(spec=logging.Logger) + slice_logger = Mock(spec=SliceLogger) + message_repository = Mock(spec=MessageRepository) + message_repository.consume_queue.return_value = [] + partition_reader = Mock(spec=PartitionReader) + + stream = Mock(spec=AbstractStream) + stream.name = "stuck_stream" + stream.block_simultaneous_read = "my_group" + stream.as_airbyte_stream.return_value = AirbyteStream( + name="stuck_stream", + 
json_schema={}, + supported_sync_modes=[SyncMode.full_refresh], + ) + + handler = ConcurrentReadProcessor( + [stream], + partition_enqueuer, + thread_pool_manager, + logger, + slice_logger, + message_repository, + partition_reader, + ) + + # Mark stream as done but leave the group active (simulating a bug) + handler._streams_done.add("stuck_stream") + handler._stream_instances_to_start_partition_generation.clear() + handler._active_groups["my_group"] = {"stuck_stream"} + + with pytest.raises( + AirbyteTracedException, match="still active after all streams were marked done" + ): + handler.is_done() + + +def test_collect_parent_stream_names_unwraps_grouping_partition_router(): + """Test _collect_all_parent_stream_names unwraps GroupingPartitionRouter to find parents.""" + partition_enqueuer = Mock(spec=PartitionEnqueuer) + thread_pool_manager = Mock(spec=ThreadPoolManager) + logger = Mock(spec=logging.Logger) + slice_logger = Mock(spec=SliceLogger) + message_repository = Mock(spec=MessageRepository) + message_repository.consume_queue.return_value = [] + partition_reader = Mock(spec=PartitionReader) + + parent_stream = Mock(spec=AbstractStream) + parent_stream.name = "parent" + parent_stream.block_simultaneous_read = "" + + # Child has a GroupingPartitionRouter wrapping a SubstreamPartitionRouter + child_stream = Mock(spec=DefaultStream) + child_stream.name = "child" + child_stream.block_simultaneous_read = "" + + mock_substream_router = Mock(spec=SubstreamPartitionRouter) + mock_parent_config = Mock() + mock_parent_config.stream = parent_stream + mock_substream_router.parent_stream_configs = [mock_parent_config] + + mock_grouping_router = Mock(spec=GroupingPartitionRouter) + mock_grouping_router.underlying_partition_router = mock_substream_router + child_stream.get_partition_router.return_value = mock_grouping_router + + handler = ConcurrentReadProcessor( + [parent_stream, child_stream], + partition_enqueuer, + thread_pool_manager, + logger, + slice_logger, + 
message_repository, + partition_reader, + ) + + parent_names = handler._collect_all_parent_stream_names("child") + assert parent_names == {"parent"} From 6f520732b4cc3ca2f6405b4023312bc7ae05f11f Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 4 Mar 2026 20:04:25 +0000 Subject: [PATCH 26/30] fix: make deadlock validation check all ancestors, not just direct parents Co-Authored-By: unknown <> --- .../concurrent_declarative_source.py | 33 +++++++++++++------ .../test_concurrent_declarative_source.py | 23 +++++++++++++ 2 files changed, 46 insertions(+), 10 deletions(-) diff --git a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py index 6d3b570ab..292615692 100644 --- a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py +++ b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py @@ -452,21 +452,34 @@ def _apply_stream_groups(self, streams: List[AbstractStream]) -> None: if stream_name: stream_name_to_group[stream_name] = group_name - # Validate no stream shares a group with its parent streams + # Validate no stream shares a group with any of its ancestor streams + stream_name_to_instance: Dict[str, AbstractStream] = {s.name: s for s in streams} + + def _collect_all_ancestor_names(stream_name: str) -> Set[str]: + """Recursively collect all ancestor stream names.""" + ancestors: Set[str] = set() + inst = stream_name_to_instance.get(stream_name) + if not isinstance(inst, DefaultStream): + return ancestors + router = inst.get_partition_router() + if isinstance(router, GroupingPartitionRouter): + router = router.underlying_partition_router + if not isinstance(router, SubstreamPartitionRouter): + return ancestors + for parent_config in router.parent_stream_configs: + parent_name = parent_config.stream.name + ancestors.add(parent_name) + ancestors.update(_collect_all_ancestor_names(parent_name)) + return 
ancestors + for stream in streams: if not isinstance(stream, DefaultStream) or stream.name not in stream_name_to_group: continue - partition_router = stream.get_partition_router() - if isinstance(partition_router, GroupingPartitionRouter): - partition_router = partition_router.underlying_partition_router - if not isinstance(partition_router, SubstreamPartitionRouter): - continue group_name = stream_name_to_group[stream.name] - for parent_config in partition_router.parent_stream_configs: - parent_name = parent_config.stream.name - if stream_name_to_group.get(parent_name) == group_name: + for ancestor_name in _collect_all_ancestor_names(stream.name): + if stream_name_to_group.get(ancestor_name) == group_name: raise ValueError( - f"Stream '{stream.name}' and its parent stream '{parent_name}' " + f"Stream '{stream.name}' and its parent stream '{ancestor_name}' " f"are both in group '{group_name}'. " f"A child stream must not share a group with its parent to avoid deadlock." ) diff --git a/unit_tests/sources/declarative/test_concurrent_declarative_source.py b/unit_tests/sources/declarative/test_concurrent_declarative_source.py index ca8e474f5..bf1f61610 100644 --- a/unit_tests/sources/declarative/test_concurrent_declarative_source.py +++ b/unit_tests/sources/declarative/test_concurrent_declarative_source.py @@ -5433,6 +5433,29 @@ def _make_child_stream_with_grouping_router( ) +def test_apply_stream_groups_raises_on_grandparent_child_in_same_group(): + """Test _apply_stream_groups detects deadlock when a grandchild and grandparent share a group.""" + grandparent = _make_default_stream("grandparent_stream") + parent = _make_child_stream_with_parent("parent_stream", grandparent) + child = _make_child_stream_with_parent("child_stream", parent) + + source = Mock() + source._source_config = { + "stream_groups": { + "my_group": { + "streams": [ + {"name": "grandparent_stream", "type": "DeclarativeStream"}, + {"name": "child_stream", "type": "DeclarativeStream"}, + ], + 
"action": {"type": "BlockSimultaneousSyncsAction"}, + } + } + } + + with pytest.raises(ValueError, match="child stream must not share a group with its parent"): + ConcurrentDeclarativeSource._apply_stream_groups(source, [grandparent, parent, child]) + + def test_apply_stream_groups_raises_on_parent_child_in_same_group_with_grouping_router(): """Test _apply_stream_groups detects deadlock when GroupingPartitionRouter wraps SubstreamPartitionRouter.""" parent = _make_default_stream("parent_stream") From e7acac830931b549a5db72ae788939e72dbf0afe Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Thu, 5 Mar 2026 10:07:57 +0000 Subject: [PATCH 27/30] style: alphabetize StreamGroup and BlockSimultaneousSyncsAction in schema definitions Co-Authored-By: unknown <> --- .../declarative_component_schema.yaml | 74 +++++++++---------- .../models/declarative_component_schema.py | 34 ++++----- 2 files changed, 54 insertions(+), 54 deletions(-) diff --git a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml index 0de656af4..da93af6c5 100644 --- a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml +++ b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml @@ -72,43 +72,6 @@ properties: description: A description of the connector. It will be presented on the Source documentation page. additionalProperties: false definitions: - StreamGroup: - title: Stream Group - description: > - A group of streams that share a common resource and should not be read simultaneously. - Streams in the same group will be blocked from concurrent reads based on the specified action. - type: object - required: - - streams - - action - properties: - streams: - title: Streams - description: > - List of references to streams that belong to this group. 
- type: array - items: - anyOf: - - "$ref": "#/definitions/DeclarativeStream" - action: - title: Action - description: The action to apply to streams in this group. - "$ref": "#/definitions/BlockSimultaneousSyncsAction" - BlockSimultaneousSyncsAction: - title: Block Simultaneous Syncs Action - description: > - Action that prevents streams in the same group from being read concurrently. - When applied to a stream group, streams with this action will be deferred if - another stream in the same group is currently active. - This is useful for APIs that don't allow concurrent access to the same - endpoint or session. Only applies to ConcurrentDeclarativeSource. - type: object - required: - - type - properties: - type: - type: string - enum: [BlockSimultaneousSyncsAction] AddedFieldDefinition: title: Definition Of Field To Add description: Defines the field to add on a record. @@ -312,6 +275,21 @@ definitions: $parameters: type: object additionalProperties: true + BlockSimultaneousSyncsAction: + title: Block Simultaneous Syncs Action + description: > + Action that prevents streams in the same group from being read concurrently. + When applied to a stream group, streams with this action will be deferred if + another stream in the same group is currently active. + This is useful for APIs that don't allow concurrent access to the same + endpoint or session. Only applies to ConcurrentDeclarativeSource. + type: object + required: + - type + properties: + type: + type: string + enum: [BlockSimultaneousSyncsAction] SelectiveAuthenticator: title: Selective Authenticator description: Authenticator that selects concrete authenticator based on config property. @@ -4237,6 +4215,28 @@ definitions: - "$ref": "#/definitions/ConfigRemoveFields" - "$ref": "#/definitions/CustomConfigTransformation" default: [] + StreamGroup: + title: Stream Group + description: > + A group of streams that share a common resource and should not be read simultaneously. 
+ Streams in the same group will be blocked from concurrent reads based on the specified action. + type: object + required: + - streams + - action + properties: + streams: + title: Streams + description: > + List of references to streams that belong to this group. + type: array + items: + anyOf: + - "$ref": "#/definitions/DeclarativeStream" + action: + title: Action + description: The action to apply to streams in this group. + "$ref": "#/definitions/BlockSimultaneousSyncsAction" SubstreamPartitionRouter: title: Substream Partition Router description: Partition router that is used to retrieve records that have been partitioned according to records from the specified parent streams. An example of a parent stream is automobile brands and the substream would be the various car models associated with each branch. diff --git a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py index 4175bf364..6083125a8 100644 --- a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py +++ b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py @@ -52,6 +52,10 @@ class BearerAuthenticator(BaseModel): parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters") +class BlockSimultaneousSyncsAction(BaseModel): + type: Literal["BlockSimultaneousSyncsAction"] + + class DynamicStreamCheckConfig(BaseModel): type: Literal["DynamicStreamCheckConfig"] dynamic_stream_name: str = Field( @@ -2354,23 +2358,6 @@ class Config: ) -class BlockSimultaneousSyncsAction(BaseModel): - type: Literal["BlockSimultaneousSyncsAction"] - - -class StreamGroup(BaseModel): - streams: List[str] = Field( - ..., - description='List of references to streams that belong to this group. 
Use JSON references to stream definitions (e.g., "#/definitions/my_stream").', - title="Streams", - ) - action: BlockSimultaneousSyncsAction = Field( - ..., - description="The action to apply to streams in this group.", - title="Action", - ) - - class Spec(BaseModel): type: Literal["Spec"] connection_specification: Dict[str, Any] = Field( @@ -3123,6 +3110,19 @@ class AsyncRetriever(BaseModel): parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters") +class StreamGroup(BaseModel): + streams: List[str] = Field( + ..., + description='List of references to streams that belong to this group. Use JSON references to stream definitions (e.g., "#/definitions/my_stream").', + title="Streams", + ) + action: BlockSimultaneousSyncsAction = Field( + ..., + description="The action to apply to streams in this group.", + title="Action", + ) + + class SubstreamPartitionRouter(BaseModel): type: Literal["SubstreamPartitionRouter"] parent_stream_configs: List[ParentStreamConfig] = Field( From b56b37682a54e17df95bff24390d2794fd84b04d Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Thu, 5 Mar 2026 10:16:12 +0000 Subject: [PATCH 28/30] style: move BlockSimultaneousSyncsAction next to StreamGroup for easier reading Co-Authored-By: unknown <> --- .../declarative_component_schema.yaml | 30 +++++++++---------- .../models/declarative_component_schema.py | 8 ++--- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml index da93af6c5..87e87ebcb 100644 --- a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml +++ b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml @@ -275,21 +275,6 @@ definitions: $parameters: type: object additionalProperties: true - BlockSimultaneousSyncsAction: - title: Block Simultaneous Syncs Action - description: > - Action that 
prevents streams in the same group from being read concurrently. - When applied to a stream group, streams with this action will be deferred if - another stream in the same group is currently active. - This is useful for APIs that don't allow concurrent access to the same - endpoint or session. Only applies to ConcurrentDeclarativeSource. - type: object - required: - - type - properties: - type: - type: string - enum: [BlockSimultaneousSyncsAction] SelectiveAuthenticator: title: Selective Authenticator description: Authenticator that selects concrete authenticator based on config property. @@ -4237,6 +4222,21 @@ definitions: title: Action description: The action to apply to streams in this group. "$ref": "#/definitions/BlockSimultaneousSyncsAction" + BlockSimultaneousSyncsAction: + title: Block Simultaneous Syncs Action + description: > + Action that prevents streams in the same group from being read concurrently. + When applied to a stream group, streams with this action will be deferred if + another stream in the same group is currently active. + This is useful for APIs that don't allow concurrent access to the same + endpoint or session. Only applies to ConcurrentDeclarativeSource. + type: object + required: + - type + properties: + type: + type: string + enum: [BlockSimultaneousSyncsAction] SubstreamPartitionRouter: title: Substream Partition Router description: Partition router that is used to retrieve records that have been partitioned according to records from the specified parent streams. An example of a parent stream is automobile brands and the substream would be the various car models associated with each branch. 
diff --git a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py index 6083125a8..bc884c490 100644 --- a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py +++ b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py @@ -52,10 +52,6 @@ class BearerAuthenticator(BaseModel): parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters") -class BlockSimultaneousSyncsAction(BaseModel): - type: Literal["BlockSimultaneousSyncsAction"] - - class DynamicStreamCheckConfig(BaseModel): type: Literal["DynamicStreamCheckConfig"] dynamic_stream_name: str = Field( @@ -3110,6 +3106,10 @@ class AsyncRetriever(BaseModel): parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters") +class BlockSimultaneousSyncsAction(BaseModel): + type: Literal["BlockSimultaneousSyncsAction"] + + class StreamGroup(BaseModel): streams: List[str] = Field( ..., From 055655a4f7c7e872eb3cdcd60a0ba4492c0c945b Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Thu, 19 Mar 2026 15:23:43 +0000 Subject: [PATCH 29/30] fix: update test regex to match actual error message after rebase Co-Authored-By: alfredo.garcia@airbyte.io --- .../streams/concurrent/test_concurrent_read_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py index ccccacf1e..a1523620a 100644 --- a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +++ b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py @@ -1492,7 +1492,7 @@ def test_is_done_raises_when_partition_generation_queue_not_empty(): # Artificially mark the stream as done without removing it from the partition generation queue handler._streams_done.add("stuck_stream") - with 
pytest.raises(AirbyteTracedException, match="remained in the partition generation queue"): + with pytest.raises(AirbyteTracedException, match="Partition generation queue is not empty after all streams completed"): handler.is_done() From 22ef57b50f0b0eb3254e3627b260b4ababf9b370 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Thu, 19 Mar 2026 15:38:26 +0000 Subject: [PATCH 30/30] fix: correct ruff formatting and update active_groups test regex to match actual error message Co-Authored-By: alfredo.garcia@airbyte.io --- .../streams/concurrent/test_concurrent_read_processor.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py index a1523620a..910111a05 100644 --- a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +++ b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py @@ -1492,7 +1492,10 @@ def test_is_done_raises_when_partition_generation_queue_not_empty(): # Artificially mark the stream as done without removing it from the partition generation queue handler._streams_done.add("stuck_stream") - with pytest.raises(AirbyteTracedException, match="Partition generation queue is not empty after all streams completed"): + with pytest.raises( + AirbyteTracedException, + match="Partition generation queue is not empty after all streams completed", + ): handler.is_done() @@ -1531,7 +1534,8 @@ def test_is_done_raises_when_active_groups_not_empty(): handler._active_groups["my_group"] = {"stuck_stream"} with pytest.raises( - AirbyteTracedException, match="still active after all streams were marked done" + AirbyteTracedException, + match="Active stream groups are not empty after all streams completed", ): handler.is_done()