From 52900d1caf089aea03aa63bac1f2a54d3d82da43 Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Tue, 30 Dec 2025 19:00:44 +0200 Subject: [PATCH 01/31] Add block_simultaneous_read to DefaultStream --- .../concurrent_read_processor.py | 161 +++++- .../concurrent_source/concurrent_source.py | 4 +- .../declarative_component_schema.yaml | 11 + .../models/declarative_component_schema.py | 5 + .../parsers/model_to_component_factory.py | 1 + .../streams/concurrent/abstract_stream.py | 11 + .../streams/concurrent/default_stream.py | 7 + .../test_model_to_component_factory.py | 138 ++++++ .../test_concurrent_read_processor.py | 469 ++++++++++++++++++ 9 files changed, 792 insertions(+), 15 deletions(-) diff --git a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py index 905999a4d..6873130c1 100644 --- a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +++ b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py @@ -66,17 +66,49 @@ def __init__( self._streams_done: Set[str] = set() self._exceptions_per_stream_name: dict[str, List[Exception]] = {} + # Track which streams (by name) are currently active + # A stream is "active" if it's generating partitions or has partitions being read + self._active_stream_names: Set[str] = set() + + # Store which streams require blocking simultaneous reads + self._stream_block_simultaneous_read: Dict[str, bool] = { + stream.name: stream.block_simultaneous_read for stream in stream_instances_to_read_from + } + + for stream in stream_instances_to_read_from: + if stream.block_simultaneous_read: + self._logger.info( + f"Stream '{stream.name}' has block_simultaneous_read=True. " + f"Will defer starting this stream if it or its parents are active." 
+ ) + def on_partition_generation_completed( self, sentinel: PartitionGenerationCompletedSentinel ) -> Iterable[AirbyteMessage]: """ This method is called when a partition generation is completed. 1. Remove the stream from the list of streams currently generating partitions - 2. If the stream is done, mark it as such and return a stream status message - 3. If there are more streams to read from, start the next partition generator + 2. Deactivate parent streams (they were only needed for partition generation) + 3. If the stream is done, mark it as such and return a stream status message + 4. If there are more streams to read from, start the next partition generator """ stream_name = sentinel.stream.name self._streams_currently_generating_partitions.remove(sentinel.stream.name) + + # Deactivate all parent streams now that partition generation is complete + # Parents were only needed to generate slices, they can now be reused + parent_streams = self._collect_all_parent_stream_names(stream_name) + for parent_stream_name in parent_streams: + if parent_stream_name in self._active_stream_names: + self._logger.debug(f"Removing '{parent_stream_name}' from active streams") + self._active_stream_names.discard(parent_stream_name) + if self._stream_block_simultaneous_read.get(parent_stream_name, False): + self._logger.info( + f"Parent stream '{parent_stream_name}' deactivated after " + f"partition generation completed for child '{stream_name}'. " + f"Blocked streams in the queue will be retried on next start_next_partition_generator call." + ) + # It is possible for the stream to already be done if no partitions were generated # If the partition generation process was completed and there are no partitions left to process, the stream is done if ( @@ -181,24 +213,81 @@ def _flag_exception(self, stream_name: str, exception: Exception) -> None: def start_next_partition_generator(self) -> Optional[AirbyteMessage]: """ - Start the next partition generator. - 1. 
Pop the next stream to read from - 2. Submit the partition generator to the thread pool manager - 3. Add the stream to the list of streams currently generating partitions - 4. Return a stream status message + Submits the next partition generator to the thread pool. + + A stream will be deferred (moved to end of queue) if: + 1. The stream itself has block_simultaneous_read=True AND is already active + 2. Any parent stream has block_simultaneous_read=True AND is currently active + + This prevents simultaneous reads of streams that shouldn't be accessed concurrently. + + :return: A status message if a partition generator was started, otherwise None """ - if self._stream_instances_to_start_partition_generation: + if not self._stream_instances_to_start_partition_generation: + return None + + # Remember initial queue size to avoid infinite loops if all streams are blocked + max_attempts = len(self._stream_instances_to_start_partition_generation) + attempts = 0 + + while self._stream_instances_to_start_partition_generation and attempts < max_attempts: + attempts += 1 + + # Pop the first stream from the queue stream = self._stream_instances_to_start_partition_generation.pop(0) + stream_name = stream.name + + # Check if this stream has block_simultaneous_read and is already active + if self._stream_block_simultaneous_read.get(stream_name, False) and stream_name in self._active_stream_names: + # Add back to the END of the queue for retry later + self._stream_instances_to_start_partition_generation.append(stream) + self._logger.info( + f"Deferring stream '{stream_name}' because it's already active " + f"(block_simultaneous_read=True). Trying next stream." 
+ ) + continue # Try the next stream in the queue + + # Check if any parent streams have block_simultaneous_read and are currently active + parent_streams = self._collect_all_parent_stream_names(stream_name) + blocked_by_parents = [ + p for p in parent_streams + if self._stream_block_simultaneous_read.get(p, False) and p in self._active_stream_names + ] + + if blocked_by_parents: + # Add back to the END of the queue for retry later + self._stream_instances_to_start_partition_generation.append(stream) + self._logger.info( + f"Deferring stream '{stream_name}' because parent stream(s) " + f"{blocked_by_parents} are active and have block_simultaneous_read=True. Trying next stream." + ) + continue # Try the next stream in the queue + + # No blocking - start this stream + # Mark stream as active before starting + self._active_stream_names.add(stream_name) + self._streams_currently_generating_partitions.append(stream_name) + + # Also mark all parent streams as active (they will be read from during partition generation) + parent_streams = self._collect_all_parent_stream_names(stream_name) + for parent_stream_name in parent_streams: + if self._stream_block_simultaneous_read.get(parent_stream_name, False): + self._active_stream_names.add(parent_stream_name) + self._logger.info( + f"Marking parent stream '{parent_stream_name}' as active " + f"(will be read during partition generation for '{stream_name}')" + ) + self._thread_pool_manager.submit(self._partition_enqueuer.generate_partitions, stream) - self._streams_currently_generating_partitions.append(stream.name) - self._logger.info(f"Marking stream {stream.name} as STARTED") - self._logger.info(f"Syncing stream: {stream.name} ") + self._logger.info(f"Marking stream {stream_name} as STARTED") + self._logger.info(f"Syncing stream: {stream_name}") return stream_status_as_airbyte_message( stream.as_airbyte_stream(), AirbyteStreamStatus.STARTED, ) - else: - return None + + # All streams in the queue are currently blocked + return 
None def is_done(self) -> bool: """ @@ -230,6 +319,43 @@ def is_done(self) -> bool: def _is_stream_done(self, stream_name: str) -> bool: return stream_name in self._streams_done + def _collect_all_parent_stream_names(self, stream_name: str) -> Set[str]: + """ + Recursively collect all parent stream names for a given stream. + For example, if we have: epics -> issues -> comments + Then for comments, this returns {issues, epics} + + :param stream_name: The stream to collect parents for + :return: Set of all parent stream names (recursively) + """ + parent_names: Set[str] = set() + stream = self._stream_name_to_instance.get(stream_name) + + if not stream: + return parent_names + + # Get partition router if it exists (this is where parent streams are defined) + partition_router = None + + # Try DefaultStream path first (_stream_partition_generator._stream_slicer._partition_router) + if hasattr(stream, "_stream_partition_generator") and hasattr(stream._stream_partition_generator, "_stream_slicer") and hasattr(stream._stream_partition_generator._stream_slicer, "_partition_router"): + partition_router = stream._stream_partition_generator._stream_slicer._partition_router + # Fallback to legacy path (retriever.partition_router) for backward compatibility and test mocks + elif hasattr(stream, "retriever") and hasattr(stream.retriever, "partition_router"): + partition_router = stream.retriever.partition_router + + # SubstreamPartitionRouter has parent_stream_configs + if partition_router and hasattr(partition_router, "parent_stream_configs"): + for parent_config in partition_router.parent_stream_configs: + parent_stream = parent_config.stream + parent_name = parent_stream.name + parent_names.add(parent_name) + + # Recursively collect grandparents, great-grandparents, etc. 
+ parent_names.update(self._collect_all_parent_stream_names(parent_name)) + + return parent_names + def _on_stream_is_done(self, stream_name: str) -> Iterable[AirbyteMessage]: self._logger.info( f"Read {self._record_counter[stream_name]} records from {stream_name} stream" @@ -246,3 +372,12 @@ def _on_stream_is_done(self, stream_name: str) -> Iterable[AirbyteMessage]: else AirbyteStreamStatus.COMPLETE ) yield stream_status_as_airbyte_message(stream.as_airbyte_stream(), stream_status) + + # Remove only this stream from active set (NOT parents) + if stream_name in self._active_stream_names: + self._active_stream_names.discard(stream_name) + if self._stream_block_simultaneous_read.get(stream_name, False): + self._logger.info( + f"Stream '{stream_name}' is no longer active. " + f"Blocked streams in the queue will be retried on next start_next_partition_generator call." + ) diff --git a/airbyte_cdk/sources/concurrent_source/concurrent_source.py b/airbyte_cdk/sources/concurrent_source/concurrent_source.py index de2d93523..241282fa5 100644 --- a/airbyte_cdk/sources/concurrent_source/concurrent_source.py +++ b/airbyte_cdk/sources/concurrent_source/concurrent_source.py @@ -77,7 +77,7 @@ def __init__( slice_logger: SliceLogger = DebugSliceLogger(), queue: Optional[Queue[QueueItem]] = None, message_repository: MessageRepository = InMemoryMessageRepository(), - initial_number_partitions_to_generate: int = 1, + initial_number_partitions_to_generate: int = 10, timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS, ) -> None: """ @@ -92,7 +92,7 @@ def __init__( self._logger = logger self._slice_logger = slice_logger self._message_repository = message_repository - self._initial_number_partitions_to_generate = initial_number_partitions_to_generate + self._initial_number_partitions_to_generate = initial_number_partitions_to_generate self._timeout_seconds = timeout_seconds # We set a maxsize to for the main thread to process record items when the queue size grows.
This assumes that there are less diff --git a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml index 84aaa6c53..27338db92 100644 --- a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml +++ b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml @@ -1560,6 +1560,17 @@ definitions: default: "" example: - "Users" + block_simultaneous_read: + title: Block Simultaneous Read + description: > + When true, prevents simultaneous reading of this stream from multiple contexts + (e.g., as both a parent stream and a standalone stream). If the stream OR any + of its parent streams are currently active, this stream will be deferred until + they finish. This is useful for APIs that don't allow concurrent access to the + same endpoint. Default is false for backward compatibility. + Only applies to ConcurrentDeclarativeSource. + type: boolean + default: false retriever: title: Retriever description: Component used to coordinate how records are extracted across stream slices and request pages. diff --git a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py index 5d2f0521f..78a69c4f2 100644 --- a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py +++ b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py @@ -2509,6 +2509,11 @@ class Config: type: Literal["DeclarativeStream"] name: Optional[str] = Field("", description="The stream name.", example=["Users"], title="Name") + block_simultaneous_read: Optional[bool] = Field( + False, + description="When true, prevents simultaneous reading of this stream from multiple contexts (e.g., as both a parent stream and a standalone stream). If the stream OR any of its parent streams are currently active, this stream will be deferred until they finish. 
This is useful for APIs that don't allow concurrent access to the same endpoint. Default is false for backward compatibility. Only applies to ConcurrentDeclarativeSource.\n", + title="Block Simultaneous Read", + ) retriever: Union[SimpleRetriever, AsyncRetriever, CustomRetriever] = Field( ..., description="Component used to coordinate how records are extracted across stream slices and request pages.", diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 2bd7d268d..3a0f0c296 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -2139,6 +2139,7 @@ def create_default_stream( logger=logging.getLogger(f"airbyte.{stream_name}"), cursor=concurrent_cursor, supports_file_transfer=hasattr(model, "file_uploader") and bool(model.file_uploader), + block_simultaneous_read=model.block_simultaneous_read or False, ) def _migrate_state(self, model: DeclarativeStreamModel, config: Config) -> None: diff --git a/airbyte_cdk/sources/streams/concurrent/abstract_stream.py b/airbyte_cdk/sources/streams/concurrent/abstract_stream.py index 667d088ab..0052eafa3 100644 --- a/airbyte_cdk/sources/streams/concurrent/abstract_stream.py +++ b/airbyte_cdk/sources/streams/concurrent/abstract_stream.py @@ -85,6 +85,17 @@ def cursor(self) -> Cursor: :return: The cursor associated with this stream. """ + @property + def block_simultaneous_read(self) -> bool: + """ + Override to return True if this stream should block simultaneous reads. + When True, prevents starting partition generation for this stream if it + OR any of its parent streams are already active. 
+ + :return: True if simultaneous reads should be blocked, False otherwise + """ + return False # Default: allow concurrent reading + @abstractmethod def check_availability(self) -> StreamAvailability: """ diff --git a/airbyte_cdk/sources/streams/concurrent/default_stream.py b/airbyte_cdk/sources/streams/concurrent/default_stream.py index f5d4ccf2e..84d955105 100644 --- a/airbyte_cdk/sources/streams/concurrent/default_stream.py +++ b/airbyte_cdk/sources/streams/concurrent/default_stream.py @@ -26,6 +26,7 @@ def __init__( cursor: Cursor, namespace: Optional[str] = None, supports_file_transfer: bool = False, + block_simultaneous_read: bool = False, ) -> None: self._stream_partition_generator = partition_generator self._name = name @@ -36,6 +37,7 @@ def __init__( self._cursor = cursor self._namespace = namespace self._supports_file_transfer = supports_file_transfer + self._block_simultaneous_read = block_simultaneous_read def generate_partitions(self) -> Iterable[Partition]: yield from self._stream_partition_generator.generate() @@ -94,6 +96,11 @@ def log_stream_sync_configuration(self) -> None: def cursor(self) -> Cursor: return self._cursor + @property + def block_simultaneous_read(self) -> bool: + """Returns whether this stream should block simultaneous reads""" + return self._block_simultaneous_read + def check_availability(self) -> StreamAvailability: """ Check stream availability by attempting to read the first record of the stream. 
diff --git a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py index a5da89e4b..fcb71c8c6 100644 --- a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +++ b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py @@ -5281,6 +5281,144 @@ def test_catalog_defined_cursor_field_stream_missing(): assert stream._cursor_field.supports_catalog_defined_cursor_field == True +def test_block_simultaneous_read_from_manifest(): + """Test that block_simultaneous_read flows through from manifest to DefaultStream""" + content = """ + parent_stream: + type: DeclarativeStream + name: "parent" + primary_key: "id" + block_simultaneous_read: true + retriever: + type: SimpleRetriever + requester: + type: HttpRequester + url_base: "https://api.example.com" + path: "/parent" + http_method: "GET" + authenticator: + type: BearerAuthenticator + api_token: "{{ config['api_key'] }}" + record_selector: + type: RecordSelector + extractor: + type: DpathExtractor + field_path: [] + schema_loader: + type: InlineSchemaLoader + schema: + type: object + properties: + id: + type: string + + child_stream: + type: DeclarativeStream + name: "child" + primary_key: "id" + block_simultaneous_read: true + retriever: + type: SimpleRetriever + requester: + type: HttpRequester + url_base: "https://api.example.com" + path: "/child" + http_method: "GET" + authenticator: + type: BearerAuthenticator + api_token: "{{ config['api_key'] }}" + record_selector: + type: RecordSelector + extractor: + type: DpathExtractor + field_path: [] + partition_router: + type: SubstreamPartitionRouter + parent_stream_configs: + - type: ParentStreamConfig + stream: "#/parent_stream" + parent_key: "id" + partition_field: "parent_id" + schema_loader: + type: InlineSchemaLoader + schema: + type: object + properties: + id: + type: string + parent_id: + type: string + + no_block_stream: + type: 
DeclarativeStream + name: "no_block" + primary_key: "id" + retriever: + type: SimpleRetriever + requester: + type: HttpRequester + url_base: "https://api.example.com" + path: "/no_block" + http_method: "GET" + authenticator: + type: BearerAuthenticator + api_token: "{{ config['api_key'] }}" + record_selector: + type: RecordSelector + extractor: + type: DpathExtractor + field_path: [] + schema_loader: + type: InlineSchemaLoader + schema: + type: object + properties: + id: + type: string + """ + + config = {"api_key": "test_key"} + + parsed_manifest = YamlDeclarativeSource._parse(content) + resolved_manifest = resolver.preprocess_manifest(parsed_manifest) + + # Test parent stream with block_simultaneous_read: true + parent_manifest = transformer.propagate_types_and_parameters( + "", resolved_manifest["parent_stream"], {} + ) + parent_stream: DefaultStream = factory.create_component( + model_type=DeclarativeStreamModel, component_definition=parent_manifest, config=config + ) + + assert isinstance(parent_stream, DefaultStream) + assert parent_stream.name == "parent" + assert parent_stream.block_simultaneous_read is True + + # Test child stream with block_simultaneous_read: true + child_manifest = transformer.propagate_types_and_parameters( + "", resolved_manifest["child_stream"], {} + ) + child_stream: DefaultStream = factory.create_component( + model_type=DeclarativeStreamModel, component_definition=child_manifest, config=config + ) + + assert isinstance(child_stream, DefaultStream) + assert child_stream.name == "child" + assert child_stream.block_simultaneous_read is True + + # Test stream without block_simultaneous_read (should default to False) + no_block_manifest = transformer.propagate_types_and_parameters( + "", resolved_manifest["no_block_stream"], {} + ) + no_block_stream: DefaultStream = factory.create_component( + model_type=DeclarativeStreamModel, component_definition=no_block_manifest, config=config + ) + + assert isinstance(no_block_stream, DefaultStream) 
+ assert no_block_stream.name == "no_block" + assert no_block_stream.block_simultaneous_read is False + + def get_schema_loader(stream: DefaultStream): assert isinstance( stream._stream_partition_generator._partition_factory._schema_loader, diff --git a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py index a681f75eb..f1fad0799 100644 --- a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +++ b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py @@ -792,3 +792,472 @@ def test_start_next_partition_generator(self): self._thread_pool_manager.submit.assert_called_with( self._partition_enqueuer.generate_partitions, self._stream ) + + +class TestBlockSimultaneousRead(unittest.TestCase): + """Tests for block_simultaneous_read functionality""" + + def setUp(self): + self._partition_enqueuer = Mock(spec=PartitionEnqueuer) + self._thread_pool_manager = Mock(spec=ThreadPoolManager) + self._logger = Mock(spec=logging.Logger) + self._slice_logger = Mock(spec=SliceLogger) + self._message_repository = Mock(spec=MessageRepository) + self._message_repository.consume_queue.return_value = [] + self._partition_reader = Mock(spec=PartitionReader) + + def _create_mock_stream(self, name: str, block_simultaneous_read: bool = False): + """Helper to create a mock stream""" + stream = Mock(spec=AbstractStream) + stream.name = name + stream.block_simultaneous_read = block_simultaneous_read + stream.as_airbyte_stream.return_value = AirbyteStream( + name=name, + json_schema={}, + supported_sync_modes=[SyncMode.full_refresh], + ) + stream.cursor.ensure_at_least_one_state_emitted = Mock() + return stream + + def _create_mock_stream_with_parent( + self, name: str, parent_stream, block_simultaneous_read: bool = False + ): + """Helper to create a mock stream with a parent stream""" + stream = self._create_mock_stream(name, block_simultaneous_read) + + # Mock the 
retriever and partition router for parent relationship + mock_retriever = Mock() + mock_partition_router = Mock() + mock_parent_config = Mock() + mock_parent_config.stream = parent_stream + + mock_partition_router.parent_stream_configs = [mock_parent_config] + mock_retriever.partition_router = mock_partition_router + stream.retriever = mock_retriever + + return stream + + def test_defer_stream_when_self_active(self): + """Test that a stream is deferred when it's already active""" + stream = self._create_mock_stream("stream1", block_simultaneous_read=True) + + handler = ConcurrentReadProcessor( + [stream], + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + # Mark stream as active + handler._active_stream_names.add("stream1") + + # Try to start the stream again + result = handler.start_next_partition_generator() + + # Should return None (no stream started) + assert result is None + + # Stream should be back in the queue + assert len(handler._stream_instances_to_start_partition_generation) == 1 + assert handler._stream_instances_to_start_partition_generation[0] == stream + + # Logger should have been called to log deferral + assert any( + "Deferring stream 'stream1' because it's already active" in str(call) + for call in self._logger.info.call_args_list + ) + + def test_defer_stream_when_parent_active(self): + """Test that a stream is deferred when its parent is active""" + parent_stream = self._create_mock_stream("parent", block_simultaneous_read=True) + child_stream = self._create_mock_stream_with_parent( + "child", parent_stream, block_simultaneous_read=True + ) + + handler = ConcurrentReadProcessor( + [parent_stream, child_stream], + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + # Mark parent as active + handler._active_stream_names.add("parent") + + # 
Remove parent from queue (simulate it's already started) + handler._stream_instances_to_start_partition_generation = [child_stream] + + # Try to start child + result = handler.start_next_partition_generator() + + # Should return None (child deferred) + assert result is None + + # Child should be back in the queue + assert len(handler._stream_instances_to_start_partition_generation) == 1 + assert handler._stream_instances_to_start_partition_generation[0] == child_stream + + # Logger should have been called + assert any( + "Deferring stream 'child' because parent stream(s)" in str(call) + for call in self._logger.info.call_args_list + ) + + def test_defer_stream_when_grandparent_active(self): + """Test that a stream is deferred when its grandparent is active""" + grandparent = self._create_mock_stream("grandparent", block_simultaneous_read=True) + parent = self._create_mock_stream_with_parent( + "parent", grandparent, block_simultaneous_read=True + ) + child = self._create_mock_stream_with_parent("child", parent, block_simultaneous_read=True) + + handler = ConcurrentReadProcessor( + [grandparent, parent, child], + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + # Mark grandparent as active + handler._active_stream_names.add("grandparent") + + # Only child in queue + handler._stream_instances_to_start_partition_generation = [child] + + # Try to start child + result = handler.start_next_partition_generator() + + # Should return None (child deferred because grandparent is active) + assert result is None + + # Child should be back in the queue + assert len(handler._stream_instances_to_start_partition_generation) == 1 + + def test_retry_blocked_stream_after_blocker_done(self): + """Test that blocked stream is retried after blocker finishes""" + stream1 = self._create_mock_stream("stream1", block_simultaneous_read=True) + stream2 = self._create_mock_stream("stream2", 
block_simultaneous_read=True) + + handler = ConcurrentReadProcessor( + [stream1, stream2], + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + # Start stream1 + handler.start_next_partition_generator() + assert "stream1" in handler._active_stream_names + + # stream1 and stream2 are not parent/child, so an active stream1 never blocks stream2. + # To exercise the deferral path, re-queue both streams while stream1 is still active: + # stream1 must be deferred (it is already active) and stream2 must start in its place. + + # Mark stream1 as active to simulate it's running + handler._active_stream_names.add("stream1") + handler._stream_instances_to_start_partition_generation = [stream1, stream2] + + # Try to start stream1 again (should be deferred because already active) + result = handler.start_next_partition_generator() + + # Should start stream2 instead (stream1 was deferred) + assert result is not None + assert "stream2" in handler._active_stream_names + assert len(handler._stream_instances_to_start_partition_generation) == 1 + + def test_retry_blocked_stream_after_partition_generation(self): + """Test that blocked stream is retried after partition generation completes""" + parent = self._create_mock_stream("parent", block_simultaneous_read=True) + child = self._create_mock_stream_with_parent("child", parent, block_simultaneous_read=True) + + handler = ConcurrentReadProcessor( + [parent, child], + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + # Start parent + handler.start_next_partition_generator() + assert "parent" in handler._active_stream_names + + # Mark parent as generating partitions and having no partitions + handler._streams_currently_generating_partitions.append("parent") + handler._streams_to_running_partitions["parent"] = set() + + # Complete partition generation for parent (parent has no partitions, so
it's done) + sentinel = PartitionGenerationCompletedSentinel(parent) + messages = list(handler.on_partition_generation_completed(sentinel)) + + # Child should have been started automatically by on_partition_generation_completed + # (it calls start_next_partition_generator internally) + assert "child" in handler._active_stream_names + + # Parent should be RE-ACTIVATED because child needs to read from it during partition generation + # This is the correct behavior - prevents simultaneous reads of parent + assert "parent" in handler._active_stream_names + + # Verify the queue is now empty (both streams were started) + assert len(handler._stream_instances_to_start_partition_generation) == 0 + + def test_blocked_stream_added_to_end_of_queue(self): + """Test that blocked streams are added to the end of the queue""" + stream1 = self._create_mock_stream("stream1", block_simultaneous_read=True) + stream2 = self._create_mock_stream("stream2", block_simultaneous_read=False) + stream3 = self._create_mock_stream("stream3", block_simultaneous_read=False) + + handler = ConcurrentReadProcessor( + [stream1, stream2, stream3], + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + # Mark stream1 as active + handler._active_stream_names.add("stream1") + + # Try to start streams in order: stream1, stream2, stream3 + result1 = handler.start_next_partition_generator() + + # stream1 should be deferred, stream2 should start + assert result1 is not None + assert "stream2" in handler._active_stream_names + + # Queue should now be [stream3, stream1] (stream1 moved to end) + assert len(handler._stream_instances_to_start_partition_generation) == 2 + assert handler._stream_instances_to_start_partition_generation[0] == stream3 + assert handler._stream_instances_to_start_partition_generation[1] == stream1 + + def test_no_defer_when_flag_false(self): + """Test that blocking doesn't occur when 
block_simultaneous_read=False""" + stream = self._create_mock_stream("stream1", block_simultaneous_read=False) + + handler = ConcurrentReadProcessor( + [stream], + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + # Mark stream as active + handler._active_stream_names.add("stream1") + + # Try to start the stream again (should succeed because flag is False) + result = handler.start_next_partition_generator() + + # Should return a status message (stream started) + assert result is not None + assert isinstance(result, AirbyteMessage) + + # Queue should be empty + assert len(handler._stream_instances_to_start_partition_generation) == 0 + + def test_collect_parent_streams_multi_level(self): + """Test that _collect_all_parent_stream_names works recursively""" + grandparent = self._create_mock_stream("grandparent") + parent = self._create_mock_stream_with_parent("parent", grandparent) + child = self._create_mock_stream_with_parent("child", parent) + + handler = ConcurrentReadProcessor( + [grandparent, parent, child], + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + # Collect parents for child + parents = handler._collect_all_parent_stream_names("child") + + # Should include both parent and grandparent + assert "parent" in parents + assert "grandparent" in parents + assert len(parents) == 2 + + def test_deactivate_parents_when_partition_generation_completes(self): + """Test that parent streams are deactivated when partition generation completes""" + parent = self._create_mock_stream("parent", block_simultaneous_read=True) + child = self._create_mock_stream_with_parent("child", parent, block_simultaneous_read=True) + + handler = ConcurrentReadProcessor( + [parent, child], + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, 
+ self._message_repository, + self._partition_reader, + ) + + # Manually mark both as active (simulating partition generation for child) + handler._active_stream_names.add("parent") + handler._active_stream_names.add("child") + handler._streams_currently_generating_partitions.append("child") + + # Ensure child has running partitions (so it doesn't trigger _on_stream_is_done) + mock_partition = Mock(spec=Partition) + mock_partition.stream_name.return_value = "child" + handler._streams_to_running_partitions["child"] = {mock_partition} + + # Remove both streams from the queue so start_next_partition_generator doesn't start them + # This simulates the scenario where both streams have already been started + handler._stream_instances_to_start_partition_generation = [] + + # Complete partition generation for child + sentinel = PartitionGenerationCompletedSentinel(child) + + list(handler.on_partition_generation_completed(sentinel)) + + # Parent should be deactivated (it was only needed for partition generation) + assert "parent" not in handler._active_stream_names + + # Child should still be active (it's reading records) + assert "child" in handler._active_stream_names + + def test_deactivate_only_stream_when_done(self): + """Test that only the stream itself is deactivated when done, not parents""" + parent = self._create_mock_stream("parent", block_simultaneous_read=True) + child = self._create_mock_stream_with_parent("child", parent, block_simultaneous_read=True) + + handler = ConcurrentReadProcessor( + [parent, child], + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + # Mark both as active + handler._active_stream_names.add("parent") + handler._active_stream_names.add("child") + + # Start child and mark it as done + handler._stream_instances_to_start_partition_generation = [] + handler._streams_currently_generating_partitions = [] + 
handler._streams_to_running_partitions["child"] = set() + + # Call _on_stream_is_done for child + list(handler._on_stream_is_done("child")) + + # Child should be deactivated + assert "child" not in handler._active_stream_names + + # Parent should still be active (not deactivated) + assert "parent" in handler._active_stream_names + + def test_multiple_blocked_streams_retry_in_order(self): + """Test that multiple blocked streams are retried in order""" + parent = self._create_mock_stream("parent", block_simultaneous_read=True) + child1 = self._create_mock_stream_with_parent( + "child1", parent, block_simultaneous_read=True + ) + child2 = self._create_mock_stream_with_parent( + "child2", parent, block_simultaneous_read=True + ) + + handler = ConcurrentReadProcessor( + [parent, child1, child2], + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + # Start parent + result = handler.start_next_partition_generator() + assert result is not None + assert "parent" in handler._active_stream_names + + # Try to start child1 (should be deferred) + result = handler.start_next_partition_generator() + # child1 is deferred, but child2 might start if it's not blocked + # Let me check the queue state + + # Both children should be deferred (parent is active) + assert len(handler._stream_instances_to_start_partition_generation) >= 1 + + def test_child_without_flag_blocked_by_parent_with_flag(self): + """Test that a child WITHOUT block_simultaneous_read is blocked by parent WITH the flag""" + # Parent has the flag, child does NOT + parent = self._create_mock_stream("parent", block_simultaneous_read=True) + child = self._create_mock_stream_with_parent("child", parent, block_simultaneous_read=False) + + handler = ConcurrentReadProcessor( + [parent, child], + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + 
self._partition_reader, + ) + + # Mark parent as active and already started (remove from queue) + handler._active_stream_names.add("parent") + handler._stream_instances_to_start_partition_generation.remove(parent) + + # Try to start child (should be deferred even though child doesn't have the flag) + result = handler.start_next_partition_generator() + + # Child should be deferred because parent has block_simultaneous_read=True and is active + assert result is None # No stream started + assert "child" not in handler._active_stream_names + # Child should be moved to end of queue (still 1 stream in queue) + assert len(handler._stream_instances_to_start_partition_generation) == 1 + assert handler._stream_instances_to_start_partition_generation[0] == child + + def test_child_with_flag_not_blocked_by_parent_without_flag(self): + """Test that a child WITH block_simultaneous_read is NOT blocked by parent WITHOUT the flag""" + # Parent does NOT have the flag, child does + parent = self._create_mock_stream("parent", block_simultaneous_read=False) + child = self._create_mock_stream_with_parent("child", parent, block_simultaneous_read=True) + + handler = ConcurrentReadProcessor( + [parent, child], + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + # Mark parent as active and already started (remove from queue) + handler._active_stream_names.add("parent") + handler._stream_instances_to_start_partition_generation.remove(parent) + + # Try to start child (should succeed even though parent is active) + result = handler.start_next_partition_generator() + + # Child should start successfully because parent doesn't have block_simultaneous_read + assert result is not None # Stream started + assert "child" in handler._active_stream_names + # Queue should now be empty (both streams started) + assert len(handler._stream_instances_to_start_partition_generation) == 0 From 
0b8561e4f6588451071887561ee4d6e65d8a4c9c Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Tue, 6 Jan 2026 17:16:31 +0200 Subject: [PATCH 02/31] Change `block_simultaneous_read` to string --- .../concurrent_read_processor.py | 92 ++++++++++--- .../concurrent_source/concurrent_source.py | 4 +- .../declarative_component_schema.yaml | 25 ++-- .../models/declarative_component_schema.py | 8 +- .../parsers/model_to_component_factory.py | 2 +- .../streams/concurrent/abstract_stream.py | 16 ++- .../streams/concurrent/default_stream.py | 6 +- .../test_model_to_component_factory.py | 14 +- .../test_concurrent_read_processor.py | 127 ++++++++++++------ 9 files changed, 201 insertions(+), 93 deletions(-) diff --git a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py index 6873130c1..1901d0d0d 100644 --- a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +++ b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py @@ -70,16 +70,21 @@ def __init__( # A stream is "active" if it's generating partitions or has partitions being read self._active_stream_names: Set[str] = set() - # Store which streams require blocking simultaneous reads - self._stream_block_simultaneous_read: Dict[str, bool] = { + # Store blocking group names for streams that require blocking simultaneous reads + # Maps stream name -> group name (empty string means no blocking) + self._stream_block_simultaneous_read: Dict[str, str] = { stream.name: stream.block_simultaneous_read for stream in stream_instances_to_read_from } + # Track which groups are currently active + # Maps group name -> set of stream names in that group + self._active_groups: Dict[str, Set[str]] = {} + for stream in stream_instances_to_read_from: if stream.block_simultaneous_read: self._logger.info( - f"Stream '{stream.name}' has block_simultaneous_read=True. " - f"Will defer starting this stream if it or its parents are active." 
+ f"Stream '{stream.name}' is in blocking group '{stream.block_simultaneous_read}'. " + f"Will defer starting this stream if another stream in the same group or its parents are active." ) def on_partition_generation_completed( @@ -102,9 +107,16 @@ def on_partition_generation_completed( if parent_stream_name in self._active_stream_names: self._logger.debug(f"Removing '{parent_stream_name}' from active streams") self._active_stream_names.discard(parent_stream_name) - if self._stream_block_simultaneous_read.get(parent_stream_name, False): + + # Remove from active groups + parent_group = self._stream_block_simultaneous_read.get(parent_stream_name, "") + if parent_group: + if parent_group in self._active_groups: + self._active_groups[parent_group].discard(parent_stream_name) + if not self._active_groups[parent_group]: + del self._active_groups[parent_group] self._logger.info( - f"Parent stream '{parent_stream_name}' deactivated after " + f"Parent stream '{parent_stream_name}' (group '{parent_group}') deactivated after " f"partition generation completed for child '{stream_name}'. " f"Blocked streams in the queue will be retried on next start_next_partition_generator call." ) @@ -236,30 +248,50 @@ def start_next_partition_generator(self) -> Optional[AirbyteMessage]: # Pop the first stream from the queue stream = self._stream_instances_to_start_partition_generation.pop(0) stream_name = stream.name + stream_group = self._stream_block_simultaneous_read.get(stream_name, "") + + # Check if this stream has a blocking group and is already active + if stream_group and stream_name in self._active_stream_names: + # Add back to the END of the queue for retry later + self._stream_instances_to_start_partition_generation.append(stream) + self._logger.info( + f"Deferring stream '{stream_name}' (group '{stream_group}') because it's already active. Trying next stream." 
+ ) + continue # Try the next stream in the queue - # Check if this stream has block_simultaneous_read and is already active - if self._stream_block_simultaneous_read.get(stream_name, False) and stream_name in self._active_stream_names: + # Check if this stream's group is already active (another stream in the same group is running) + if ( + stream_group + and stream_group in self._active_groups + and self._active_groups[stream_group] + ): # Add back to the END of the queue for retry later self._stream_instances_to_start_partition_generation.append(stream) + active_streams_in_group = self._active_groups[stream_group] self._logger.info( - f"Deferring stream '{stream_name}' because it's already active " - f"(block_simultaneous_read=True). Trying next stream." + f"Deferring stream '{stream_name}' (group '{stream_group}') because other stream(s) " + f"{active_streams_in_group} in the same group are active. Trying next stream." ) continue # Try the next stream in the queue - # Check if any parent streams have block_simultaneous_read and are currently active + # Check if any parent streams have a blocking group and are currently active parent_streams = self._collect_all_parent_stream_names(stream_name) blocked_by_parents = [ - p for p in parent_streams - if self._stream_block_simultaneous_read.get(p, False) and p in self._active_stream_names + p + for p in parent_streams + if self._stream_block_simultaneous_read.get(p, "") + and p in self._active_stream_names ] if blocked_by_parents: # Add back to the END of the queue for retry later self._stream_instances_to_start_partition_generation.append(stream) + parent_groups = { + self._stream_block_simultaneous_read.get(p, "") for p in blocked_by_parents + } self._logger.info( f"Deferring stream '{stream_name}' because parent stream(s) " - f"{blocked_by_parents} are active and have block_simultaneous_read=True. Trying next stream." + f"{blocked_by_parents} (groups {parent_groups}) are active. Trying next stream." 
) continue # Try the next stream in the queue @@ -268,13 +300,24 @@ def start_next_partition_generator(self) -> Optional[AirbyteMessage]: self._active_stream_names.add(stream_name) self._streams_currently_generating_partitions.append(stream_name) + # Track this stream in its group if it has one + if stream_group: + if stream_group not in self._active_groups: + self._active_groups[stream_group] = set() + self._active_groups[stream_group].add(stream_name) + self._logger.debug(f"Added '{stream_name}' to active group '{stream_group}'") + # Also mark all parent streams as active (they will be read from during partition generation) parent_streams = self._collect_all_parent_stream_names(stream_name) for parent_stream_name in parent_streams: - if self._stream_block_simultaneous_read.get(parent_stream_name, False): + parent_group = self._stream_block_simultaneous_read.get(parent_stream_name, "") + if parent_group: self._active_stream_names.add(parent_stream_name) + if parent_group not in self._active_groups: + self._active_groups[parent_group] = set() + self._active_groups[parent_group].add(parent_stream_name) self._logger.info( - f"Marking parent stream '{parent_stream_name}' as active " + f"Marking parent stream '{parent_stream_name}' (group '{parent_group}') as active " f"(will be read during partition generation for '{stream_name}')" ) @@ -338,7 +381,11 @@ def _collect_all_parent_stream_names(self, stream_name: str) -> Set[str]: partition_router = None # Try DefaultStream path first (_stream_partition_generator._stream_slicer._partition_router) - if hasattr(stream, "_stream_partition_generator") and hasattr(stream._stream_partition_generator, "_stream_slicer") and hasattr(stream._stream_partition_generator._stream_slicer, "_partition_router"): + if ( + hasattr(stream, "_stream_partition_generator") + and hasattr(stream._stream_partition_generator, "_stream_slicer") + and hasattr(stream._stream_partition_generator._stream_slicer, "_partition_router") + ): 
partition_router = stream._stream_partition_generator._stream_slicer._partition_router # Fallback to legacy path (retriever.partition_router) for backward compatibility and test mocks elif hasattr(stream, "retriever") and hasattr(stream.retriever, "partition_router"): @@ -376,8 +423,15 @@ def _on_stream_is_done(self, stream_name: str) -> Iterable[AirbyteMessage]: # Remove only this stream from active set (NOT parents) if stream_name in self._active_stream_names: self._active_stream_names.discard(stream_name) - if self._stream_block_simultaneous_read.get(stream_name, False): + + # Remove from active groups + stream_group = self._stream_block_simultaneous_read.get(stream_name, "") + if stream_group: + if stream_group in self._active_groups: + self._active_groups[stream_group].discard(stream_name) + if not self._active_groups[stream_group]: + del self._active_groups[stream_group] self._logger.info( - f"Stream '{stream_name}' is no longer active. " + f"Stream '{stream_name}' (group '{stream_group}') is no longer active. " f"Blocked streams in the queue will be retried on next start_next_partition_generator call." 
) diff --git a/airbyte_cdk/sources/concurrent_source/concurrent_source.py b/airbyte_cdk/sources/concurrent_source/concurrent_source.py index 241282fa5..de2d93523 100644 --- a/airbyte_cdk/sources/concurrent_source/concurrent_source.py +++ b/airbyte_cdk/sources/concurrent_source/concurrent_source.py @@ -77,7 +77,7 @@ def __init__( slice_logger: SliceLogger = DebugSliceLogger(), queue: Optional[Queue[QueueItem]] = None, message_repository: MessageRepository = InMemoryMessageRepository(), - initial_number_partitions_to_generate: int = 10, + initial_number_partitions_to_generate: int = 1, timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS, ) -> None: """ @@ -92,7 +92,7 @@ def __init__( self._logger = logger self._slice_logger = slice_logger self._message_repository = message_repository - self._initial_number_partitions_to_generate = 10 + self._initial_number_partitions_to_generate = initial_number_partitions_to_generate self._timeout_seconds = timeout_seconds # We set a maxsize to for the main thread to process record items when the queue size grows. This assumes that there are less diff --git a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml index 27338db92..f1fd95b62 100644 --- a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml +++ b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml @@ -1563,14 +1563,23 @@ definitions: block_simultaneous_read: title: Block Simultaneous Read description: > - When true, prevents simultaneous reading of this stream from multiple contexts - (e.g., as both a parent stream and a standalone stream). If the stream OR any - of its parent streams are currently active, this stream will be deferred until - they finish. This is useful for APIs that don't allow concurrent access to the - same endpoint. Default is false for backward compatibility. - Only applies to ConcurrentDeclarativeSource. 
- type: boolean - default: false + Optional group name for blocking simultaneous reads. Streams with the same + block_simultaneous_read value will not be read concurrently. This prevents + duplicate API calls when a stream is used as both a standalone stream and a + parent stream, or when multiple streams share the same endpoint/session. + + If set to a non-empty string, the stream will be deferred if: + 1. Another stream in the same group is currently active + 2. Any parent stream is in an active group + + Examples: + - "issues_endpoint" - All streams with this value block each other + - "" or null - No blocking (default) + + This is useful for APIs that don't allow concurrent access to the same + endpoint or session. Only applies to ConcurrentDeclarativeSource. + type: string + default: "" retriever: title: Retriever description: Component used to coordinate how records are extracted across stream slices and request pages. diff --git a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py index 78a69c4f2..56a32fb5e 100644 --- a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py +++ b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py @@ -1,5 +1,3 @@ -# Copyright (c) 2025 Airbyte, Inc., all rights reserved. - # generated by datamodel-codegen: # filename: declarative_component_schema.yaml @@ -2509,9 +2507,9 @@ class Config: type: Literal["DeclarativeStream"] name: Optional[str] = Field("", description="The stream name.", example=["Users"], title="Name") - block_simultaneous_read: Optional[bool] = Field( - False, - description="When true, prevents simultaneous reading of this stream from multiple contexts (e.g., as both a parent stream and a standalone stream). If the stream OR any of its parent streams are currently active, this stream will be deferred until they finish. 
This is useful for APIs that don't allow concurrent access to the same endpoint. Default is false for backward compatibility. Only applies to ConcurrentDeclarativeSource.\n", + block_simultaneous_read: Optional[str] = Field( + "", + description='Optional group name for blocking simultaneous reads. Streams with the same block_simultaneous_read value will not be read concurrently. This prevents duplicate API calls when a stream is used as both a standalone stream and a parent stream, or when multiple streams share the same endpoint/session.\nIf set to a non-empty string, the stream will be deferred if: 1. Another stream in the same group is currently active 2. Any parent stream is in an active group\nExamples: - "issues_endpoint" - All streams with this value block each other - "" or null - No blocking (default)\nThis is useful for APIs that don\'t allow concurrent access to the same endpoint or session. Only applies to ConcurrentDeclarativeSource.\n', title="Block Simultaneous Read", ) retriever: Union[SimpleRetriever, AsyncRetriever, CustomRetriever] = Field( diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 3a0f0c296..2de844580 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -2139,7 +2139,7 @@ def create_default_stream( logger=logging.getLogger(f"airbyte.{stream_name}"), cursor=concurrent_cursor, supports_file_transfer=hasattr(model, "file_uploader") and bool(model.file_uploader), - block_simultaneous_read=model.block_simultaneous_read or False, + block_simultaneous_read=model.block_simultaneous_read or "", ) def _migrate_state(self, model: DeclarativeStreamModel, config: Config) -> None: diff --git a/airbyte_cdk/sources/streams/concurrent/abstract_stream.py b/airbyte_cdk/sources/streams/concurrent/abstract_stream.py index 0052eafa3..e7b24f614 
100644 --- a/airbyte_cdk/sources/streams/concurrent/abstract_stream.py +++ b/airbyte_cdk/sources/streams/concurrent/abstract_stream.py @@ -86,15 +86,19 @@ def cursor(self) -> Cursor: """ @property - def block_simultaneous_read(self) -> bool: + def block_simultaneous_read(self) -> str: """ - Override to return True if this stream should block simultaneous reads. - When True, prevents starting partition generation for this stream if it - OR any of its parent streams are already active. + Override to return a non-empty group name if this stream should block simultaneous reads. + When a non-empty string is returned, prevents starting partition generation for this stream if: + - Another stream with the same group name is already active + - Any of its parent streams are in an active group - :return: True if simultaneous reads should be blocked, False otherwise + This allows grouping multiple streams that share the same resource (e.g., API endpoint or session) + to prevent them from running concurrently, even if they don't have a parent-child relationship. 
+ + :return: Group name for blocking (non-empty string), or "" to allow concurrent reading """ - return False # Default: allow concurrent reading + return "" # Default: allow concurrent reading @abstractmethod def check_availability(self) -> StreamAvailability: diff --git a/airbyte_cdk/sources/streams/concurrent/default_stream.py b/airbyte_cdk/sources/streams/concurrent/default_stream.py index 84d955105..5ca11eaf8 100644 --- a/airbyte_cdk/sources/streams/concurrent/default_stream.py +++ b/airbyte_cdk/sources/streams/concurrent/default_stream.py @@ -26,7 +26,7 @@ def __init__( cursor: Cursor, namespace: Optional[str] = None, supports_file_transfer: bool = False, - block_simultaneous_read: bool = False, + block_simultaneous_read: str = "", ) -> None: self._stream_partition_generator = partition_generator self._name = name @@ -97,8 +97,8 @@ def cursor(self) -> Cursor: return self._cursor @property - def block_simultaneous_read(self) -> bool: - """Returns whether this stream should block simultaneous reads""" + def block_simultaneous_read(self) -> str: + """Returns the blocking group name for this stream, or empty string if no blocking""" return self._block_simultaneous_read def check_availability(self) -> StreamAvailability: diff --git a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py index fcb71c8c6..bc65f578e 100644 --- a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +++ b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py @@ -5288,7 +5288,7 @@ def test_block_simultaneous_read_from_manifest(): type: DeclarativeStream name: "parent" primary_key: "id" - block_simultaneous_read: true + block_simultaneous_read: "issues_endpoint" retriever: type: SimpleRetriever requester: @@ -5316,7 +5316,7 @@ def test_block_simultaneous_read_from_manifest(): type: DeclarativeStream name: "child" primary_key: "id" - 
block_simultaneous_read: true + block_simultaneous_read: "issues_endpoint" retriever: type: SimpleRetriever requester: @@ -5392,9 +5392,9 @@ def test_block_simultaneous_read_from_manifest(): assert isinstance(parent_stream, DefaultStream) assert parent_stream.name == "parent" - assert parent_stream.block_simultaneous_read is True + assert parent_stream.block_simultaneous_read == "issues_endpoint" - # Test child stream with block_simultaneous_read: true + # Test child stream with block_simultaneous_read: "issues_endpoint" child_manifest = transformer.propagate_types_and_parameters( "", resolved_manifest["child_stream"], {} ) @@ -5404,9 +5404,9 @@ def test_block_simultaneous_read_from_manifest(): assert isinstance(child_stream, DefaultStream) assert child_stream.name == "child" - assert child_stream.block_simultaneous_read is True + assert child_stream.block_simultaneous_read == "issues_endpoint" - # Test stream without block_simultaneous_read (should default to False) + # Test stream without block_simultaneous_read (should default to empty string) no_block_manifest = transformer.propagate_types_and_parameters( "", resolved_manifest["no_block_stream"], {} ) @@ -5416,7 +5416,7 @@ def test_block_simultaneous_read_from_manifest(): assert isinstance(no_block_stream, DefaultStream) assert no_block_stream.name == "no_block" - assert no_block_stream.block_simultaneous_read is False + assert no_block_stream.block_simultaneous_read == "" def get_schema_loader(stream: DefaultStream): diff --git a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py index f1fad0799..e76f0576c 100644 --- a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +++ b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py @@ -806,7 +806,7 @@ def setUp(self): self._message_repository.consume_queue.return_value = [] self._partition_reader = Mock(spec=PartitionReader) - def 
_create_mock_stream(self, name: str, block_simultaneous_read: bool = False): + def _create_mock_stream(self, name: str, block_simultaneous_read: str = ""): """Helper to create a mock stream""" stream = Mock(spec=AbstractStream) stream.name = name @@ -820,7 +820,7 @@ def _create_mock_stream(self, name: str, block_simultaneous_read: bool = False): return stream def _create_mock_stream_with_parent( - self, name: str, parent_stream, block_simultaneous_read: bool = False + self, name: str, parent_stream, block_simultaneous_read: str = "" ): """Helper to create a mock stream with a parent stream""" stream = self._create_mock_stream(name, block_simultaneous_read) @@ -839,7 +839,7 @@ def _create_mock_stream_with_parent( def test_defer_stream_when_self_active(self): """Test that a stream is deferred when it's already active""" - stream = self._create_mock_stream("stream1", block_simultaneous_read=True) + stream = self._create_mock_stream("stream1", block_simultaneous_read="api_group") handler = ConcurrentReadProcessor( [stream], @@ -866,15 +866,16 @@ def test_defer_stream_when_self_active(self): # Logger should have been called to log deferral assert any( - "Deferring stream 'stream1' because it's already active" in str(call) + "Deferring stream 'stream1' (group 'api_group') because it's already active" + in str(call) for call in self._logger.info.call_args_list ) def test_defer_stream_when_parent_active(self): """Test that a stream is deferred when its parent is active""" - parent_stream = self._create_mock_stream("parent", block_simultaneous_read=True) + parent_stream = self._create_mock_stream("parent", block_simultaneous_read="api_group") child_stream = self._create_mock_stream_with_parent( - "child", parent_stream, block_simultaneous_read=True + "child", parent_stream, block_simultaneous_read="api_group" ) handler = ConcurrentReadProcessor( @@ -911,11 +912,13 @@ def test_defer_stream_when_parent_active(self): def test_defer_stream_when_grandparent_active(self): """Test 
that a stream is deferred when its grandparent is active""" - grandparent = self._create_mock_stream("grandparent", block_simultaneous_read=True) + grandparent = self._create_mock_stream("grandparent", block_simultaneous_read="api_group") parent = self._create_mock_stream_with_parent( - "parent", grandparent, block_simultaneous_read=True + "parent", grandparent, block_simultaneous_read="api_group" + ) + child = self._create_mock_stream_with_parent( + "child", parent, block_simultaneous_read="api_group" ) - child = self._create_mock_stream_with_parent("child", parent, block_simultaneous_read=True) handler = ConcurrentReadProcessor( [grandparent, parent, child], @@ -943,9 +946,9 @@ def test_defer_stream_when_grandparent_active(self): assert len(handler._stream_instances_to_start_partition_generation) == 1 def test_retry_blocked_stream_after_blocker_done(self): - """Test that blocked stream is retried after blocker finishes""" - stream1 = self._create_mock_stream("stream1", block_simultaneous_read=True) - stream2 = self._create_mock_stream("stream2", block_simultaneous_read=True) + """Test that independent streams with different groups don't block each other""" + stream1 = self._create_mock_stream("stream1", block_simultaneous_read="group1") + stream2 = self._create_mock_stream("stream2", block_simultaneous_read="group2") handler = ConcurrentReadProcessor( [stream1, stream2], @@ -961,26 +964,21 @@ def test_retry_blocked_stream_after_blocker_done(self): handler.start_next_partition_generator() assert "stream1" in handler._active_stream_names - # Try to start stream2 (should be deferred since stream1 is active and they share block flag) - # But wait - they're not parent-child, so stream2 should start successfully - # Let me fix the test logic - - # Mark stream1 as active to simulate it's running - handler._active_stream_names.add("stream1") - handler._stream_instances_to_start_partition_generation = [stream1, stream2] - - # Try to start stream1 again (should be deferred 
because already active) + # Stream2 should start successfully even though stream1 is active + # because they're in different groups result = handler.start_next_partition_generator() - # Should start stream2 instead (stream1 was deferred) + # Should start stream2 (different group, no blocking) assert result is not None assert "stream2" in handler._active_stream_names - assert len(handler._stream_instances_to_start_partition_generation) == 1 + assert len(handler._stream_instances_to_start_partition_generation) == 0 def test_retry_blocked_stream_after_partition_generation(self): """Test that blocked stream is retried after partition generation completes""" - parent = self._create_mock_stream("parent", block_simultaneous_read=True) - child = self._create_mock_stream_with_parent("child", parent, block_simultaneous_read=True) + parent = self._create_mock_stream("parent", block_simultaneous_read="api_group") + child = self._create_mock_stream_with_parent( + "child", parent, block_simultaneous_read="api_group" + ) handler = ConcurrentReadProcessor( [parent, child], @@ -1017,9 +1015,9 @@ def test_retry_blocked_stream_after_partition_generation(self): def test_blocked_stream_added_to_end_of_queue(self): """Test that blocked streams are added to the end of the queue""" - stream1 = self._create_mock_stream("stream1", block_simultaneous_read=True) - stream2 = self._create_mock_stream("stream2", block_simultaneous_read=False) - stream3 = self._create_mock_stream("stream3", block_simultaneous_read=False) + stream1 = self._create_mock_stream("stream1", block_simultaneous_read="api_group") + stream2 = self._create_mock_stream("stream2", block_simultaneous_read="") + stream3 = self._create_mock_stream("stream3", block_simultaneous_read="") handler = ConcurrentReadProcessor( [stream1, stream2, stream3], @@ -1047,8 +1045,8 @@ def test_blocked_stream_added_to_end_of_queue(self): assert handler._stream_instances_to_start_partition_generation[1] == stream1 def 
test_no_defer_when_flag_false(self): - """Test that blocking doesn't occur when block_simultaneous_read=False""" - stream = self._create_mock_stream("stream1", block_simultaneous_read=False) + """Test that blocking doesn't occur when block_simultaneous_read=""" "" + stream = self._create_mock_stream("stream1", block_simultaneous_read="") handler = ConcurrentReadProcessor( [stream], @@ -1099,8 +1097,10 @@ def test_collect_parent_streams_multi_level(self): def test_deactivate_parents_when_partition_generation_completes(self): """Test that parent streams are deactivated when partition generation completes""" - parent = self._create_mock_stream("parent", block_simultaneous_read=True) - child = self._create_mock_stream_with_parent("child", parent, block_simultaneous_read=True) + parent = self._create_mock_stream("parent", block_simultaneous_read="api_group") + child = self._create_mock_stream_with_parent( + "child", parent, block_simultaneous_read="api_group" + ) handler = ConcurrentReadProcessor( [parent, child], @@ -1139,8 +1139,10 @@ def test_deactivate_parents_when_partition_generation_completes(self): def test_deactivate_only_stream_when_done(self): """Test that only the stream itself is deactivated when done, not parents""" - parent = self._create_mock_stream("parent", block_simultaneous_read=True) - child = self._create_mock_stream_with_parent("child", parent, block_simultaneous_read=True) + parent = self._create_mock_stream("parent", block_simultaneous_read="api_group") + child = self._create_mock_stream_with_parent( + "child", parent, block_simultaneous_read="api_group" + ) handler = ConcurrentReadProcessor( [parent, child], @@ -1172,12 +1174,12 @@ def test_deactivate_only_stream_when_done(self): def test_multiple_blocked_streams_retry_in_order(self): """Test that multiple blocked streams are retried in order""" - parent = self._create_mock_stream("parent", block_simultaneous_read=True) + parent = self._create_mock_stream("parent", 
block_simultaneous_read="api_group") child1 = self._create_mock_stream_with_parent( - "child1", parent, block_simultaneous_read=True + "child1", parent, block_simultaneous_read="api_group" ) child2 = self._create_mock_stream_with_parent( - "child2", parent, block_simultaneous_read=True + "child2", parent, block_simultaneous_read="api_group" ) handler = ConcurrentReadProcessor( @@ -1206,8 +1208,8 @@ def test_multiple_blocked_streams_retry_in_order(self): def test_child_without_flag_blocked_by_parent_with_flag(self): """Test that a child WITHOUT block_simultaneous_read is blocked by parent WITH the flag""" # Parent has the flag, child does NOT - parent = self._create_mock_stream("parent", block_simultaneous_read=True) - child = self._create_mock_stream_with_parent("child", parent, block_simultaneous_read=False) + parent = self._create_mock_stream("parent", block_simultaneous_read="api_group") + child = self._create_mock_stream_with_parent("child", parent, block_simultaneous_read="") handler = ConcurrentReadProcessor( [parent, child], @@ -1226,7 +1228,7 @@ def test_child_without_flag_blocked_by_parent_with_flag(self): # Try to start child (should be deferred even though child doesn't have the flag) result = handler.start_next_partition_generator() - # Child should be deferred because parent has block_simultaneous_read=True and is active + # Child should be deferred because parent has block_simultaneous_read="api_group" and is active assert result is None # No stream started assert "child" not in handler._active_stream_names # Child should be moved to end of queue (still 1 stream in queue) @@ -1236,8 +1238,10 @@ def test_child_without_flag_blocked_by_parent_with_flag(self): def test_child_with_flag_not_blocked_by_parent_without_flag(self): """Test that a child WITH block_simultaneous_read is NOT blocked by parent WITHOUT the flag""" # Parent does NOT have the flag, child does - parent = self._create_mock_stream("parent", block_simultaneous_read=False) - child = 
self._create_mock_stream_with_parent("child", parent, block_simultaneous_read=True) + parent = self._create_mock_stream("parent", block_simultaneous_read="") + child = self._create_mock_stream_with_parent( + "child", parent, block_simultaneous_read="api_group" + ) handler = ConcurrentReadProcessor( [parent, child], @@ -1261,3 +1265,42 @@ def test_child_with_flag_not_blocked_by_parent_without_flag(self): assert "child" in handler._active_stream_names # Queue should now be empty (both streams started) assert len(handler._stream_instances_to_start_partition_generation) == 0 + + def test_unrelated_streams_in_same_group_block_each_other(self): + """Test that multiple unrelated streams with the same group name block each other""" + # Create three unrelated streams (no parent-child relationship) in the same group + stream1 = self._create_mock_stream("stream1", block_simultaneous_read="shared_endpoint") + stream2 = self._create_mock_stream("stream2", block_simultaneous_read="shared_endpoint") + stream3 = self._create_mock_stream("stream3", block_simultaneous_read="shared_endpoint") + + handler = ConcurrentReadProcessor( + [stream1, stream2, stream3], + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + # Start stream1 + result = handler.start_next_partition_generator() + assert result is not None + assert "stream1" in handler._active_stream_names + assert "shared_endpoint" in handler._active_groups + assert "stream1" in handler._active_groups["shared_endpoint"] + + # Try to start stream2 (should be deferred because it's in the same group) + result = handler.start_next_partition_generator() + # stream2 should be deferred, stream3 should also be deferred + # All three are in same group, only stream1 is active + assert result is None # No stream started + + # Both stream2 and stream3 should be in the queue + assert 
len(handler._stream_instances_to_start_partition_generation) == 2 + + # Verify logger was called with deferral message + assert any( + "Deferring stream 'stream2'" in str(call) and "shared_endpoint" in str(call) + for call in self._logger.info.call_args_list + ) From c6e89c19bbc2ace0b71e2b66bcb1f1e8a26465b5 Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Tue, 6 Jan 2026 17:59:47 +0200 Subject: [PATCH 03/31] Fix StreamFacade --- airbyte_cdk/sources/streams/concurrent/adapters.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/airbyte_cdk/sources/streams/concurrent/adapters.py b/airbyte_cdk/sources/streams/concurrent/adapters.py index 41674bdae..a9bc47e0d 100644 --- a/airbyte_cdk/sources/streams/concurrent/adapters.py +++ b/airbyte_cdk/sources/streams/concurrent/adapters.py @@ -196,6 +196,11 @@ def cursor_field(self) -> Union[str, List[str]]: def cursor(self) -> Optional[Cursor]: # type: ignore[override] # StreamFaced expects to use only airbyte_cdk.sources.streams.concurrent.cursor.Cursor return self._cursor + @property + def block_simultaneous_read(self) -> str: + """Returns the blocking group name from the underlying stream""" + return self._abstract_stream.block_simultaneous_read + # FIXME the lru_cache seems to be mostly there because of typing issue @lru_cache(maxsize=None) def get_json_schema(self) -> Mapping[str, Any]: From 4868f26dbfe152c4247938569a0ad12d9e76de56 Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Fri, 9 Jan 2026 18:37:35 +0200 Subject: [PATCH 04/31] Fix NoneType error when all streams are blocked --- .../sources/concurrent_source/concurrent_read_processor.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py index 1901d0d0d..ec331e05d 100644 --- a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +++ 
b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py @@ -129,7 +129,9 @@ def on_partition_generation_completed( ): yield from self._on_stream_is_done(stream_name) if self._stream_instances_to_start_partition_generation: - yield self.start_next_partition_generator() # type:ignore # None may be yielded + status_message = self.start_next_partition_generator() + if status_message: + yield status_message def on_partition(self, partition: Partition) -> None: """ From 6b614b3f1ff7530bef01dee0c22ee3412f106e2c Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Mon, 12 Jan 2026 23:10:50 +0200 Subject: [PATCH 05/31] Fix unit tests --- .../test_concurrent_read_processor.py | 53 ++++++++++++++++--- 1 file changed, 47 insertions(+), 6 deletions(-) diff --git a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py index e76f0576c..7d9c52e26 100644 --- a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +++ b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py @@ -1045,7 +1045,7 @@ def test_blocked_stream_added_to_end_of_queue(self): assert handler._stream_instances_to_start_partition_generation[1] == stream1 def test_no_defer_when_flag_false(self): - """Test that blocking doesn't occur when block_simultaneous_read=""" "" + """Test that blocking doesn't occur when block_simultaneous_read="" """ stream = self._create_mock_stream("stream1", block_simultaneous_read="") handler = ConcurrentReadProcessor( @@ -1196,14 +1196,55 @@ def test_multiple_blocked_streams_retry_in_order(self): result = handler.start_next_partition_generator() assert result is not None assert "parent" in handler._active_stream_names + assert "api_group" in handler._active_groups + assert "parent" in handler._active_groups["api_group"] - # Try to start child1 (should be deferred) + # Try to start next stream (child1) - should be deferred because parent is active 
result = handler.start_next_partition_generator() - # child1 is deferred, but child2 might start if it's not blocked - # Let me check the queue state + assert result is None # child1 was deferred - # Both children should be deferred (parent is active) - assert len(handler._stream_instances_to_start_partition_generation) >= 1 + # After first deferral, we should still have 2 streams in queue (child1 moved to end) + assert len(handler._stream_instances_to_start_partition_generation) == 2 + # child1 was moved to the back, so the queue has the other child first + queue_streams = handler._stream_instances_to_start_partition_generation + assert child1 in queue_streams + assert child2 in queue_streams + + # Try to start next stream (child2) - should also be deferred + result = handler.start_next_partition_generator() + assert result is None # child2 was deferred + + # Both streams still in queue, but order may have changed + assert len(handler._stream_instances_to_start_partition_generation) == 2 + + # Verify neither child is active yet (both blocked by parent) + assert "child1" not in handler._active_stream_names + assert "child2" not in handler._active_stream_names + + # Verify deferral was logged for both children + logger_calls = [str(call) for call in self._logger.info.call_args_list] + assert any("Deferring stream 'child1'" in call for call in logger_calls) + assert any("Deferring stream 'child2'" in call for call in logger_calls) + + # Simulate parent completing partition generation (parent has no partitions, so it's done) + handler._streams_currently_generating_partitions.append("parent") + handler._streams_to_running_partitions["parent"] = set() + sentinel = PartitionGenerationCompletedSentinel(parent) + list(handler.on_partition_generation_completed(sentinel)) + + # After parent completes, one of the children should start (whichever was first in queue) + # We know at least one child started because the queue shrunk + assert 
len(handler._stream_instances_to_start_partition_generation) == 1 + + # Verify that exactly one child is now active + children_active = [ + name for name in ["child1", "child2"] + if name in handler._active_stream_names + ] + assert len(children_active) == 1, f"Expected exactly one child active, got: {children_active}" + + # Parent should be re-activated because the active child needs to read from it + assert "parent" in handler._active_stream_names def test_child_without_flag_blocked_by_parent_with_flag(self): """Test that a child WITHOUT block_simultaneous_read is blocked by parent WITH the flag""" From 46e7c19b0ad86ed8a63942e043f95ddd777d089c Mon Sep 17 00:00:00 2001 From: octavia-squidington-iii Date: Mon, 12 Jan 2026 21:13:12 +0000 Subject: [PATCH 06/31] Auto-fix lint and format issues --- .../streams/concurrent/test_concurrent_read_processor.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py index 7d9c52e26..d608a823f 100644 --- a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +++ b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py @@ -1238,10 +1238,11 @@ def test_multiple_blocked_streams_retry_in_order(self): # Verify that exactly one child is now active children_active = [ - name for name in ["child1", "child2"] - if name in handler._active_stream_names + name for name in ["child1", "child2"] if name in handler._active_stream_names ] - assert len(children_active) == 1, f"Expected exactly one child active, got: {children_active}" + assert len(children_active) == 1, ( + f"Expected exactly one child active, got: {children_active}" + ) # Parent should be re-activated because the active child needs to read from it assert "parent" in handler._active_stream_names From b82dfbc4d35edeb76873968dcd031cd388f47db0 Mon Sep 17 00:00:00 2001 From: Anatolii 
Yatsuk Date: Tue, 13 Jan 2026 16:53:33 +0200 Subject: [PATCH 07/31] Add retry deferred streams on stream completion --- .../concurrent_read_processor.py | 6 + .../test_concurrent_read_processor.py | 112 ++++++++++++++++++ 2 files changed, 118 insertions(+) diff --git a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py index ec331e05d..372430be1 100644 --- a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +++ b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py @@ -159,6 +159,7 @@ def on_partition_complete_sentinel( 1. Close the partition 2. If the stream is done, mark it as such and return a stream status message 3. Emit messages that were added to the message repository + 4. If there are more streams to read from, start the next partition generator """ partition = sentinel.partition @@ -171,6 +172,11 @@ def on_partition_complete_sentinel( and len(partitions_running) == 0 ): yield from self._on_stream_is_done(partition.stream_name()) + # Try to start the next stream in the queue (may be a deferred stream) + if self._stream_instances_to_start_partition_generation: + status_message = self.start_next_partition_generator() + if status_message: + yield status_message yield from self._message_repository.consume_queue() def on_record(self, record: Record) -> Iterable[AirbyteMessage]: diff --git a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py index d608a823f..9da12fc1c 100644 --- a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +++ b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py @@ -1346,3 +1346,115 @@ def test_unrelated_streams_in_same_group_block_each_other(self): "Deferring stream 'stream2'" in str(call) and "shared_endpoint" in str(call) for call in self._logger.info.call_args_list ) + + def 
test_child_starts_after_parent_completes_via_partition_complete_sentinel(self): + """Test that child stream starts after parent completes via on_partition_complete_sentinel""" + parent = self._create_mock_stream("parent", block_simultaneous_read="api_group") + child = self._create_mock_stream_with_parent( + "child", parent, block_simultaneous_read="api_group" + ) + + handler = ConcurrentReadProcessor( + [parent, child], + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + # Start parent + handler.start_next_partition_generator() + assert "parent" in handler._active_stream_names + + # Try to start child (should be deferred) + result = handler.start_next_partition_generator() + assert result is None + assert "child" not in handler._active_stream_names + assert len(handler._stream_instances_to_start_partition_generation) == 1 + + # Create a partition for parent and add it to running partitions + # (parent is already in _streams_currently_generating_partitions from start_next_partition_generator) + mock_partition = Mock(spec=Partition) + mock_partition.stream_name.return_value = "parent" + handler._streams_to_running_partitions["parent"].add(mock_partition) + + # Complete partition generation for parent + sentinel_gen = PartitionGenerationCompletedSentinel(parent) + list(handler.on_partition_generation_completed(sentinel_gen)) + + # Now complete the partition (this triggers stream done) + sentinel_complete = PartitionCompleteSentinel(mock_partition) + messages = list(handler.on_partition_complete_sentinel(sentinel_complete)) + + # Child should have been started automatically + assert "child" in handler._active_stream_names + assert len(handler._stream_instances_to_start_partition_generation) == 0 + + # Verify a STARTED message was emitted for child + started_messages = [ + msg + for msg in messages + if msg.type == MessageType.TRACE + and msg.trace.stream_status + 
and msg.trace.stream_status.status == AirbyteStreamStatus.STARTED + ] + assert len(started_messages) == 1 + assert started_messages[0].trace.stream_status.stream_descriptor.name == "child" + + def test_child_starts_after_parent_completes_via_partition_complete_sentinel(self): + """Test that child stream starts after parent completes via on_partition_complete_sentinel""" + parent = self._create_mock_stream("parent", block_simultaneous_read="api_group") + child = self._create_mock_stream_with_parent( + "child", parent, block_simultaneous_read="api_group" + ) + + handler = ConcurrentReadProcessor( + [parent, child], + self._partition_enqueuer, + self._thread_pool_manager, + self._logger, + self._slice_logger, + self._message_repository, + self._partition_reader, + ) + + # Start parent + handler.start_next_partition_generator() + assert "parent" in handler._active_stream_names + + # Try to start child (should be deferred) + result = handler.start_next_partition_generator() + assert result is None + assert "child" not in handler._active_stream_names + assert len(handler._stream_instances_to_start_partition_generation) == 1 + + # Create a partition for parent and add it to running partitions + # (parent is already in _streams_currently_generating_partitions from start_next_partition_generator) + mock_partition = Mock(spec=Partition) + mock_partition.stream_name.return_value = "parent" + handler._streams_to_running_partitions["parent"].add(mock_partition) + + # Complete partition generation for parent + sentinel_gen = PartitionGenerationCompletedSentinel(parent) + list(handler.on_partition_generation_completed(sentinel_gen)) + + # Now complete the partition (this triggers stream done) + sentinel_complete = PartitionCompleteSentinel(mock_partition) + messages = list(handler.on_partition_complete_sentinel(sentinel_complete)) + + # Child should have been started automatically + assert "child" in handler._active_stream_names + assert 
len(handler._stream_instances_to_start_partition_generation) == 0 + + # Verify a STARTED message was emitted for child + started_messages = [ + msg + for msg in messages + if msg.type == MessageType.TRACE + and msg.trace.stream_status + and msg.trace.stream_status.status == AirbyteStreamStatus.STARTED + ] + assert len(started_messages) == 1 + assert started_messages[0].trace.stream_status.stream_descriptor.name == "child" From 0c0f4ef9c81d7b8d8113e7b0717d124d307d411a Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Tue, 13 Jan 2026 17:44:32 +0200 Subject: [PATCH 08/31] Fix unit tests --- .../test_concurrent_read_processor.py | 58 +------------------ 1 file changed, 1 insertion(+), 57 deletions(-) diff --git a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py index 9da12fc1c..eecf74245 100644 --- a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +++ b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py @@ -945,7 +945,7 @@ def test_defer_stream_when_grandparent_active(self): # Child should be back in the queue assert len(handler._stream_instances_to_start_partition_generation) == 1 - def test_retry_blocked_stream_after_blocker_done(self): + def test_different_groups_do_not_block_each_other(self): """Test that independent streams with different groups don't block each other""" stream1 = self._create_mock_stream("stream1", block_simultaneous_read="group1") stream2 = self._create_mock_stream("stream2", block_simultaneous_read="group2") @@ -1402,59 +1402,3 @@ def test_child_starts_after_parent_completes_via_partition_complete_sentinel(sel ] assert len(started_messages) == 1 assert started_messages[0].trace.stream_status.stream_descriptor.name == "child" - - def test_child_starts_after_parent_completes_via_partition_complete_sentinel(self): - """Test that child stream starts after parent completes via 
on_partition_complete_sentinel""" - parent = self._create_mock_stream("parent", block_simultaneous_read="api_group") - child = self._create_mock_stream_with_parent( - "child", parent, block_simultaneous_read="api_group" - ) - - handler = ConcurrentReadProcessor( - [parent, child], - self._partition_enqueuer, - self._thread_pool_manager, - self._logger, - self._slice_logger, - self._message_repository, - self._partition_reader, - ) - - # Start parent - handler.start_next_partition_generator() - assert "parent" in handler._active_stream_names - - # Try to start child (should be deferred) - result = handler.start_next_partition_generator() - assert result is None - assert "child" not in handler._active_stream_names - assert len(handler._stream_instances_to_start_partition_generation) == 1 - - # Create a partition for parent and add it to running partitions - # (parent is already in _streams_currently_generating_partitions from start_next_partition_generator) - mock_partition = Mock(spec=Partition) - mock_partition.stream_name.return_value = "parent" - handler._streams_to_running_partitions["parent"].add(mock_partition) - - # Complete partition generation for parent - sentinel_gen = PartitionGenerationCompletedSentinel(parent) - list(handler.on_partition_generation_completed(sentinel_gen)) - - # Now complete the partition (this triggers stream done) - sentinel_complete = PartitionCompleteSentinel(mock_partition) - messages = list(handler.on_partition_complete_sentinel(sentinel_complete)) - - # Child should have been started automatically - assert "child" in handler._active_stream_names - assert len(handler._stream_instances_to_start_partition_generation) == 0 - - # Verify a STARTED message was emitted for child - started_messages = [ - msg - for msg in messages - if msg.type == MessageType.TRACE - and msg.trace.stream_status - and msg.trace.stream_status.status == AirbyteStreamStatus.STARTED - ] - assert len(started_messages) == 1 - assert 
started_messages[0].trace.stream_status.stream_descriptor.name == "child" From cc1d47c7b11fd881a81d8aab4fb78d76fbfd7aed Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Tue, 13 Jan 2026 18:26:18 +0200 Subject: [PATCH 09/31] More fixes for unit tests --- .../streams/concurrent/test_concurrent_read_processor.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py index eecf74245..788478ae6 100644 --- a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +++ b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py @@ -994,10 +994,6 @@ def test_retry_blocked_stream_after_partition_generation(self): handler.start_next_partition_generator() assert "parent" in handler._active_stream_names - # Mark parent as generating partitions and having no partitions - handler._streams_currently_generating_partitions.append("parent") - handler._streams_to_running_partitions["parent"] = set() - # Complete partition generation for parent (parent has no partitions, so it's done) sentinel = PartitionGenerationCompletedSentinel(parent) messages = list(handler.on_partition_generation_completed(sentinel)) @@ -1227,8 +1223,6 @@ def test_multiple_blocked_streams_retry_in_order(self): assert any("Deferring stream 'child2'" in call for call in logger_calls) # Simulate parent completing partition generation (parent has no partitions, so it's done) - handler._streams_currently_generating_partitions.append("parent") - handler._streams_to_running_partitions["parent"] = set() sentinel = PartitionGenerationCompletedSentinel(parent) list(handler.on_partition_generation_completed(sentinel)) From 6f5b4ea72874c51a62894fd9225c4dd17f6af06b Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 25 Feb 2026 14:48:05 +0000 Subject: [PATCH 10/31] refactor: replace 
per-stream block_simultaneous_read with top-level stream_groups - Remove block_simultaneous_read property from DeclarativeStream schema - Add top-level stream_groups with StreamGroup and BlockSimultaneousSyncsAction - ConcurrentDeclarativeSource parses stream_groups and injects block_simultaneous_read into stream configs before factory processing - Internal blocking logic in ConcurrentReadProcessor unchanged - Update tests for new interface Co-Authored-By: unknown <> --- .../concurrent_declarative_source.py | 42 ++++++++++- .../declarative_component_schema.yaml | 66 +++++++++++------ .../models/declarative_component_schema.py | 32 +++++++-- .../parsers/model_to_component_factory.py | 2 +- .../test_model_to_component_factory.py | 14 ++-- .../test_concurrent_declarative_source.py | 71 +++++++++++++++++++ 6 files changed, 196 insertions(+), 31 deletions(-) diff --git a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py index 45fe6aa2d..cdd86fda9 100644 --- a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py +++ b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py @@ -405,6 +405,14 @@ def streams(self, config: Mapping[str, Any]) -> List[AbstractStream]: # type: i if api_budget_model: self._constructor.set_api_budget(api_budget_model, self._config) + stream_name_to_group = self._build_stream_name_to_group(self._source_config) + + prepared_configs = self._initialize_cache_for_parent_streams(deepcopy(stream_configs)) + for stream_config in prepared_configs: + stream_name = stream_config.get("name", "") + if stream_name in stream_name_to_group: + stream_config["block_simultaneous_read"] = stream_name_to_group[stream_name] + source_streams = [ self._constructor.create_component( ( @@ -416,7 +424,7 @@ def streams(self, config: Mapping[str, Any]) -> List[AbstractStream]: # type: i self._config, emit_connector_builder_messages=self._emit_connector_builder_messages, ) - 
for stream_config in self._initialize_cache_for_parent_streams(deepcopy(stream_configs)) + for stream_config in prepared_configs ] return source_streams @@ -526,6 +534,38 @@ def dynamic_streams(self) -> List[Dict[str, Any]]: with_dynamic_stream_name=True, ) + @staticmethod + def _build_stream_name_to_group(manifest: Mapping[str, Any]) -> Dict[str, str]: + """Build a mapping from stream name to group name based on the stream_groups manifest config. + + After manifest reference resolution, each stream reference in stream_groups.streams + is resolved to the full stream definition dict containing a 'name' field. + + Returns: + A dict mapping stream name -> group name for streams that belong to a group. + """ + stream_name_to_group: Dict[str, str] = {} + stream_groups = manifest.get("stream_groups", {}) + if not stream_groups: + return stream_name_to_group + + for group_name, group_config in stream_groups.items(): + streams = group_config.get("streams", []) + for stream_ref in streams: + if isinstance(stream_ref, dict): + # After reference resolution, stream_ref is a full stream definition dict + stream_name = stream_ref.get("name", "") + if stream_name: + stream_name_to_group[stream_name] = group_name + elif isinstance(stream_ref, str): + # If not resolved (shouldn't happen normally), extract name from ref path + # e.g., "#/definitions/my_stream" -> "my_stream" + if stream_ref.startswith("#/definitions/"): + stream_name = stream_ref.split("/")[-1] + stream_name_to_group[stream_name] = group_name + + return stream_name_to_group + def _stream_configs(self, manifest: Mapping[str, Any]) -> List[Dict[str, Any]]: # This has a warning flag for static, but after we finish part 4 we'll replace manifest with self._source_config stream_configs = [] diff --git a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml index f1fd95b62..3f10a1bf5 100644 --- 
a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml +++ b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml @@ -45,6 +45,15 @@ properties: "$ref": "#/definitions/ConcurrencyLevel" api_budget: "$ref": "#/definitions/HTTPAPIBudget" + stream_groups: + title: Stream Groups + description: > + Groups of streams that share a common resource and should not be read simultaneously. + Each group defines a set of stream references and an action that controls how concurrent + reads are managed. Only applies to ConcurrentDeclarativeSource. + type: object + additionalProperties: + "$ref": "#/definitions/StreamGroup" max_concurrent_async_job_count: title: Maximum Concurrent Asynchronous Jobs description: Maximum number of concurrent asynchronous jobs to run. This property is only relevant for sources/streams that support asynchronous job execution through the AsyncRetriever (e.g. a report-based stream that initiates a job, polls the job status, and then fetches the job results). This is often set by the API's maximum number of concurrent jobs on the account level. Refer to the API's documentation for this information. @@ -63,6 +72,43 @@ properties: description: A description of the connector. It will be presented on the Source documentation page. additionalProperties: false definitions: + StreamGroup: + title: Stream Group + description: > + A group of streams that share a common resource and should not be read simultaneously. + Streams in the same group will be blocked from concurrent reads based on the specified action. + type: object + required: + - streams + - action + properties: + streams: + title: Streams + description: > + List of references to streams that belong to this group. Use JSON references + to stream definitions (e.g., "#/definitions/my_stream"). + type: array + items: + type: string + action: + title: Action + description: The action to apply to streams in this group. 
+ "$ref": "#/definitions/BlockSimultaneousSyncsAction" + BlockSimultaneousSyncsAction: + title: Block Simultaneous Syncs Action + description: > + Action that prevents streams in the same group from being read concurrently. + When applied to a stream group, streams with this action will be deferred if + another stream in the same group is currently active. + This is useful for APIs that don't allow concurrent access to the same + endpoint or session. Only applies to ConcurrentDeclarativeSource. + type: object + required: + - type + properties: + type: + type: string + enum: [BlockSimultaneousSyncsAction] AddedFieldDefinition: title: Definition Of Field To Add description: Defines the field to add on a record. @@ -1560,26 +1606,6 @@ definitions: default: "" example: - "Users" - block_simultaneous_read: - title: Block Simultaneous Read - description: > - Optional group name for blocking simultaneous reads. Streams with the same - block_simultaneous_read value will not be read concurrently. This prevents - duplicate API calls when a stream is used as both a standalone stream and a - parent stream, or when multiple streams share the same endpoint/session. - - If set to a non-empty string, the stream will be deferred if: - 1. Another stream in the same group is currently active - 2. Any parent stream is in an active group - - Examples: - - "issues_endpoint" - All streams with this value block each other - - "" or null - No blocking (default) - - This is useful for APIs that don't allow concurrent access to the same - endpoint or session. Only applies to ConcurrentDeclarativeSource. - type: string - default: "" retriever: title: Retriever description: Component used to coordinate how records are extracted across stream slices and request pages. 
diff --git a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py index 56a32fb5e..29a2c7632 100644 --- a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py +++ b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py @@ -2317,6 +2317,23 @@ class Config: ) +class BlockSimultaneousSyncsAction(BaseModel): + type: Literal["BlockSimultaneousSyncsAction"] + + +class StreamGroup(BaseModel): + streams: List[str] = Field( + ..., + description='List of references to streams that belong to this group. Use JSON references to stream definitions (e.g., "#/definitions/my_stream").', + title="Streams", + ) + action: BlockSimultaneousSyncsAction = Field( + ..., + description="The action to apply to streams in this group.", + title="Action", + ) + + class Spec(BaseModel): type: Literal["Spec"] connection_specification: Dict[str, Any] = Field( @@ -2357,6 +2374,11 @@ class Config: spec: Optional[Spec] = None concurrency_level: Optional[ConcurrencyLevel] = None api_budget: Optional[HTTPAPIBudget] = None + stream_groups: Optional[Dict[str, StreamGroup]] = Field( + None, + description="Groups of streams that share a common resource and should not be read simultaneously. Each group defines a set of stream references and an action that controls how concurrent reads are managed. Only applies to ConcurrentDeclarativeSource.", + title="Stream Groups", + ) max_concurrent_async_job_count: Optional[Union[int, str]] = Field( None, description="Maximum number of concurrent asynchronous jobs to run. This property is only relevant for sources/streams that support asynchronous job execution through the AsyncRetriever (e.g. a report-based stream that initiates a job, polls the job status, and then fetches the job results). This is often set by the API's maximum number of concurrent jobs on the account level. 
Refer to the API's documentation for this information.", @@ -2392,6 +2414,11 @@ class Config: spec: Optional[Spec] = None concurrency_level: Optional[ConcurrencyLevel] = None api_budget: Optional[HTTPAPIBudget] = None + stream_groups: Optional[Dict[str, StreamGroup]] = Field( + None, + description="Groups of streams that share a common resource and should not be read simultaneously. Each group defines a set of stream references and an action that controls how concurrent reads are managed. Only applies to ConcurrentDeclarativeSource.", + title="Stream Groups", + ) max_concurrent_async_job_count: Optional[Union[int, str]] = Field( None, description="Maximum number of concurrent asynchronous jobs to run. This property is only relevant for sources/streams that support asynchronous job execution through the AsyncRetriever (e.g. a report-based stream that initiates a job, polls the job status, and then fetches the job results). This is often set by the API's maximum number of concurrent jobs on the account level. Refer to the API's documentation for this information.", @@ -2507,11 +2534,6 @@ class Config: type: Literal["DeclarativeStream"] name: Optional[str] = Field("", description="The stream name.", example=["Users"], title="Name") - block_simultaneous_read: Optional[str] = Field( - "", - description='Optional group name for blocking simultaneous reads. Streams with the same block_simultaneous_read value will not be read concurrently. This prevents duplicate API calls when a stream is used as both a standalone stream and a parent stream, or when multiple streams share the same endpoint/session.\nIf set to a non-empty string, the stream will be deferred if: 1. Another stream in the same group is currently active 2. Any parent stream is in an active group\nExamples: - "issues_endpoint" - All streams with this value block each other - "" or null - No blocking (default)\nThis is useful for APIs that don\'t allow concurrent access to the same endpoint or session. 
Only applies to ConcurrentDeclarativeSource.\n', - title="Block Simultaneous Read", - ) retriever: Union[SimpleRetriever, AsyncRetriever, CustomRetriever] = Field( ..., description="Component used to coordinate how records are extracted across stream slices and request pages.", diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 2de844580..bd9fbd55f 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -2139,7 +2139,7 @@ def create_default_stream( logger=logging.getLogger(f"airbyte.{stream_name}"), cursor=concurrent_cursor, supports_file_transfer=hasattr(model, "file_uploader") and bool(model.file_uploader), - block_simultaneous_read=model.block_simultaneous_read or "", + block_simultaneous_read=getattr(model, "block_simultaneous_read", "") or "", ) def _migrate_state(self, model: DeclarativeStreamModel, config: Config) -> None: diff --git a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py index bc65f578e..91b17fc33 100644 --- a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +++ b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py @@ -5281,8 +5281,14 @@ def test_catalog_defined_cursor_field_stream_missing(): assert stream._cursor_field.supports_catalog_defined_cursor_field == True -def test_block_simultaneous_read_from_manifest(): - """Test that block_simultaneous_read flows through from manifest to DefaultStream""" +def test_block_simultaneous_read_from_stream_groups(): + """Test that block_simultaneous_read flows through from stream_groups to DefaultStream. 
+ + The stream_groups config is processed by ConcurrentDeclarativeSource which injects + block_simultaneous_read into individual stream configs before passing them to the factory. + This test verifies that the factory correctly reads block_simultaneous_read from the + extra fields on the stream config dict. + """ content = """ parent_stream: type: DeclarativeStream @@ -5382,7 +5388,7 @@ def test_block_simultaneous_read_from_manifest(): parsed_manifest = YamlDeclarativeSource._parse(content) resolved_manifest = resolver.preprocess_manifest(parsed_manifest) - # Test parent stream with block_simultaneous_read: true + # Test parent stream with block_simultaneous_read injected (as ConcurrentDeclarativeSource would do) parent_manifest = transformer.propagate_types_and_parameters( "", resolved_manifest["parent_stream"], {} ) @@ -5394,7 +5400,7 @@ def test_block_simultaneous_read_from_manifest(): assert parent_stream.name == "parent" assert parent_stream.block_simultaneous_read == "issues_endpoint" - # Test child stream with block_simultaneous_read: "issues_endpoint" + # Test child stream with block_simultaneous_read injected child_manifest = transformer.propagate_types_and_parameters( "", resolved_manifest["child_stream"], {} ) diff --git a/unit_tests/sources/declarative/test_concurrent_declarative_source.py b/unit_tests/sources/declarative/test_concurrent_declarative_source.py index bde6c35b1..3f10b9eb2 100644 --- a/unit_tests/sources/declarative/test_concurrent_declarative_source.py +++ b/unit_tests/sources/declarative/test_concurrent_declarative_source.py @@ -5150,3 +5150,74 @@ def test_given_record_selector_is_filtering_when_read_then_raise_error(): with pytest.raises(ValueError): list(source.read(logger=source.logger, config=input_config, catalog=catalog, state=[])) + + +@pytest.mark.parametrize( + "manifest,expected", + [ + pytest.param( + {}, + {}, + id="no_stream_groups", + ), + pytest.param( + {"stream_groups": {}}, + {}, + id="empty_stream_groups", + ), + 
pytest.param( + { + "stream_groups": { + "crm_objects": { + "streams": [ + {"name": "deals", "type": "DeclarativeStream"}, + {"name": "companies", "type": "DeclarativeStream"}, + ], + "action": {"type": "BlockSimultaneousSyncsAction"}, + } + } + }, + {"deals": "crm_objects", "companies": "crm_objects"}, + id="resolved_stream_refs", + ), + pytest.param( + { + "stream_groups": { + "group_a": { + "streams": [ + {"name": "stream1", "type": "DeclarativeStream"}, + ], + "action": {"type": "BlockSimultaneousSyncsAction"}, + }, + "group_b": { + "streams": [ + {"name": "stream2", "type": "DeclarativeStream"}, + {"name": "stream3", "type": "DeclarativeStream"}, + ], + "action": {"type": "BlockSimultaneousSyncsAction"}, + }, + } + }, + {"stream1": "group_a", "stream2": "group_b", "stream3": "group_b"}, + id="multiple_groups", + ), + pytest.param( + { + "stream_groups": { + "fallback_group": { + "streams": [ + "#/definitions/my_stream", + ], + "action": {"type": "BlockSimultaneousSyncsAction"}, + } + } + }, + {"my_stream": "fallback_group"}, + id="unresolved_string_refs_fallback", + ), + ], +) +def test_build_stream_name_to_group(manifest, expected): + """Test _build_stream_name_to_group correctly maps stream names to group names.""" + result = ConcurrentDeclarativeSource._build_stream_name_to_group(manifest) + assert result == expected From d5ff69e1e5fc015e689793922f06bee638a78809 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 25 Feb 2026 15:23:18 +0000 Subject: [PATCH 11/31] refactor: move stream_name_to_group into ModelToComponentFactory - Add stream_name_to_group parameter to ModelToComponentFactory.__init__() - Add set_stream_name_to_group() method for post-init configuration - Factory now looks up block_simultaneous_read from its own mapping - Remove config injection hack from ConcurrentDeclarativeSource.streams() - Update tests to use factory-based approach instead of extra fields Co-Authored-By: unknown 
<> --- .../concurrent_declarative_source.py | 8 +- .../parsers/model_to_component_factory.py | 8 +- .../test_model_to_component_factory.py | 82 ++++++++++++++++--- 3 files changed, 80 insertions(+), 18 deletions(-) diff --git a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py index cdd86fda9..c0ab9e910 100644 --- a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py +++ b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py @@ -405,13 +405,11 @@ def streams(self, config: Mapping[str, Any]) -> List[AbstractStream]: # type: i if api_budget_model: self._constructor.set_api_budget(api_budget_model, self._config) - stream_name_to_group = self._build_stream_name_to_group(self._source_config) + self._constructor.set_stream_name_to_group( + self._build_stream_name_to_group(self._source_config) + ) prepared_configs = self._initialize_cache_for_parent_streams(deepcopy(stream_configs)) - for stream_config in prepared_configs: - stream_name = stream_config.get("name", "") - if stream_name in stream_name_to_group: - stream_config["block_simultaneous_read"] = stream_name_to_group[stream_name] source_streams = [ self._constructor.create_component( diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index bd9fbd55f..3733e3857 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -683,6 +683,7 @@ def __init__( max_concurrent_async_job_count: Optional[int] = None, configured_catalog: Optional[ConfiguredAirbyteCatalog] = None, api_budget: Optional[APIBudget] = None, + stream_name_to_group: Optional[Dict[str, str]] = None, ): self._init_mappings() self._limit_pages_fetched_per_slice = limit_pages_fetched_per_slice @@ -699,9 +700,14 @@ def __init__( 
self._connector_state_manager = connector_state_manager or ConnectorStateManager() self._api_budget: Optional[Union[APIBudget]] = api_budget self._job_tracker: JobTracker = JobTracker(max_concurrent_async_job_count or 1) + self._stream_name_to_group: Dict[str, str] = stream_name_to_group or {} # placeholder for deprecation warnings self._collected_deprecation_logs: List[ConnectorBuilderLogMessage] = [] + def set_stream_name_to_group(self, stream_name_to_group: Dict[str, str]) -> None: + """Set the mapping from stream name to group name for block_simultaneous_read.""" + self._stream_name_to_group = stream_name_to_group + def _init_mappings(self) -> None: self.PYDANTIC_MODEL_TO_CONSTRUCTOR: Mapping[Type[BaseModel], Callable[..., Any]] = { AddedFieldDefinitionModel: self.create_added_field_definition, @@ -2139,7 +2145,7 @@ def create_default_stream( logger=logging.getLogger(f"airbyte.{stream_name}"), cursor=concurrent_cursor, supports_file_transfer=hasattr(model, "file_uploader") and bool(model.file_uploader), - block_simultaneous_read=getattr(model, "block_simultaneous_read", "") or "", + block_simultaneous_read=self._stream_name_to_group.get(stream_name, ""), ) def _migrate_state(self, model: DeclarativeStreamModel, config: Config) -> None: diff --git a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py index 91b17fc33..00b9b9270 100644 --- a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +++ b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py @@ -5282,19 +5282,17 @@ def test_catalog_defined_cursor_field_stream_missing(): def test_block_simultaneous_read_from_stream_groups(): - """Test that block_simultaneous_read flows through from stream_groups to DefaultStream. + """Test that block_simultaneous_read flows through from stream_name_to_group to DefaultStream. 
- The stream_groups config is processed by ConcurrentDeclarativeSource which injects - block_simultaneous_read into individual stream configs before passing them to the factory. - This test verifies that the factory correctly reads block_simultaneous_read from the - extra fields on the stream config dict. + The stream_groups config is parsed by ConcurrentDeclarativeSource into a stream_name_to_group + mapping, which is then set on the ModelToComponentFactory. The factory uses this mapping to + look up the group for each stream it creates. """ content = """ parent_stream: type: DeclarativeStream name: "parent" primary_key: "id" - block_simultaneous_read: "issues_endpoint" retriever: type: SimpleRetriever requester: @@ -5322,7 +5320,6 @@ def test_block_simultaneous_read_from_stream_groups(): type: DeclarativeStream name: "child" primary_key: "id" - block_simultaneous_read: "issues_endpoint" retriever: type: SimpleRetriever requester: @@ -5385,14 +5382,19 @@ def test_block_simultaneous_read_from_stream_groups(): config = {"api_key": "test_key"} + # Create a factory with stream_name_to_group mapping (as ConcurrentDeclarativeSource would do) + factory_with_groups = ModelToComponentFactory( + stream_name_to_group={"parent": "issues_endpoint", "child": "issues_endpoint"} + ) + parsed_manifest = YamlDeclarativeSource._parse(content) resolved_manifest = resolver.preprocess_manifest(parsed_manifest) - # Test parent stream with block_simultaneous_read injected (as ConcurrentDeclarativeSource would do) + # Test parent stream gets block_simultaneous_read from the factory's stream_name_to_group parent_manifest = transformer.propagate_types_and_parameters( "", resolved_manifest["parent_stream"], {} ) - parent_stream: DefaultStream = factory.create_component( + parent_stream: DefaultStream = factory_with_groups.create_component( model_type=DeclarativeStreamModel, component_definition=parent_manifest, config=config ) @@ -5400,11 +5402,11 @@ def 
test_block_simultaneous_read_from_stream_groups(): assert parent_stream.name == "parent" assert parent_stream.block_simultaneous_read == "issues_endpoint" - # Test child stream with block_simultaneous_read injected + # Test child stream gets block_simultaneous_read from the factory's stream_name_to_group child_manifest = transformer.propagate_types_and_parameters( "", resolved_manifest["child_stream"], {} ) - child_stream: DefaultStream = factory.create_component( + child_stream: DefaultStream = factory_with_groups.create_component( model_type=DeclarativeStreamModel, component_definition=child_manifest, config=config ) @@ -5416,7 +5418,7 @@ def test_block_simultaneous_read_from_stream_groups(): no_block_manifest = transformer.propagate_types_and_parameters( "", resolved_manifest["no_block_stream"], {} ) - no_block_stream: DefaultStream = factory.create_component( + no_block_stream: DefaultStream = factory_with_groups.create_component( model_type=DeclarativeStreamModel, component_definition=no_block_manifest, config=config ) @@ -5425,6 +5427,62 @@ def test_block_simultaneous_read_from_stream_groups(): assert no_block_stream.block_simultaneous_read == "" +def test_set_stream_name_to_group(): + """Test that set_stream_name_to_group updates the factory's stream_name_to_group mapping.""" + content = """ + test_stream: + type: DeclarativeStream + name: "test" + primary_key: "id" + retriever: + type: SimpleRetriever + requester: + type: HttpRequester + url_base: "https://api.example.com" + path: "/test" + http_method: "GET" + authenticator: + type: BearerAuthenticator + api_token: "{{ config['api_key'] }}" + record_selector: + type: RecordSelector + extractor: + type: DpathExtractor + field_path: [] + schema_loader: + type: InlineSchemaLoader + schema: + type: object + properties: + id: + type: string + """ + + config = {"api_key": "test_key"} + + # Create factory without stream_name_to_group + test_factory = ModelToComponentFactory() + + parsed_manifest = 
YamlDeclarativeSource._parse(content) + resolved_manifest = resolver.preprocess_manifest(parsed_manifest) + stream_manifest = transformer.propagate_types_and_parameters( + "", resolved_manifest["test_stream"], {} + ) + + # Without stream_name_to_group, block_simultaneous_read should be empty + stream: DefaultStream = test_factory.create_component( + model_type=DeclarativeStreamModel, component_definition=stream_manifest, config=config + ) + assert stream.block_simultaneous_read == "" + + # After setting stream_name_to_group, block_simultaneous_read should be populated + test_factory.set_stream_name_to_group({"test": "my_group"}) + stream = test_factory.create_component( + model_type=DeclarativeStreamModel, component_definition=stream_manifest, config=config + ) + assert stream.block_simultaneous_read == "my_group" + + def get_schema_loader(stream: DefaultStream): assert isinstance( stream._stream_partition_generator._partition_factory._schema_loader, From 5ed978f08130d16e24abbba93261ded21032c3b6 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 25 Feb 2026 16:26:09 +0000 Subject: [PATCH 12/31] refactor: use stream_groups manifest in factory test instead of hardcoded dict - Test now defines stream_groups with references in the manifest YAML - Uses _build_stream_name_to_group() to derive the mapping from manifest - Removed test_set_stream_name_to_group (redundant with the manifest-based test) - Added ConcurrentDeclarativeSource import for _build_stream_name_to_group Co-Authored-By: unknown <> --- .../test_model_to_component_factory.py | 273 +++++++----------- 1 file changed, 111 insertions(+), 162 deletions(-) diff --git a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py index 00b9b9270..ddedf8221 100644 --- a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +++ 
b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py @@ -48,6 +48,9 @@ ) from airbyte_cdk.sources.declarative.checks import CheckStream from airbyte_cdk.sources.declarative.concurrency_level import ConcurrencyLevel +from airbyte_cdk.sources.declarative.concurrent_declarative_source import ( + ConcurrentDeclarativeSource, +) from airbyte_cdk.sources.declarative.datetime.min_max_datetime import MinMaxDatetime from airbyte_cdk.sources.declarative.decoders import JsonDecoder, PaginationDecoderDecorator from airbyte_cdk.sources.declarative.extractors import DpathExtractor, RecordFilter, RecordSelector @@ -5282,117 +5285,119 @@ def test_catalog_defined_cursor_field_stream_missing(): def test_block_simultaneous_read_from_stream_groups(): - """Test that block_simultaneous_read flows through from stream_name_to_group to DefaultStream. - - The stream_groups config is parsed by ConcurrentDeclarativeSource into a stream_name_to_group - mapping, which is then set on the ModelToComponentFactory. The factory uses this mapping to - look up the group for each stream it creates. 
- """ + """Test that stream_groups in the manifest flow through to DefaultStream.block_simultaneous_read.""" content = """ - parent_stream: - type: DeclarativeStream - name: "parent" - primary_key: "id" - retriever: - type: SimpleRetriever - requester: - type: HttpRequester - url_base: "https://api.example.com" - path: "/parent" - http_method: "GET" - authenticator: - type: BearerAuthenticator - api_token: "{{ config['api_key'] }}" - record_selector: - type: RecordSelector - extractor: - type: DpathExtractor - field_path: [] - schema_loader: - type: InlineSchemaLoader - schema: - type: object - properties: - id: - type: string - - child_stream: - type: DeclarativeStream - name: "child" - primary_key: "id" - retriever: - type: SimpleRetriever - requester: - type: HttpRequester - url_base: "https://api.example.com" - path: "/child" - http_method: "GET" - authenticator: - type: BearerAuthenticator - api_token: "{{ config['api_key'] }}" - record_selector: - type: RecordSelector - extractor: - type: DpathExtractor - field_path: [] - partition_router: - type: SubstreamPartitionRouter - parent_stream_configs: - - type: ParentStreamConfig - stream: "#/parent_stream" - parent_key: "id" - partition_field: "parent_id" - schema_loader: - type: InlineSchemaLoader - schema: - type: object - properties: - id: - type: string - parent_id: - type: string - - no_block_stream: - type: DeclarativeStream - name: "no_block" - primary_key: "id" - retriever: - type: SimpleRetriever - requester: - type: HttpRequester - url_base: "https://api.example.com" - path: "/no_block" - http_method: "GET" - authenticator: - type: BearerAuthenticator - api_token: "{{ config['api_key'] }}" - record_selector: - type: RecordSelector - extractor: - type: DpathExtractor - field_path: [] - schema_loader: - type: InlineSchemaLoader - schema: - type: object - properties: - id: - type: string + definitions: + parent_stream: + type: DeclarativeStream + name: "parent" + primary_key: "id" + retriever: + type: 
SimpleRetriever + requester: + type: HttpRequester + url_base: "https://api.example.com" + path: "/parent" + http_method: "GET" + authenticator: + type: BearerAuthenticator + api_token: "{{ config['api_key'] }}" + record_selector: + type: RecordSelector + extractor: + type: DpathExtractor + field_path: [] + schema_loader: + type: InlineSchemaLoader + schema: + type: object + properties: + id: + type: string + + child_stream: + type: DeclarativeStream + name: "child" + primary_key: "id" + retriever: + type: SimpleRetriever + requester: + type: HttpRequester + url_base: "https://api.example.com" + path: "/child" + http_method: "GET" + authenticator: + type: BearerAuthenticator + api_token: "{{ config['api_key'] }}" + record_selector: + type: RecordSelector + extractor: + type: DpathExtractor + field_path: [] + partition_router: + type: SubstreamPartitionRouter + parent_stream_configs: + - type: ParentStreamConfig + stream: "#/definitions/parent_stream" + parent_key: "id" + partition_field: "parent_id" + schema_loader: + type: InlineSchemaLoader + schema: + type: object + properties: + id: + type: string + parent_id: + type: string + + no_block_stream: + type: DeclarativeStream + name: "no_block" + primary_key: "id" + retriever: + type: SimpleRetriever + requester: + type: HttpRequester + url_base: "https://api.example.com" + path: "/no_block" + http_method: "GET" + authenticator: + type: BearerAuthenticator + api_token: "{{ config['api_key'] }}" + record_selector: + type: RecordSelector + extractor: + type: DpathExtractor + field_path: [] + schema_loader: + type: InlineSchemaLoader + schema: + type: object + properties: + id: + type: string + + stream_groups: + issues_endpoint: + streams: + - "#/definitions/parent_stream" + - "#/definitions/child_stream" + action: BlockSimultaneousSyncsAction """ config = {"api_key": "test_key"} - # Create a factory with stream_name_to_group mapping (as ConcurrentDeclarativeSource would do) - factory_with_groups = 
ModelToComponentFactory( - stream_name_to_group={"parent": "issues_endpoint", "child": "issues_endpoint"} - ) - parsed_manifest = YamlDeclarativeSource._parse(content) resolved_manifest = resolver.preprocess_manifest(parsed_manifest) - # Test parent stream gets block_simultaneous_read from the factory's stream_name_to_group + # Build stream_name_to_group from the manifest's stream_groups (as ConcurrentDeclarativeSource does) + stream_name_to_group = ConcurrentDeclarativeSource._build_stream_name_to_group(resolved_manifest) + factory_with_groups = ModelToComponentFactory(stream_name_to_group=stream_name_to_group) + + # Test parent stream gets block_simultaneous_read from stream_groups parent_manifest = transformer.propagate_types_and_parameters( - "", resolved_manifest["parent_stream"], {} + "", resolved_manifest["definitions"]["parent_stream"], {} ) parent_stream: DefaultStream = factory_with_groups.create_component( model_type=DeclarativeStreamModel, component_definition=parent_manifest, config=config @@ -5402,9 +5407,9 @@ def test_block_simultaneous_read_from_stream_groups(): assert parent_stream.name == "parent" assert parent_stream.block_simultaneous_read == "issues_endpoint" - # Test child stream gets block_simultaneous_read from the factory's stream_name_to_group + # Test child stream gets block_simultaneous_read from stream_groups child_manifest = transformer.propagate_types_and_parameters( - "", resolved_manifest["child_stream"], {} + "", resolved_manifest["definitions"]["child_stream"], {} ) child_stream: DefaultStream = factory_with_groups.create_component( model_type=DeclarativeStreamModel, component_definition=child_manifest, config=config @@ -5414,9 +5419,9 @@ def test_block_simultaneous_read_from_stream_groups(): assert child_stream.name == "child" assert child_stream.block_simultaneous_read == "issues_endpoint" - # Test stream without block_simultaneous_read (should default to empty string) + # Test stream not in any group defaults to empty string 
no_block_manifest = transformer.propagate_types_and_parameters( - "", resolved_manifest["no_block_stream"], {} + "", resolved_manifest["definitions"]["no_block_stream"], {} ) no_block_stream: DefaultStream = factory_with_groups.create_component( model_type=DeclarativeStreamModel, component_definition=no_block_manifest, config=config @@ -5427,62 +5432,6 @@ def test_block_simultaneous_read_from_stream_groups(): assert no_block_stream.block_simultaneous_read == "" -def test_set_stream_name_to_group(): - """Test that set_stream_name_to_group updates the factory's stream_name_to_group mapping.""" - content = """ - test_stream: - type: DeclarativeStream - name: "test" - primary_key: "id" - retriever: - type: SimpleRetriever - requester: - type: HttpRequester - url_base: "https://api.example.com" - path: "/test" - http_method: "GET" - authenticator: - type: BearerAuthenticator - api_token: "{{ config['api_key'] }}" - record_selector: - type: RecordSelector - extractor: - type: DpathExtractor - field_path: [] - schema_loader: - type: InlineSchemaLoader - schema: - type: object - properties: - id: - type: string - """ - - config = {"api_key": "test_key"} - - # Create factory without stream_name_to_group - test_factory = ModelToComponentFactory() - - parsed_manifest = YamlDeclarativeSource._parse(content) - resolved_manifest = resolver.preprocess_manifest(parsed_manifest) - stream_manifest = transformer.propagate_types_and_parameters( - "", resolved_manifest["test_stream"], {} - ) - - # Without stream_name_to_group, block_simultaneous_read should be empty - stream: DefaultStream = test_factory.create_component( - model_type=DeclarativeStreamModel, component_definition=stream_manifest, config=config - ) - assert stream.block_simultaneous_read == "" - - # After setting stream_name_to_group, block_simultaneous_read should be populated - test_factory.set_stream_name_to_group({"test": "my_group"}) - stream = test_factory.create_component( - model_type=DeclarativeStreamModel, 
component_definition=stream_manifest, config=config - ) - assert stream.block_simultaneous_read == "my_group" - - def get_schema_loader(stream: DefaultStream): assert isinstance( stream._stream_partition_generator._partition_factory._schema_loader, From b41a4b3df6ec77a4c1a1388dd97dea404afab280 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Thu, 26 Feb 2026 12:09:17 +0000 Subject: [PATCH 13/31] fix: only include parent stream in stream_groups to avoid deadlock Child streams that depend on parent streams should not be in the same group, as this would cause a deadlock (child needs to read parent). Co-Authored-By: unknown <> --- .../declarative/parsers/test_model_to_component_factory.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py index ddedf8221..964e3cd4a 100644 --- a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +++ b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py @@ -5382,7 +5382,6 @@ def test_block_simultaneous_read_from_stream_groups(): issues_endpoint: streams: - "#/definitions/parent_stream" - - "#/definitions/child_stream" action: BlockSimultaneousSyncsAction """ @@ -5407,7 +5406,7 @@ def test_block_simultaneous_read_from_stream_groups(): assert parent_stream.name == "parent" assert parent_stream.block_simultaneous_read == "issues_endpoint" - # Test child stream gets block_simultaneous_read from stream_groups + # Test child stream is NOT in the group (to avoid deadlock with parent) child_manifest = transformer.propagate_types_and_parameters( "", resolved_manifest["definitions"]["child_stream"], {} ) @@ -5417,7 +5416,7 @@ def test_block_simultaneous_read_from_stream_groups(): assert isinstance(child_stream, DefaultStream) assert child_stream.name == "child" - assert 
child_stream.block_simultaneous_read == "issues_endpoint" + assert child_stream.block_simultaneous_read == "" # Test stream not in any group defaults to empty string no_block_manifest = transformer.propagate_types_and_parameters( From a231e8c2373c1bacfc148e0dd372b2b398c34661 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Thu, 26 Feb 2026 12:20:54 +0000 Subject: [PATCH 14/31] style: fix ruff format for long line Co-Authored-By: unknown <> --- .../declarative/parsers/test_model_to_component_factory.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py index 964e3cd4a..bf877ff1a 100644 --- a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +++ b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py @@ -5391,7 +5391,9 @@ def test_block_simultaneous_read_from_stream_groups(): resolved_manifest = resolver.preprocess_manifest(parsed_manifest) # Build stream_name_to_group from the manifest's stream_groups (as ConcurrentDeclarativeSource does) - stream_name_to_group = ConcurrentDeclarativeSource._build_stream_name_to_group(resolved_manifest) + stream_name_to_group = ConcurrentDeclarativeSource._build_stream_name_to_group( + resolved_manifest + ) factory_with_groups = ModelToComponentFactory(stream_name_to_group=stream_name_to_group) # Test parent stream gets block_simultaneous_read from stream_groups From d3c806735b9a7eb9e00cba3f67c118893339e521 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Fri, 27 Feb 2026 15:07:31 +0000 Subject: [PATCH 15/31] refactor: move _build_stream_name_to_group into ModelToComponentFactory - Factory now owns the stream_groups resolution via set_stream_groups(manifest) - ConcurrentDeclarativeSource just calls 
factory.set_stream_groups(manifest) - Removed _build_stream_name_to_group from ConcurrentDeclarativeSource - Updated tests to use factory's _build_stream_name_to_group directly Co-Authored-By: unknown <> --- .../concurrent_declarative_source.py | 36 +-------------- .../parsers/model_to_component_factory.py | 45 ++++++++++++++++--- .../test_model_to_component_factory.py | 11 ++--- .../test_concurrent_declarative_source.py | 5 ++- 4 files changed, 48 insertions(+), 49 deletions(-) diff --git a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py index c0ab9e910..2eeee093e 100644 --- a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py +++ b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py @@ -405,9 +405,7 @@ def streams(self, config: Mapping[str, Any]) -> List[AbstractStream]: # type: i if api_budget_model: self._constructor.set_api_budget(api_budget_model, self._config) - self._constructor.set_stream_name_to_group( - self._build_stream_name_to_group(self._source_config) - ) + self._constructor.set_stream_groups(self._source_config) prepared_configs = self._initialize_cache_for_parent_streams(deepcopy(stream_configs)) @@ -532,38 +530,6 @@ def dynamic_streams(self) -> List[Dict[str, Any]]: with_dynamic_stream_name=True, ) - @staticmethod - def _build_stream_name_to_group(manifest: Mapping[str, Any]) -> Dict[str, str]: - """Build a mapping from stream name to group name based on the stream_groups manifest config. - - After manifest reference resolution, each stream reference in stream_groups.streams - is resolved to the full stream definition dict containing a 'name' field. - - Returns: - A dict mapping stream name -> group name for streams that belong to a group. 
- """ - stream_name_to_group: Dict[str, str] = {} - stream_groups = manifest.get("stream_groups", {}) - if not stream_groups: - return stream_name_to_group - - for group_name, group_config in stream_groups.items(): - streams = group_config.get("streams", []) - for stream_ref in streams: - if isinstance(stream_ref, dict): - # After reference resolution, stream_ref is a full stream definition dict - stream_name = stream_ref.get("name", "") - if stream_name: - stream_name_to_group[stream_name] = group_name - elif isinstance(stream_ref, str): - # If not resolved (shouldn't happen normally), extract name from ref path - # e.g., "#/definitions/my_stream" -> "my_stream" - if stream_ref.startswith("#/definitions/"): - stream_name = stream_ref.split("/")[-1] - stream_name_to_group[stream_name] = group_name - - return stream_name_to_group - def _stream_configs(self, manifest: Mapping[str, Any]) -> List[Dict[str, Any]]: # This has a warning flag for static, but after we finish part 4 we'll replace manifest with self._source_config stream_configs = [] diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 3733e3857..48203d545 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -683,7 +683,6 @@ def __init__( max_concurrent_async_job_count: Optional[int] = None, configured_catalog: Optional[ConfiguredAirbyteCatalog] = None, api_budget: Optional[APIBudget] = None, - stream_name_to_group: Optional[Dict[str, str]] = None, ): self._init_mappings() self._limit_pages_fetched_per_slice = limit_pages_fetched_per_slice @@ -700,13 +699,49 @@ def __init__( self._connector_state_manager = connector_state_manager or ConnectorStateManager() self._api_budget: Optional[Union[APIBudget]] = api_budget self._job_tracker: JobTracker = JobTracker(max_concurrent_async_job_count or 1) - 
self._stream_name_to_group: Dict[str, str] = stream_name_to_group or {} + self._stream_name_to_group: Dict[str, str] = {} # placeholder for deprecation warnings self._collected_deprecation_logs: List[ConnectorBuilderLogMessage] = [] - def set_stream_name_to_group(self, stream_name_to_group: Dict[str, str]) -> None: - """Set the mapping from stream name to group name for block_simultaneous_read.""" - self._stream_name_to_group = stream_name_to_group + def set_stream_groups(self, manifest: Mapping[str, Any]) -> None: + """Build and set the stream-name-to-group mapping from the manifest's stream_groups config. + + After manifest reference resolution, each stream reference in stream_groups.streams + is resolved to the full stream definition dict containing a 'name' field. + """ + self._stream_name_to_group = self._build_stream_name_to_group(manifest) + + @staticmethod + def _build_stream_name_to_group(manifest: Mapping[str, Any]) -> Dict[str, str]: + """Build a mapping from stream name to group name based on the stream_groups manifest config. + + After manifest reference resolution, each stream reference in stream_groups.streams + is resolved to the full stream definition dict containing a 'name' field. + + Returns: + A dict mapping stream name -> group name for streams that belong to a group. 
+ """ + stream_name_to_group: Dict[str, str] = {} + stream_groups = manifest.get("stream_groups", {}) + if not stream_groups: + return stream_name_to_group + + for group_name, group_config in stream_groups.items(): + streams = group_config.get("streams", []) + for stream_ref in streams: + if isinstance(stream_ref, dict): + # After reference resolution, stream_ref is a full stream definition dict + stream_name = stream_ref.get("name", "") + if stream_name: + stream_name_to_group[stream_name] = group_name + elif isinstance(stream_ref, str): + # If not resolved (shouldn't happen normally), extract name from ref path + # e.g., "#/definitions/my_stream" -> "my_stream" + if stream_ref.startswith("#/definitions/"): + stream_name = stream_ref.split("/")[-1] + stream_name_to_group[stream_name] = group_name + + return stream_name_to_group def _init_mappings(self) -> None: self.PYDANTIC_MODEL_TO_CONSTRUCTOR: Mapping[Type[BaseModel], Callable[..., Any]] = { diff --git a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py index bf877ff1a..d27d23613 100644 --- a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +++ b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py @@ -48,9 +48,6 @@ ) from airbyte_cdk.sources.declarative.checks import CheckStream from airbyte_cdk.sources.declarative.concurrency_level import ConcurrencyLevel -from airbyte_cdk.sources.declarative.concurrent_declarative_source import ( - ConcurrentDeclarativeSource, -) from airbyte_cdk.sources.declarative.datetime.min_max_datetime import MinMaxDatetime from airbyte_cdk.sources.declarative.decoders import JsonDecoder, PaginationDecoderDecorator from airbyte_cdk.sources.declarative.extractors import DpathExtractor, RecordFilter, RecordSelector @@ -5390,11 +5387,9 @@ def test_block_simultaneous_read_from_stream_groups(): parsed_manifest = 
YamlDeclarativeSource._parse(content) resolved_manifest = resolver.preprocess_manifest(parsed_manifest) - # Build stream_name_to_group from the manifest's stream_groups (as ConcurrentDeclarativeSource does) - stream_name_to_group = ConcurrentDeclarativeSource._build_stream_name_to_group( - resolved_manifest - ) - factory_with_groups = ModelToComponentFactory(stream_name_to_group=stream_name_to_group) + # Use the factory's set_stream_groups to resolve stream_groups from the manifest + factory_with_groups = ModelToComponentFactory() + factory_with_groups.set_stream_groups(resolved_manifest) # Test parent stream gets block_simultaneous_read from stream_groups parent_manifest = transformer.propagate_types_and_parameters( diff --git a/unit_tests/sources/declarative/test_concurrent_declarative_source.py b/unit_tests/sources/declarative/test_concurrent_declarative_source.py index 3f10b9eb2..7676f8e2d 100644 --- a/unit_tests/sources/declarative/test_concurrent_declarative_source.py +++ b/unit_tests/sources/declarative/test_concurrent_declarative_source.py @@ -56,6 +56,9 @@ from airbyte_cdk.sources.declarative.extractors.record_filter import ( ClientSideIncrementalRecordFilterDecorator, ) +from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import ( + ModelToComponentFactory, +) from airbyte_cdk.sources.declarative.partition_routers import AsyncJobPartitionRouter from airbyte_cdk.sources.declarative.retrievers.simple_retriever import SimpleRetriever from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import ( @@ -5219,5 +5222,5 @@ def test_given_record_selector_is_filtering_when_read_then_raise_error(): ) def test_build_stream_name_to_group(manifest, expected): """Test _build_stream_name_to_group correctly maps stream names to group names.""" - result = ConcurrentDeclarativeSource._build_stream_name_to_group(manifest) + result = ModelToComponentFactory._build_stream_name_to_group(manifest) assert result == expected From 
3a3cf8b739421d640485b8e8b81eda55ffa97ba8 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Tue, 3 Mar 2026 16:56:59 +0000 Subject: [PATCH 16/31] refactor: resolve stream_groups from actual stream instances instead of factory - Removed _build_stream_name_to_group, set_stream_groups, _stream_name_to_group from factory - Factory no longer knows about stream_groups at all - Added _apply_stream_groups to ConcurrentDeclarativeSource: creates streams first, then sets block_simultaneous_read on matching DefaultStream instances - Added block_simultaneous_read setter on DefaultStream - Replaced mock-based tests with parametrized tests using real DefaultStream instances Co-Authored-By: unknown <> --- .../concurrent_declarative_source.py | 30 ++++- .../parsers/model_to_component_factory.py | 42 ------- .../streams/concurrent/default_stream.py | 4 + .../test_model_to_component_factory.py | 109 ++---------------- .../test_concurrent_declarative_source.py | 69 ++++++----- 5 files changed, 81 insertions(+), 173 deletions(-) diff --git a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py index 2eeee093e..fae94c7b7 100644 --- a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py +++ b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py @@ -82,6 +82,7 @@ from airbyte_cdk.sources.message.concurrent_repository import ConcurrentMessageRepository from airbyte_cdk.sources.message.repository import InMemoryMessageRepository from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream +from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream from airbyte_cdk.sources.streams.concurrent.partitions.types import QueueItem from airbyte_cdk.sources.utils.slice_logger import ( AlwaysLogSliceLogger, @@ -405,8 +406,6 @@ def streams(self, config: Mapping[str, Any]) -> List[AbstractStream]: # 
type: i if api_budget_model: self._constructor.set_api_budget(api_budget_model, self._config) - self._constructor.set_stream_groups(self._source_config) - prepared_configs = self._initialize_cache_for_parent_streams(deepcopy(stream_configs)) source_streams = [ @@ -422,8 +421,35 @@ def streams(self, config: Mapping[str, Any]) -> List[AbstractStream]: # type: i ) for stream_config in prepared_configs ] + + self._apply_stream_groups(source_streams) + return source_streams + def _apply_stream_groups(self, streams: List[AbstractStream]) -> None: + """Set block_simultaneous_read on streams based on the manifest's stream_groups config. + + Iterates over the resolved manifest's stream_groups and matches group membership + against actual created stream instances by name. + """ + stream_groups = self._source_config.get("stream_groups", {}) + if not stream_groups: + return + + # Build stream_name -> group_name mapping from the resolved manifest + stream_name_to_group: Dict[str, str] = {} + for group_name, group_config in stream_groups.items(): + for stream_ref in group_config.get("streams", []): + if isinstance(stream_ref, dict): + stream_name = stream_ref.get("name", "") + if stream_name: + stream_name_to_group[stream_name] = group_name + + # Apply group to matching stream instances + for stream in streams: + if isinstance(stream, DefaultStream) and stream.name in stream_name_to_group: + stream.block_simultaneous_read = stream_name_to_group[stream.name] + @staticmethod def _initialize_cache_for_parent_streams( stream_configs: List[Dict[str, Any]], diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 48203d545..2bd7d268d 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -699,50 +699,9 @@ def __init__( self._connector_state_manager = connector_state_manager or 
ConnectorStateManager() self._api_budget: Optional[Union[APIBudget]] = api_budget self._job_tracker: JobTracker = JobTracker(max_concurrent_async_job_count or 1) - self._stream_name_to_group: Dict[str, str] = {} # placeholder for deprecation warnings self._collected_deprecation_logs: List[ConnectorBuilderLogMessage] = [] - def set_stream_groups(self, manifest: Mapping[str, Any]) -> None: - """Build and set the stream-name-to-group mapping from the manifest's stream_groups config. - - After manifest reference resolution, each stream reference in stream_groups.streams - is resolved to the full stream definition dict containing a 'name' field. - """ - self._stream_name_to_group = self._build_stream_name_to_group(manifest) - - @staticmethod - def _build_stream_name_to_group(manifest: Mapping[str, Any]) -> Dict[str, str]: - """Build a mapping from stream name to group name based on the stream_groups manifest config. - - After manifest reference resolution, each stream reference in stream_groups.streams - is resolved to the full stream definition dict containing a 'name' field. - - Returns: - A dict mapping stream name -> group name for streams that belong to a group. 
- """ - stream_name_to_group: Dict[str, str] = {} - stream_groups = manifest.get("stream_groups", {}) - if not stream_groups: - return stream_name_to_group - - for group_name, group_config in stream_groups.items(): - streams = group_config.get("streams", []) - for stream_ref in streams: - if isinstance(stream_ref, dict): - # After reference resolution, stream_ref is a full stream definition dict - stream_name = stream_ref.get("name", "") - if stream_name: - stream_name_to_group[stream_name] = group_name - elif isinstance(stream_ref, str): - # If not resolved (shouldn't happen normally), extract name from ref path - # e.g., "#/definitions/my_stream" -> "my_stream" - if stream_ref.startswith("#/definitions/"): - stream_name = stream_ref.split("/")[-1] - stream_name_to_group[stream_name] = group_name - - return stream_name_to_group - def _init_mappings(self) -> None: self.PYDANTIC_MODEL_TO_CONSTRUCTOR: Mapping[Type[BaseModel], Callable[..., Any]] = { AddedFieldDefinitionModel: self.create_added_field_definition, @@ -2180,7 +2139,6 @@ def create_default_stream( logger=logging.getLogger(f"airbyte.{stream_name}"), cursor=concurrent_cursor, supports_file_transfer=hasattr(model, "file_uploader") and bool(model.file_uploader), - block_simultaneous_read=self._stream_name_to_group.get(stream_name, ""), ) def _migrate_state(self, model: DeclarativeStreamModel, config: Config) -> None: diff --git a/airbyte_cdk/sources/streams/concurrent/default_stream.py b/airbyte_cdk/sources/streams/concurrent/default_stream.py index 5ca11eaf8..d679fe0ee 100644 --- a/airbyte_cdk/sources/streams/concurrent/default_stream.py +++ b/airbyte_cdk/sources/streams/concurrent/default_stream.py @@ -101,6 +101,10 @@ def block_simultaneous_read(self) -> str: """Returns the blocking group name for this stream, or empty string if no blocking""" return self._block_simultaneous_read + @block_simultaneous_read.setter + def block_simultaneous_read(self, value: str) -> None: + self._block_simultaneous_read = 
value + def check_availability(self) -> StreamAvailability: """ Check stream availability by attempting to read the first record of the stream. diff --git a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py index d27d23613..add9a1c42 100644 --- a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +++ b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py @@ -5282,7 +5282,12 @@ def test_catalog_defined_cursor_field_stream_missing(): def test_block_simultaneous_read_from_stream_groups(): - """Test that stream_groups in the manifest flow through to DefaultStream.block_simultaneous_read.""" + """Test that factory-created streams default to empty block_simultaneous_read. + + The factory no longer handles stream_groups — that's done by + ConcurrentDeclarativeSource._apply_stream_groups after stream creation. + This test verifies the factory creates streams without group info. 
+ """ content = """ definitions: parent_stream: @@ -5311,75 +5316,6 @@ def test_block_simultaneous_read_from_stream_groups(): properties: id: type: string - - child_stream: - type: DeclarativeStream - name: "child" - primary_key: "id" - retriever: - type: SimpleRetriever - requester: - type: HttpRequester - url_base: "https://api.example.com" - path: "/child" - http_method: "GET" - authenticator: - type: BearerAuthenticator - api_token: "{{ config['api_key'] }}" - record_selector: - type: RecordSelector - extractor: - type: DpathExtractor - field_path: [] - partition_router: - type: SubstreamPartitionRouter - parent_stream_configs: - - type: ParentStreamConfig - stream: "#/definitions/parent_stream" - parent_key: "id" - partition_field: "parent_id" - schema_loader: - type: InlineSchemaLoader - schema: - type: object - properties: - id: - type: string - parent_id: - type: string - - no_block_stream: - type: DeclarativeStream - name: "no_block" - primary_key: "id" - retriever: - type: SimpleRetriever - requester: - type: HttpRequester - url_base: "https://api.example.com" - path: "/no_block" - http_method: "GET" - authenticator: - type: BearerAuthenticator - api_token: "{{ config['api_key'] }}" - record_selector: - type: RecordSelector - extractor: - type: DpathExtractor - field_path: [] - schema_loader: - type: InlineSchemaLoader - schema: - type: object - properties: - id: - type: string - - stream_groups: - issues_endpoint: - streams: - - "#/definitions/parent_stream" - action: BlockSimultaneousSyncsAction """ config = {"api_key": "test_key"} @@ -5387,45 +5323,18 @@ def test_block_simultaneous_read_from_stream_groups(): parsed_manifest = YamlDeclarativeSource._parse(content) resolved_manifest = resolver.preprocess_manifest(parsed_manifest) - # Use the factory's set_stream_groups to resolve stream_groups from the manifest - factory_with_groups = ModelToComponentFactory() - factory_with_groups.set_stream_groups(resolved_manifest) + factory = 
ModelToComponentFactory() - # Test parent stream gets block_simultaneous_read from stream_groups parent_manifest = transformer.propagate_types_and_parameters( "", resolved_manifest["definitions"]["parent_stream"], {} ) - parent_stream: DefaultStream = factory_with_groups.create_component( + parent_stream: DefaultStream = factory.create_component( model_type=DeclarativeStreamModel, component_definition=parent_manifest, config=config ) assert isinstance(parent_stream, DefaultStream) assert parent_stream.name == "parent" - assert parent_stream.block_simultaneous_read == "issues_endpoint" - - # Test child stream is NOT in the group (to avoid deadlock with parent) - child_manifest = transformer.propagate_types_and_parameters( - "", resolved_manifest["definitions"]["child_stream"], {} - ) - child_stream: DefaultStream = factory_with_groups.create_component( - model_type=DeclarativeStreamModel, component_definition=child_manifest, config=config - ) - - assert isinstance(child_stream, DefaultStream) - assert child_stream.name == "child" - assert child_stream.block_simultaneous_read == "" - - # Test stream not in any group defaults to empty string - no_block_manifest = transformer.propagate_types_and_parameters( - "", resolved_manifest["definitions"]["no_block_stream"], {} - ) - no_block_stream: DefaultStream = factory_with_groups.create_component( - model_type=DeclarativeStreamModel, component_definition=no_block_manifest, config=config - ) - - assert isinstance(no_block_stream, DefaultStream) - assert no_block_stream.name == "no_block" - assert no_block_stream.block_simultaneous_read == "" + assert parent_stream.block_simultaneous_read == "" def get_schema_loader(stream: DefaultStream): diff --git a/unit_tests/sources/declarative/test_concurrent_declarative_source.py b/unit_tests/sources/declarative/test_concurrent_declarative_source.py index 7676f8e2d..0e07b20f0 100644 --- a/unit_tests/sources/declarative/test_concurrent_declarative_source.py +++ 
b/unit_tests/sources/declarative/test_concurrent_declarative_source.py @@ -56,14 +56,12 @@ from airbyte_cdk.sources.declarative.extractors.record_filter import ( ClientSideIncrementalRecordFilterDecorator, ) -from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import ( - ModelToComponentFactory, -) from airbyte_cdk.sources.declarative.partition_routers import AsyncJobPartitionRouter from airbyte_cdk.sources.declarative.retrievers.simple_retriever import SimpleRetriever from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import ( StreamSlicerPartitionGenerator, ) +from airbyte_cdk.sources.message.repository import InMemoryMessageRepository from airbyte_cdk.sources.streams import Stream from airbyte_cdk.sources.streams.checkpoint import Cursor from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor @@ -5155,17 +5153,37 @@ def test_given_record_selector_is_filtering_when_read_then_raise_error(): list(source.read(logger=source.logger, config=input_config, catalog=catalog, state=[])) +def _make_default_stream(name: str) -> DefaultStream: + """Create a minimal DefaultStream instance for testing.""" + from airbyte_cdk.sources.streams.concurrent.cursor import FinalStateCursor + + cursor = FinalStateCursor( + stream_name=name, stream_namespace=None, message_repository=InMemoryMessageRepository() + ) + return DefaultStream( + partition_generator=Mock(), + name=name, + json_schema={}, + primary_key=[], + cursor_field=None, + logger=logging.getLogger(f"test.{name}"), + cursor=cursor, + ) + + @pytest.mark.parametrize( - "manifest,expected", + "source_config,stream_names,expected_groups", [ pytest.param( {}, - {}, + ["my_stream"], + {"my_stream": ""}, id="no_stream_groups", ), pytest.param( {"stream_groups": {}}, - {}, + ["my_stream"], + {"my_stream": ""}, id="empty_stream_groups", ), pytest.param( @@ -5180,16 +5198,15 @@ def test_given_record_selector_is_filtering_when_read_then_raise_error(): } } }, - 
{"deals": "crm_objects", "companies": "crm_objects"}, - id="resolved_stream_refs", + ["deals", "companies", "no_group"], + {"deals": "crm_objects", "companies": "crm_objects", "no_group": ""}, + id="single_group_with_unmatched_stream", ), pytest.param( { "stream_groups": { "group_a": { - "streams": [ - {"name": "stream1", "type": "DeclarativeStream"}, - ], + "streams": [{"name": "stream1", "type": "DeclarativeStream"}], "action": {"type": "BlockSimultaneousSyncsAction"}, }, "group_b": { @@ -5201,26 +5218,20 @@ def test_given_record_selector_is_filtering_when_read_then_raise_error(): }, } }, + ["stream1", "stream2", "stream3"], {"stream1": "group_a", "stream2": "group_b", "stream3": "group_b"}, id="multiple_groups", ), - pytest.param( - { - "stream_groups": { - "fallback_group": { - "streams": [ - "#/definitions/my_stream", - ], - "action": {"type": "BlockSimultaneousSyncsAction"}, - } - } - }, - {"my_stream": "fallback_group"}, - id="unresolved_string_refs_fallback", - ), ], ) -def test_build_stream_name_to_group(manifest, expected): - """Test _build_stream_name_to_group correctly maps stream names to group names.""" - result = ModelToComponentFactory._build_stream_name_to_group(manifest) - assert result == expected +def test_apply_stream_groups(source_config, stream_names, expected_groups): + """Test _apply_stream_groups sets block_simultaneous_read on matching stream instances.""" + streams = [_make_default_stream(name) for name in stream_names] + + source = Mock() + source._source_config = source_config + + ConcurrentDeclarativeSource._apply_stream_groups(source, streams) + + for stream in streams: + assert stream.block_simultaneous_read == expected_groups[stream.name] From 7ccf6cff0f206af9e9fb8eaeafa23aed1219aa10 Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Wed, 4 Mar 2026 18:40:21 +0200 Subject: [PATCH 17/31] Fix stream format in schema --- .../sources/concurrent_source/concurrent_read_processor.py | 4 ++-- 
.../sources/declarative/declarative_component_schema.yaml | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py index 372430be1..5be8afaa3 100644 --- a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +++ b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py @@ -258,7 +258,8 @@ def start_next_partition_generator(self) -> Optional[AirbyteMessage]: stream_name = stream.name stream_group = self._stream_block_simultaneous_read.get(stream_name, "") - # Check if this stream has a blocking group and is already active + # Check if this stream has a blocking group and is already active as parent stream + # (i.e. being read from during partition generation for another stream) if stream_group and stream_name in self._active_stream_names: # Add back to the END of the queue for retry later self._stream_instances_to_start_partition_generation.append(stream) @@ -316,7 +317,6 @@ def start_next_partition_generator(self) -> Optional[AirbyteMessage]: self._logger.debug(f"Added '{stream_name}' to active group '{stream_group}'") # Also mark all parent streams as active (they will be read from during partition generation) - parent_streams = self._collect_all_parent_stream_names(stream_name) for parent_stream_name in parent_streams: parent_group = self._stream_block_simultaneous_read.get(parent_stream_name, "") if parent_group: diff --git a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml index 3f10a1bf5..8d0d8d4fa 100644 --- a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml +++ b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml @@ -85,11 +85,11 @@ definitions: streams: title: Streams description: > - List of references to streams that belong to this group. 
Use JSON references - to stream definitions (e.g., "#/definitions/my_stream"). + List of references to streams that belong to this group. type: array items: - type: string + anyOf: + - "$ref": "#/definitions/DeclarativeStream" action: title: Action description: The action to apply to streams in this group. From 8d3b1f2057b409bfceeed34d87ded6a4ea734cc9 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 4 Mar 2026 16:44:09 +0000 Subject: [PATCH 18/31] refactor: add get_partition_router() helper to DefaultStream Replace hasattr chain in ConcurrentReadProcessor._collect_all_parent_stream_names with DefaultStream.get_partition_router() that safely traverses the internal partition_generator -> stream_slicer -> partition_router chain using isinstance checks. Co-Authored-By: unknown <> --- .../concurrent_read_processor.py | 38 +++++++------------ .../streams/concurrent/default_stream.py | 21 +++++++++- 2 files changed, 33 insertions(+), 26 deletions(-) diff --git a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py index 5be8afaa3..69bbf8af2 100644 --- a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +++ b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py @@ -371,42 +371,30 @@ def _is_stream_done(self, stream_name: str) -> bool: return stream_name in self._streams_done def _collect_all_parent_stream_names(self, stream_name: str) -> Set[str]: - """ - Recursively collect all parent stream names for a given stream. - For example, if we have: epics -> issues -> comments - Then for comments, this returns {issues, epics} + """Recursively collect all parent stream names for a given stream. 
- :param stream_name: The stream to collect parents for - :return: Set of all parent stream names (recursively) + For example, if we have: epics -> issues -> comments + Then for comments, this returns {issues, epics}. """ + from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import ( + SubstreamPartitionRouter, + ) + from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream + parent_names: Set[str] = set() stream = self._stream_name_to_instance.get(stream_name) if not stream: return parent_names - # Get partition router if it exists (this is where parent streams are defined) - partition_router = None - - # Try DefaultStream path first (_stream_partition_generator._stream_slicer._partition_router) - if ( - hasattr(stream, "_stream_partition_generator") - and hasattr(stream._stream_partition_generator, "_stream_slicer") - and hasattr(stream._stream_partition_generator._stream_slicer, "_partition_router") - ): - partition_router = stream._stream_partition_generator._stream_slicer._partition_router - # Fallback to legacy path (retriever.partition_router) for backward compatibility and test mocks - elif hasattr(stream, "retriever") and hasattr(stream.retriever, "partition_router"): - partition_router = stream.retriever.partition_router + partition_router = ( + stream.get_partition_router() if isinstance(stream, DefaultStream) else None + ) - # SubstreamPartitionRouter has parent_stream_configs - if partition_router and hasattr(partition_router, "parent_stream_configs"): + if isinstance(partition_router, SubstreamPartitionRouter): for parent_config in partition_router.parent_stream_configs: - parent_stream = parent_config.stream - parent_name = parent_stream.name + parent_name = parent_config.stream.name parent_names.add(parent_name) - - # Recursively collect grandparents, great-grandparents, etc. 
parent_names.update(self._collect_all_parent_stream_names(parent_name)) return parent_names diff --git a/airbyte_cdk/sources/streams/concurrent/default_stream.py b/airbyte_cdk/sources/streams/concurrent/default_stream.py index d679fe0ee..4c7267315 100644 --- a/airbyte_cdk/sources/streams/concurrent/default_stream.py +++ b/airbyte_cdk/sources/streams/concurrent/default_stream.py @@ -3,7 +3,10 @@ # from logging import Logger -from typing import Any, Callable, Iterable, List, Mapping, Optional, Union +from typing import TYPE_CHECKING, Any, Callable, Iterable, List, Mapping, Optional, Union + +if TYPE_CHECKING: + from airbyte_cdk.sources.declarative.partition_routers.partition_router import PartitionRouter from airbyte_cdk.models import AirbyteStream, SyncMode from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream @@ -105,6 +108,22 @@ def block_simultaneous_read(self) -> str: def block_simultaneous_read(self, value: str) -> None: self._block_simultaneous_read = value + def get_partition_router(self) -> "PartitionRouter | None": + """Return the partition router for this stream, or None if not available.""" + from airbyte_cdk.sources.declarative.incremental.concurrent_partition_cursor import ( + ConcurrentPerPartitionCursor, + ) + from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import ( + StreamSlicerPartitionGenerator, + ) + + if not isinstance(self._stream_partition_generator, StreamSlicerPartitionGenerator): + return None + stream_slicer = self._stream_partition_generator._stream_slicer + if not isinstance(stream_slicer, ConcurrentPerPartitionCursor): + return None + return stream_slicer._partition_router + def check_availability(self) -> StreamAvailability: """ Check stream availability by attempting to read the first record of the stream. 
From 77b25c5d5f643bc0fcb8ff539c25a5e5e2408324 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 4 Mar 2026 16:52:33 +0000 Subject: [PATCH 19/31] feat: validate no parent-child streams share a group to prevent deadlock _apply_stream_groups now checks that no stream shares a group with any of its parent streams (via get_partition_router). Raises ValueError at config time if a deadlock-causing configuration is detected. Co-Authored-By: unknown <> --- .../concurrent_declarative_source.py | 24 +++- .../test_concurrent_declarative_source.py | 116 ++++++++++++++++++ 2 files changed, 139 insertions(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py index fae94c7b7..15fa40fe4 100644 --- a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py +++ b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py @@ -430,8 +430,13 @@ def _apply_stream_groups(self, streams: List[AbstractStream]) -> None: """Set block_simultaneous_read on streams based on the manifest's stream_groups config. Iterates over the resolved manifest's stream_groups and matches group membership - against actual created stream instances by name. + against actual created stream instances by name. Validates that no stream shares a + group with any of its parent streams, which would cause a deadlock. 
""" + from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import ( + SubstreamPartitionRouter, + ) + stream_groups = self._source_config.get("stream_groups", {}) if not stream_groups: return @@ -445,6 +450,23 @@ def _apply_stream_groups(self, streams: List[AbstractStream]) -> None: if stream_name: stream_name_to_group[stream_name] = group_name + # Validate no stream shares a group with its parent streams + for stream in streams: + if not isinstance(stream, DefaultStream) or stream.name not in stream_name_to_group: + continue + partition_router = stream.get_partition_router() + if not isinstance(partition_router, SubstreamPartitionRouter): + continue + group_name = stream_name_to_group[stream.name] + for parent_config in partition_router.parent_stream_configs: + parent_name = parent_config.stream.name + if stream_name_to_group.get(parent_name) == group_name: + raise ValueError( + f"Stream '{stream.name}' and its parent stream '{parent_name}' " + f"are both in group '{group_name}'. " + f"A child stream must not share a group with its parent to avoid deadlock." 
+ ) + # Apply group to matching stream instances for stream in streams: if isinstance(stream, DefaultStream) and stream.name in stream_name_to_group: diff --git a/unit_tests/sources/declarative/test_concurrent_declarative_source.py b/unit_tests/sources/declarative/test_concurrent_declarative_source.py index 0e07b20f0..f1b4143a3 100644 --- a/unit_tests/sources/declarative/test_concurrent_declarative_source.py +++ b/unit_tests/sources/declarative/test_concurrent_declarative_source.py @@ -5171,6 +5171,75 @@ def _make_default_stream(name: str) -> DefaultStream: ) +def _make_child_stream_with_parent(child_name: str, parent_stream: DefaultStream) -> DefaultStream: + """Create a DefaultStream that has a SubstreamPartitionRouter pointing to parent_stream.""" + from airbyte_cdk.sources.declarative.incremental.concurrent_partition_cursor import ( + ConcurrentCursorFactory, + ConcurrentPerPartitionCursor, + ) + from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import ( + ParentStreamConfig, + SubstreamPartitionRouter, + ) + from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import ( + DeclarativePartitionFactory, + StreamSlicerPartitionGenerator, + ) + from airbyte_cdk.sources.streams.concurrent.cursor import FinalStateCursor + from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import ( + EpochValueConcurrentStreamStateConverter, + ) + + partition_router = SubstreamPartitionRouter( + parent_stream_configs=[ + ParentStreamConfig( + stream=parent_stream, + parent_key="id", + partition_field="parent_id", + config={}, + parameters={}, + ) + ], + config={}, + parameters={}, + ) + + cursor_factory = ConcurrentCursorFactory(lambda *args, **kwargs: Mock()) + message_repository = InMemoryMessageRepository() + state_converter = EpochValueConcurrentStreamStateConverter() + + per_partition_cursor = ConcurrentPerPartitionCursor( + cursor_factory=cursor_factory, + 
partition_router=partition_router, + stream_name=child_name, + stream_namespace=None, + stream_state={}, + message_repository=message_repository, + connector_state_manager=Mock(), + connector_state_converter=state_converter, + cursor_field=Mock(cursor_field_key="updated_at"), + ) + + partition_factory = Mock(spec=DeclarativePartitionFactory) + partition_generator = StreamSlicerPartitionGenerator( + partition_factory=partition_factory, + stream_slicer=per_partition_cursor, + ) + + cursor = FinalStateCursor( + stream_name=child_name, stream_namespace=None, message_repository=message_repository + ) + return DefaultStream( + partition_generator=partition_generator, + name=child_name, + json_schema={}, + primary_key=[], + cursor_field=None, + logger=logging.getLogger(f"test.{child_name}"), + cursor=cursor, + ) + + @pytest.mark.parametrize( "source_config,stream_names,expected_groups", [ @@ -5235,3 +5304,50 @@ def test_apply_stream_groups(source_config, stream_names, expected_groups): for stream in streams: assert stream.block_simultaneous_read == expected_groups[stream.name] + + +def test_apply_stream_groups_raises_on_parent_child_in_same_group(): + """Test _apply_stream_groups raises ValueError when a child and its parent are in the same group.""" + parent = _make_default_stream("parent_stream") + child = _make_child_stream_with_parent("child_stream", parent) + + source = Mock() + source._source_config = { + "stream_groups": { + "my_group": { + "streams": [ + {"name": "parent_stream", "type": "DeclarativeStream"}, + {"name": "child_stream", "type": "DeclarativeStream"}, + ], + "action": {"type": "BlockSimultaneousSyncsAction"}, + } + } + } + + with pytest.raises(ValueError, match="child stream must not share a group with its parent"): + ConcurrentDeclarativeSource._apply_stream_groups(source, [parent, child]) + + +def test_apply_stream_groups_allows_parent_child_in_different_groups(): + """Test _apply_stream_groups allows a child and its parent in different groups.""" 
+ parent = _make_default_stream("parent_stream") + child = _make_child_stream_with_parent("child_stream", parent) + + source = Mock() + source._source_config = { + "stream_groups": { + "group_a": { + "streams": [{"name": "parent_stream", "type": "DeclarativeStream"}], + "action": {"type": "BlockSimultaneousSyncsAction"}, + }, + "group_b": { + "streams": [{"name": "child_stream", "type": "DeclarativeStream"}], + "action": {"type": "BlockSimultaneousSyncsAction"}, + }, + } + } + + ConcurrentDeclarativeSource._apply_stream_groups(source, [parent, child]) + + assert parent.block_simultaneous_read == "group_a" + assert child.block_simultaneous_read == "group_b" From 3014458bce579169b054b0ebd63915e6edf77f5d Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 4 Mar 2026 17:09:13 +0000 Subject: [PATCH 20/31] feat: assert partition generation queue is empty when all streams are done Adds a safety check in is_done() that raises AirbyteTracedException (system_error) if streams remain in the partition generation queue after all streams are marked done. Also moves inline imports to module level and updates test mocks to use DefaultStream with get_partition_router(). 
Co-Authored-By: unknown <> --- .../concurrent_read_processor.py | 18 ++++-- .../test_concurrent_read_processor.py | 60 ++++++++++++++++--- 2 files changed, 65 insertions(+), 13 deletions(-) diff --git a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py index 69bbf8af2..9a8f7d942 100644 --- a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +++ b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py @@ -13,8 +13,12 @@ ) from airbyte_cdk.sources.concurrent_source.stream_thread_exception import StreamThreadException from airbyte_cdk.sources.concurrent_source.thread_pool_manager import ThreadPoolManager +from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import ( + SubstreamPartitionRouter, +) from airbyte_cdk.sources.message import MessageRepository from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream +from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream from airbyte_cdk.sources.streams.concurrent.partition_enqueuer import PartitionEnqueuer from airbyte_cdk.sources.streams.concurrent.partition_reader import PartitionReader from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition @@ -354,6 +358,15 @@ def is_done(self) -> bool: for stream_name in self._stream_name_to_instance.keys() ] ) + if is_done and self._stream_instances_to_start_partition_generation: + stuck_stream_names = [ + s.name for s in self._stream_instances_to_start_partition_generation + ] + raise AirbyteTracedException( + message="Partition generation queue is not empty after all streams completed.", + internal_message=f"Streams {stuck_stream_names} remained in the partition generation queue after all streams were marked done.", + failure_type=FailureType.system_error, + ) if is_done and self._exceptions_per_stream_name: error_message = 
generate_failed_streams_error_message(self._exceptions_per_stream_name) self._logger.info(error_message) @@ -376,11 +389,6 @@ def _collect_all_parent_stream_names(self, stream_name: str) -> Set[str]: For example, if we have: epics -> issues -> comments Then for comments, this returns {issues, epics}. """ - from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import ( - SubstreamPartitionRouter, - ) - from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream - parent_names: Set[str] = set() stream = self._stream_name_to_instance.get(stream_name) diff --git a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py index 788478ae6..4cdc72d45 100644 --- a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +++ b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py @@ -28,8 +28,12 @@ ) from airbyte_cdk.sources.concurrent_source.stream_thread_exception import StreamThreadException from airbyte_cdk.sources.concurrent_source.thread_pool_manager import ThreadPoolManager +from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import ( + SubstreamPartitionRouter, +) from airbyte_cdk.sources.message import LogMessage, MessageRepository from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream +from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream from airbyte_cdk.sources.streams.concurrent.partition_enqueuer import PartitionEnqueuer from airbyte_cdk.sources.streams.concurrent.partition_reader import PartitionReader from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition @@ -822,18 +826,22 @@ def _create_mock_stream(self, name: str, block_simultaneous_read: str = ""): def _create_mock_stream_with_parent( self, name: str, parent_stream, block_simultaneous_read: str = "" ): - """Helper to create a mock 
stream with a parent stream""" - stream = self._create_mock_stream(name, block_simultaneous_read) + """Helper to create a mock stream with a parent stream.""" + stream = Mock(spec=DefaultStream) + stream.name = name + stream.block_simultaneous_read = block_simultaneous_read + stream.as_airbyte_stream.return_value = AirbyteStream( + name=name, + json_schema={}, + supported_sync_modes=[SyncMode.full_refresh], + ) + stream.cursor.ensure_at_least_one_state_emitted = Mock() - # Mock the retriever and partition router for parent relationship - mock_retriever = Mock() - mock_partition_router = Mock() + mock_partition_router = Mock(spec=SubstreamPartitionRouter) mock_parent_config = Mock() mock_parent_config.stream = parent_stream - mock_partition_router.parent_stream_configs = [mock_parent_config] - mock_retriever.partition_router = mock_partition_router - stream.retriever = mock_retriever + stream.get_partition_router.return_value = mock_partition_router return stream @@ -1396,3 +1404,39 @@ def test_child_starts_after_parent_completes_via_partition_complete_sentinel(sel ] assert len(started_messages) == 1 assert started_messages[0].trace.stream_status.stream_descriptor.name == "child" + + +def test_is_done_raises_when_partition_generation_queue_not_empty(): + """Test is_done raises AirbyteTracedException if streams remain in the partition generation queue.""" + partition_enqueuer = Mock(spec=PartitionEnqueuer) + thread_pool_manager = Mock(spec=ThreadPoolManager) + logger = Mock(spec=logging.Logger) + slice_logger = Mock(spec=SliceLogger) + message_repository = Mock(spec=MessageRepository) + message_repository.consume_queue.return_value = [] + partition_reader = Mock(spec=PartitionReader) + + stream = Mock(spec=AbstractStream) + stream.name = "stuck_stream" + stream.block_simultaneous_read = "" + stream.as_airbyte_stream.return_value = AirbyteStream( + name="stuck_stream", + json_schema={}, + supported_sync_modes=[SyncMode.full_refresh], + ) + + handler = 
ConcurrentReadProcessor( + [stream], + partition_enqueuer, + thread_pool_manager, + logger, + slice_logger, + message_repository, + partition_reader, + ) + + # Artificially mark the stream as done without removing it from the partition generation queue + handler._streams_done.add("stuck_stream") + + with pytest.raises(AirbyteTracedException, match="remained in the partition generation queue"): + handler.is_done() From 48711d3994ef1c9f3caf3404b39e84d93d059fd9 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 4 Mar 2026 17:14:55 +0000 Subject: [PATCH 21/31] refactor: move inline imports to module level in default_stream.py and concurrent_declarative_source.py Co-Authored-By: unknown <> --- .../concurrent_declarative_source.py | 7 +++---- .../streams/concurrent/default_stream.py | 21 ++++++++----------- 2 files changed, 12 insertions(+), 16 deletions(-) diff --git a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py index 15fa40fe4..5ae1d232d 100644 --- a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py +++ b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py @@ -76,6 +76,9 @@ from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import ( ModelToComponentFactory, ) +from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import ( + SubstreamPartitionRouter, +) from airbyte_cdk.sources.declarative.resolvers import COMPONENTS_RESOLVER_TYPE_MAPPING from airbyte_cdk.sources.declarative.spec.spec import Spec from airbyte_cdk.sources.declarative.types import Config, ConnectionDefinition @@ -433,10 +436,6 @@ def _apply_stream_groups(self, streams: List[AbstractStream]) -> None: against actual created stream instances by name. Validates that no stream shares a group with any of its parent streams, which would cause a deadlock. 
""" - from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import ( - SubstreamPartitionRouter, - ) - stream_groups = self._source_config.get("stream_groups", {}) if not stream_groups: return diff --git a/airbyte_cdk/sources/streams/concurrent/default_stream.py b/airbyte_cdk/sources/streams/concurrent/default_stream.py index 4c7267315..6cc6e44d4 100644 --- a/airbyte_cdk/sources/streams/concurrent/default_stream.py +++ b/airbyte_cdk/sources/streams/concurrent/default_stream.py @@ -3,12 +3,16 @@ # from logging import Logger -from typing import TYPE_CHECKING, Any, Callable, Iterable, List, Mapping, Optional, Union - -if TYPE_CHECKING: - from airbyte_cdk.sources.declarative.partition_routers.partition_router import PartitionRouter +from typing import Any, Callable, Iterable, List, Mapping, Optional, Union from airbyte_cdk.models import AirbyteStream, SyncMode +from airbyte_cdk.sources.declarative.incremental.concurrent_partition_cursor import ( + ConcurrentPerPartitionCursor, +) +from airbyte_cdk.sources.declarative.partition_routers.partition_router import PartitionRouter +from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import ( + StreamSlicerPartitionGenerator, +) from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream from airbyte_cdk.sources.streams.concurrent.availability_strategy import StreamAvailability from airbyte_cdk.sources.streams.concurrent.cursor import Cursor, CursorField @@ -108,15 +112,8 @@ def block_simultaneous_read(self) -> str: def block_simultaneous_read(self, value: str) -> None: self._block_simultaneous_read = value - def get_partition_router(self) -> "PartitionRouter | None": + def get_partition_router(self) -> PartitionRouter | None: """Return the partition router for this stream, or None if not available.""" - from airbyte_cdk.sources.declarative.incremental.concurrent_partition_cursor import ( - ConcurrentPerPartitionCursor, - ) - from 
airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import ( - StreamSlicerPartitionGenerator, - ) - if not isinstance(self._stream_partition_generator, StreamSlicerPartitionGenerator): return None stream_slicer = self._stream_partition_generator._stream_slicer From 5e4ed3868dd91d61b8d71908ff1d05efff816057 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 4 Mar 2026 17:26:23 +0000 Subject: [PATCH 22/31] fix: unwrap GroupingPartitionRouter in get_partition_router() to detect parent streams Co-Authored-By: unknown <> --- .../sources/streams/concurrent/default_stream.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/airbyte_cdk/sources/streams/concurrent/default_stream.py b/airbyte_cdk/sources/streams/concurrent/default_stream.py index 6cc6e44d4..a896cfbad 100644 --- a/airbyte_cdk/sources/streams/concurrent/default_stream.py +++ b/airbyte_cdk/sources/streams/concurrent/default_stream.py @@ -9,6 +9,9 @@ from airbyte_cdk.sources.declarative.incremental.concurrent_partition_cursor import ( ConcurrentPerPartitionCursor, ) +from airbyte_cdk.sources.declarative.partition_routers.grouping_partition_router import ( + GroupingPartitionRouter, +) from airbyte_cdk.sources.declarative.partition_routers.partition_router import PartitionRouter from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import ( StreamSlicerPartitionGenerator, @@ -113,13 +116,20 @@ def block_simultaneous_read(self, value: str) -> None: self._block_simultaneous_read = value def get_partition_router(self) -> PartitionRouter | None: - """Return the partition router for this stream, or None if not available.""" + """Return the partition router for this stream, or None if not available. + + If the router is a GroupingPartitionRouter, unwraps it to return the + underlying router so callers can inspect parent stream relationships. 
+ """ if not isinstance(self._stream_partition_generator, StreamSlicerPartitionGenerator): return None stream_slicer = self._stream_partition_generator._stream_slicer if not isinstance(stream_slicer, ConcurrentPerPartitionCursor): return None - return stream_slicer._partition_router + router = stream_slicer._partition_router + if isinstance(router, GroupingPartitionRouter): + return router.underlying_partition_router + return router def check_availability(self) -> StreamAvailability: """ From 0e710d272a8b76f1a64f274dcb353122cd5f0dd7 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 4 Mar 2026 17:29:42 +0000 Subject: [PATCH 23/31] fix: handle GroupingPartitionRouter at call sites instead of in get_partition_router() Co-Authored-By: unknown <> --- .../concurrent_source/concurrent_read_processor.py | 5 +++++ .../declarative/concurrent_declarative_source.py | 5 +++++ .../sources/streams/concurrent/default_stream.py | 14 ++------------ 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py index 9a8f7d942..53ddf4994 100644 --- a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +++ b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py @@ -13,6 +13,9 @@ ) from airbyte_cdk.sources.concurrent_source.stream_thread_exception import StreamThreadException from airbyte_cdk.sources.concurrent_source.thread_pool_manager import ThreadPoolManager +from airbyte_cdk.sources.declarative.partition_routers.grouping_partition_router import ( + GroupingPartitionRouter, +) from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import ( SubstreamPartitionRouter, ) @@ -398,6 +401,8 @@ def _collect_all_parent_stream_names(self, stream_name: str) -> Set[str]: partition_router = ( stream.get_partition_router() if isinstance(stream, 
DefaultStream) else None ) + if isinstance(partition_router, GroupingPartitionRouter): + partition_router = partition_router.underlying_partition_router if isinstance(partition_router, SubstreamPartitionRouter): for parent_config in partition_router.parent_stream_configs: diff --git a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py index 5ae1d232d..6d3b570ab 100644 --- a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py +++ b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py @@ -76,6 +76,9 @@ from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import ( ModelToComponentFactory, ) +from airbyte_cdk.sources.declarative.partition_routers.grouping_partition_router import ( + GroupingPartitionRouter, +) from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import ( SubstreamPartitionRouter, ) @@ -454,6 +457,8 @@ def _apply_stream_groups(self, streams: List[AbstractStream]) -> None: if not isinstance(stream, DefaultStream) or stream.name not in stream_name_to_group: continue partition_router = stream.get_partition_router() + if isinstance(partition_router, GroupingPartitionRouter): + partition_router = partition_router.underlying_partition_router if not isinstance(partition_router, SubstreamPartitionRouter): continue group_name = stream_name_to_group[stream.name] diff --git a/airbyte_cdk/sources/streams/concurrent/default_stream.py b/airbyte_cdk/sources/streams/concurrent/default_stream.py index a896cfbad..6cc6e44d4 100644 --- a/airbyte_cdk/sources/streams/concurrent/default_stream.py +++ b/airbyte_cdk/sources/streams/concurrent/default_stream.py @@ -9,9 +9,6 @@ from airbyte_cdk.sources.declarative.incremental.concurrent_partition_cursor import ( ConcurrentPerPartitionCursor, ) -from airbyte_cdk.sources.declarative.partition_routers.grouping_partition_router import ( - GroupingPartitionRouter, -) from 
airbyte_cdk.sources.declarative.partition_routers.partition_router import PartitionRouter from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import ( StreamSlicerPartitionGenerator, @@ -116,20 +113,13 @@ def block_simultaneous_read(self, value: str) -> None: self._block_simultaneous_read = value def get_partition_router(self) -> PartitionRouter | None: - """Return the partition router for this stream, or None if not available. - - If the router is a GroupingPartitionRouter, unwraps it to return the - underlying router so callers can inspect parent stream relationships. - """ + """Return the partition router for this stream, or None if not available.""" if not isinstance(self._stream_partition_generator, StreamSlicerPartitionGenerator): return None stream_slicer = self._stream_partition_generator._stream_slicer if not isinstance(stream_slicer, ConcurrentPerPartitionCursor): return None - router = stream_slicer._partition_router - if isinstance(router, GroupingPartitionRouter): - return router.underlying_partition_router - return router + return stream_slicer._partition_router def check_availability(self) -> StreamAvailability: """ From a0fbd891b71f89343bba1780049edf6fa6d0ebe5 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 4 Mar 2026 19:34:21 +0000 Subject: [PATCH 24/31] feat: check active_groups is empty in is_done() safety check Co-Authored-By: unknown <> --- .../sources/concurrent_source/concurrent_read_processor.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py index 53ddf4994..cc39f888f 100644 --- a/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +++ b/airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py @@ -370,6 +370,12 @@ def is_done(self) -> bool: internal_message=f"Streams 
{stuck_stream_names} remained in the partition generation queue after all streams were marked done.", failure_type=FailureType.system_error, ) + if is_done and self._active_groups: + raise AirbyteTracedException( + message="Active stream groups are not empty after all streams completed.", + internal_message=f"Groups {dict(self._active_groups)} still active after all streams were marked done.", + failure_type=FailureType.system_error, + ) if is_done and self._exceptions_per_stream_name: error_message = generate_failed_streams_error_message(self._exceptions_per_stream_name) self._logger.info(error_message) From d8ef4fa00dc90c24fb85525dbfc06b9c9299759b Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 4 Mar 2026 19:38:37 +0000 Subject: [PATCH 25/31] test: add missing unit tests for GroupingPartitionRouter, active_groups check, and get_partition_router Co-Authored-By: unknown <> --- .../test_concurrent_declarative_source.py | 144 ++++++++++++++++++ .../test_concurrent_read_processor.py | 85 +++++++++++ 2 files changed, 229 insertions(+) diff --git a/unit_tests/sources/declarative/test_concurrent_declarative_source.py b/unit_tests/sources/declarative/test_concurrent_declarative_source.py index f1b4143a3..ca8e474f5 100644 --- a/unit_tests/sources/declarative/test_concurrent_declarative_source.py +++ b/unit_tests/sources/declarative/test_concurrent_declarative_source.py @@ -5351,3 +5351,147 @@ def test_apply_stream_groups_allows_parent_child_in_different_groups(): assert parent.block_simultaneous_read == "group_a" assert child.block_simultaneous_read == "group_b" + + +def _make_child_stream_with_grouping_router( + child_name: str, parent_stream: DefaultStream +) -> DefaultStream: + """Create a DefaultStream with GroupingPartitionRouter wrapping SubstreamPartitionRouter.""" + from airbyte_cdk.sources.declarative.incremental.concurrent_partition_cursor import ( + ConcurrentCursorFactory, + 
ConcurrentPerPartitionCursor, + ) + from airbyte_cdk.sources.declarative.partition_routers.grouping_partition_router import ( + GroupingPartitionRouter, + ) + from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import ( + ParentStreamConfig, + SubstreamPartitionRouter, + ) + from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import ( + DeclarativePartitionFactory, + StreamSlicerPartitionGenerator, + ) + from airbyte_cdk.sources.streams.concurrent.cursor import FinalStateCursor + from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import ( + EpochValueConcurrentStreamStateConverter, + ) + + substream_router = SubstreamPartitionRouter( + parent_stream_configs=[ + ParentStreamConfig( + stream=parent_stream, + parent_key="id", + partition_field="parent_id", + config={}, + parameters={}, + ) + ], + config={}, + parameters={}, + ) + + grouping_router = GroupingPartitionRouter( + group_size=10, + underlying_partition_router=substream_router, + config={}, + ) + + cursor_factory = ConcurrentCursorFactory(lambda *args, **kwargs: Mock()) + message_repository = InMemoryMessageRepository() + state_converter = EpochValueConcurrentStreamStateConverter() + + per_partition_cursor = ConcurrentPerPartitionCursor( + cursor_factory=cursor_factory, + partition_router=grouping_router, + stream_name=child_name, + stream_namespace=None, + stream_state={}, + message_repository=message_repository, + connector_state_manager=Mock(), + connector_state_converter=state_converter, + cursor_field=Mock(cursor_field_key="updated_at"), + ) + + partition_factory = Mock(spec=DeclarativePartitionFactory) + partition_generator = StreamSlicerPartitionGenerator( + partition_factory=partition_factory, + stream_slicer=per_partition_cursor, + ) + + cursor = FinalStateCursor( + stream_name=child_name, stream_namespace=None, message_repository=message_repository + ) + return DefaultStream( + 
partition_generator=partition_generator, + name=child_name, + json_schema={}, + primary_key=[], + cursor_field=None, + logger=logging.getLogger(f"test.{child_name}"), + cursor=cursor, + ) + + +def test_apply_stream_groups_raises_on_parent_child_in_same_group_with_grouping_router(): + """Test _apply_stream_groups detects deadlock when GroupingPartitionRouter wraps SubstreamPartitionRouter.""" + parent = _make_default_stream("parent_stream") + child = _make_child_stream_with_grouping_router("child_stream", parent) + + source = Mock() + source._source_config = { + "stream_groups": { + "my_group": { + "streams": [ + {"name": "parent_stream", "type": "DeclarativeStream"}, + {"name": "child_stream", "type": "DeclarativeStream"}, + ], + "action": {"type": "BlockSimultaneousSyncsAction"}, + } + } + } + + with pytest.raises(ValueError, match="child stream must not share a group with its parent"): + ConcurrentDeclarativeSource._apply_stream_groups(source, [parent, child]) + + +@pytest.mark.parametrize( + "stream_factory,expected_type", + [ + pytest.param( + lambda: _make_default_stream("plain_stream"), + type(None), + id="no_partition_router_returns_none", + ), + pytest.param( + lambda: _make_child_stream_with_parent("child", _make_default_stream("parent")), + "SubstreamPartitionRouter", + id="substream_returns_substream_router", + ), + pytest.param( + lambda: _make_child_stream_with_grouping_router( + "child", _make_default_stream("parent") + ), + "GroupingPartitionRouter", + id="grouping_returns_grouping_router", + ), + ], +) +def test_get_partition_router(stream_factory, expected_type): + """Test DefaultStream.get_partition_router returns the correct router type.""" + from airbyte_cdk.sources.declarative.partition_routers.grouping_partition_router import ( + GroupingPartitionRouter, + ) + from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import ( + SubstreamPartitionRouter, + ) + + stream = stream_factory() + router = 
stream.get_partition_router() + + if expected_type is type(None): + assert router is None + elif expected_type == "SubstreamPartitionRouter": + assert isinstance(router, SubstreamPartitionRouter) + elif expected_type == "GroupingPartitionRouter": + assert isinstance(router, GroupingPartitionRouter) diff --git a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py index 4cdc72d45..acfa03129 100644 --- a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +++ b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py @@ -28,6 +28,9 @@ ) from airbyte_cdk.sources.concurrent_source.stream_thread_exception import StreamThreadException from airbyte_cdk.sources.concurrent_source.thread_pool_manager import ThreadPoolManager +from airbyte_cdk.sources.declarative.partition_routers.grouping_partition_router import ( + GroupingPartitionRouter, +) from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import ( SubstreamPartitionRouter, ) @@ -1440,3 +1443,85 @@ def test_is_done_raises_when_partition_generation_queue_not_empty(): with pytest.raises(AirbyteTracedException, match="remained in the partition generation queue"): handler.is_done() + + +def test_is_done_raises_when_active_groups_not_empty(): + """Test is_done raises AirbyteTracedException if active groups remain after all streams complete.""" + partition_enqueuer = Mock(spec=PartitionEnqueuer) + thread_pool_manager = Mock(spec=ThreadPoolManager) + logger = Mock(spec=logging.Logger) + slice_logger = Mock(spec=SliceLogger) + message_repository = Mock(spec=MessageRepository) + message_repository.consume_queue.return_value = [] + partition_reader = Mock(spec=PartitionReader) + + stream = Mock(spec=AbstractStream) + stream.name = "stuck_stream" + stream.block_simultaneous_read = "my_group" + stream.as_airbyte_stream.return_value = AirbyteStream( + name="stuck_stream", + 
json_schema={}, + supported_sync_modes=[SyncMode.full_refresh], + ) + + handler = ConcurrentReadProcessor( + [stream], + partition_enqueuer, + thread_pool_manager, + logger, + slice_logger, + message_repository, + partition_reader, + ) + + # Mark stream as done but leave the group active (simulating a bug) + handler._streams_done.add("stuck_stream") + handler._stream_instances_to_start_partition_generation.clear() + handler._active_groups["my_group"] = {"stuck_stream"} + + with pytest.raises( + AirbyteTracedException, match="still active after all streams were marked done" + ): + handler.is_done() + + +def test_collect_parent_stream_names_unwraps_grouping_partition_router(): + """Test _collect_all_parent_stream_names unwraps GroupingPartitionRouter to find parents.""" + partition_enqueuer = Mock(spec=PartitionEnqueuer) + thread_pool_manager = Mock(spec=ThreadPoolManager) + logger = Mock(spec=logging.Logger) + slice_logger = Mock(spec=SliceLogger) + message_repository = Mock(spec=MessageRepository) + message_repository.consume_queue.return_value = [] + partition_reader = Mock(spec=PartitionReader) + + parent_stream = Mock(spec=AbstractStream) + parent_stream.name = "parent" + parent_stream.block_simultaneous_read = "" + + # Child has a GroupingPartitionRouter wrapping a SubstreamPartitionRouter + child_stream = Mock(spec=DefaultStream) + child_stream.name = "child" + child_stream.block_simultaneous_read = "" + + mock_substream_router = Mock(spec=SubstreamPartitionRouter) + mock_parent_config = Mock() + mock_parent_config.stream = parent_stream + mock_substream_router.parent_stream_configs = [mock_parent_config] + + mock_grouping_router = Mock(spec=GroupingPartitionRouter) + mock_grouping_router.underlying_partition_router = mock_substream_router + child_stream.get_partition_router.return_value = mock_grouping_router + + handler = ConcurrentReadProcessor( + [parent_stream, child_stream], + partition_enqueuer, + thread_pool_manager, + logger, + slice_logger, + 
message_repository, + partition_reader, + ) + + parent_names = handler._collect_all_parent_stream_names("child") + assert parent_names == {"parent"} From f0003132fde30b7fe2aac520dc818104fb5f5ef2 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 4 Mar 2026 20:04:25 +0000 Subject: [PATCH 26/31] fix: make deadlock validation check all ancestors, not just direct parents Co-Authored-By: unknown <> --- .../concurrent_declarative_source.py | 33 +++++++++++++------ .../test_concurrent_declarative_source.py | 23 +++++++++++++ 2 files changed, 46 insertions(+), 10 deletions(-) diff --git a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py index 6d3b570ab..292615692 100644 --- a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py +++ b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py @@ -452,21 +452,34 @@ def _apply_stream_groups(self, streams: List[AbstractStream]) -> None: if stream_name: stream_name_to_group[stream_name] = group_name - # Validate no stream shares a group with its parent streams + # Validate no stream shares a group with any of its ancestor streams + stream_name_to_instance: Dict[str, AbstractStream] = {s.name: s for s in streams} + + def _collect_all_ancestor_names(stream_name: str) -> Set[str]: + """Recursively collect all ancestor stream names.""" + ancestors: Set[str] = set() + inst = stream_name_to_instance.get(stream_name) + if not isinstance(inst, DefaultStream): + return ancestors + router = inst.get_partition_router() + if isinstance(router, GroupingPartitionRouter): + router = router.underlying_partition_router + if not isinstance(router, SubstreamPartitionRouter): + return ancestors + for parent_config in router.parent_stream_configs: + parent_name = parent_config.stream.name + ancestors.add(parent_name) + ancestors.update(_collect_all_ancestor_names(parent_name)) + return 
ancestors + for stream in streams: if not isinstance(stream, DefaultStream) or stream.name not in stream_name_to_group: continue - partition_router = stream.get_partition_router() - if isinstance(partition_router, GroupingPartitionRouter): - partition_router = partition_router.underlying_partition_router - if not isinstance(partition_router, SubstreamPartitionRouter): - continue group_name = stream_name_to_group[stream.name] - for parent_config in partition_router.parent_stream_configs: - parent_name = parent_config.stream.name - if stream_name_to_group.get(parent_name) == group_name: + for ancestor_name in _collect_all_ancestor_names(stream.name): + if stream_name_to_group.get(ancestor_name) == group_name: raise ValueError( - f"Stream '{stream.name}' and its parent stream '{parent_name}' " + f"Stream '{stream.name}' and its parent stream '{ancestor_name}' " f"are both in group '{group_name}'. " f"A child stream must not share a group with its parent to avoid deadlock." ) diff --git a/unit_tests/sources/declarative/test_concurrent_declarative_source.py b/unit_tests/sources/declarative/test_concurrent_declarative_source.py index ca8e474f5..bf1f61610 100644 --- a/unit_tests/sources/declarative/test_concurrent_declarative_source.py +++ b/unit_tests/sources/declarative/test_concurrent_declarative_source.py @@ -5433,6 +5433,29 @@ def _make_child_stream_with_grouping_router( ) +def test_apply_stream_groups_raises_on_grandparent_child_in_same_group(): + """Test _apply_stream_groups detects deadlock when a grandchild and grandparent share a group.""" + grandparent = _make_default_stream("grandparent_stream") + parent = _make_child_stream_with_parent("parent_stream", grandparent) + child = _make_child_stream_with_parent("child_stream", parent) + + source = Mock() + source._source_config = { + "stream_groups": { + "my_group": { + "streams": [ + {"name": "grandparent_stream", "type": "DeclarativeStream"}, + {"name": "child_stream", "type": "DeclarativeStream"}, + ], + 
"action": {"type": "BlockSimultaneousSyncsAction"}, + } + } + } + + with pytest.raises(ValueError, match="child stream must not share a group with its parent"): + ConcurrentDeclarativeSource._apply_stream_groups(source, [grandparent, parent, child]) + + def test_apply_stream_groups_raises_on_parent_child_in_same_group_with_grouping_router(): """Test _apply_stream_groups detects deadlock when GroupingPartitionRouter wraps SubstreamPartitionRouter.""" parent = _make_default_stream("parent_stream") From 68850ec33edb4977dc84f27df19445fb25525541 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Thu, 5 Mar 2026 10:07:57 +0000 Subject: [PATCH 27/31] style: alphabetize StreamGroup and BlockSimultaneousSyncsAction in schema definitions Co-Authored-By: unknown <> --- .../declarative_component_schema.yaml | 74 +++++++++---------- .../models/declarative_component_schema.py | 34 ++++----- 2 files changed, 54 insertions(+), 54 deletions(-) diff --git a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml index 8d0d8d4fa..0f357e9c6 100644 --- a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml +++ b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml @@ -72,43 +72,6 @@ properties: description: A description of the connector. It will be presented on the Source documentation page. additionalProperties: false definitions: - StreamGroup: - title: Stream Group - description: > - A group of streams that share a common resource and should not be read simultaneously. - Streams in the same group will be blocked from concurrent reads based on the specified action. - type: object - required: - - streams - - action - properties: - streams: - title: Streams - description: > - List of references to streams that belong to this group. 
- type: array - items: - anyOf: - - "$ref": "#/definitions/DeclarativeStream" - action: - title: Action - description: The action to apply to streams in this group. - "$ref": "#/definitions/BlockSimultaneousSyncsAction" - BlockSimultaneousSyncsAction: - title: Block Simultaneous Syncs Action - description: > - Action that prevents streams in the same group from being read concurrently. - When applied to a stream group, streams with this action will be deferred if - another stream in the same group is currently active. - This is useful for APIs that don't allow concurrent access to the same - endpoint or session. Only applies to ConcurrentDeclarativeSource. - type: object - required: - - type - properties: - type: - type: string - enum: [BlockSimultaneousSyncsAction] AddedFieldDefinition: title: Definition Of Field To Add description: Defines the field to add on a record. @@ -312,6 +275,21 @@ definitions: $parameters: type: object additionalProperties: true + BlockSimultaneousSyncsAction: + title: Block Simultaneous Syncs Action + description: > + Action that prevents streams in the same group from being read concurrently. + When applied to a stream group, streams with this action will be deferred if + another stream in the same group is currently active. + This is useful for APIs that don't allow concurrent access to the same + endpoint or session. Only applies to ConcurrentDeclarativeSource. + type: object + required: + - type + properties: + type: + type: string + enum: [BlockSimultaneousSyncsAction] SelectiveAuthenticator: title: Selective Authenticator description: Authenticator that selects concrete authenticator based on config property. @@ -4172,6 +4150,28 @@ definitions: - "$ref": "#/definitions/ConfigRemoveFields" - "$ref": "#/definitions/CustomConfigTransformation" default: [] + StreamGroup: + title: Stream Group + description: > + A group of streams that share a common resource and should not be read simultaneously. 
+ Streams in the same group will be blocked from concurrent reads based on the specified action. + type: object + required: + - streams + - action + properties: + streams: + title: Streams + description: > + List of references to streams that belong to this group. + type: array + items: + anyOf: + - "$ref": "#/definitions/DeclarativeStream" + action: + title: Action + description: The action to apply to streams in this group. + "$ref": "#/definitions/BlockSimultaneousSyncsAction" SubstreamPartitionRouter: title: Substream Partition Router description: Partition router that is used to retrieve records that have been partitioned according to records from the specified parent streams. An example of a parent stream is automobile brands and the substream would be the various car models associated with each branch. diff --git a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py index 29a2c7632..6258c895e 100644 --- a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py +++ b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py @@ -46,6 +46,10 @@ class BearerAuthenticator(BaseModel): parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters") +class BlockSimultaneousSyncsAction(BaseModel): + type: Literal["BlockSimultaneousSyncsAction"] + + class DynamicStreamCheckConfig(BaseModel): type: Literal["DynamicStreamCheckConfig"] dynamic_stream_name: str = Field( @@ -2317,23 +2321,6 @@ class Config: ) -class BlockSimultaneousSyncsAction(BaseModel): - type: Literal["BlockSimultaneousSyncsAction"] - - -class StreamGroup(BaseModel): - streams: List[str] = Field( - ..., - description='List of references to streams that belong to this group. 
Use JSON references to stream definitions (e.g., "#/definitions/my_stream").', - title="Streams", - ) - action: BlockSimultaneousSyncsAction = Field( - ..., - description="The action to apply to streams in this group.", - title="Action", - ) - - class Spec(BaseModel): type: Literal["Spec"] connection_specification: Dict[str, Any] = Field( @@ -3080,6 +3067,19 @@ class AsyncRetriever(BaseModel): parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters") +class StreamGroup(BaseModel): + streams: List[str] = Field( + ..., + description='List of references to streams that belong to this group. Use JSON references to stream definitions (e.g., "#/definitions/my_stream").', + title="Streams", + ) + action: BlockSimultaneousSyncsAction = Field( + ..., + description="The action to apply to streams in this group.", + title="Action", + ) + + class SubstreamPartitionRouter(BaseModel): type: Literal["SubstreamPartitionRouter"] parent_stream_configs: List[ParentStreamConfig] = Field( From b183f80558bd5961f399aa9fa293aaa1c016da9c Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Thu, 5 Mar 2026 10:16:12 +0000 Subject: [PATCH 28/31] style: move BlockSimultaneousSyncsAction next to StreamGroup for easier reading Co-Authored-By: unknown <> --- .../declarative_component_schema.yaml | 30 +++++++++---------- .../models/declarative_component_schema.py | 8 ++--- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml index 0f357e9c6..df972fe71 100644 --- a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml +++ b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml @@ -275,21 +275,6 @@ definitions: $parameters: type: object additionalProperties: true - BlockSimultaneousSyncsAction: - title: Block Simultaneous Syncs Action - description: > - Action that 
prevents streams in the same group from being read concurrently. - When applied to a stream group, streams with this action will be deferred if - another stream in the same group is currently active. - This is useful for APIs that don't allow concurrent access to the same - endpoint or session. Only applies to ConcurrentDeclarativeSource. - type: object - required: - - type - properties: - type: - type: string - enum: [BlockSimultaneousSyncsAction] SelectiveAuthenticator: title: Selective Authenticator description: Authenticator that selects concrete authenticator based on config property. @@ -4172,6 +4157,21 @@ definitions: title: Action description: The action to apply to streams in this group. "$ref": "#/definitions/BlockSimultaneousSyncsAction" + BlockSimultaneousSyncsAction: + title: Block Simultaneous Syncs Action + description: > + Action that prevents streams in the same group from being read concurrently. + When applied to a stream group, streams with this action will be deferred if + another stream in the same group is currently active. + This is useful for APIs that don't allow concurrent access to the same + endpoint or session. Only applies to ConcurrentDeclarativeSource. + type: object + required: + - type + properties: + type: + type: string + enum: [BlockSimultaneousSyncsAction] SubstreamPartitionRouter: title: Substream Partition Router description: Partition router that is used to retrieve records that have been partitioned according to records from the specified parent streams. An example of a parent stream is automobile brands and the substream would be the various car models associated with each branch. 
diff --git a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py index 6258c895e..96451365c 100644 --- a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py +++ b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py @@ -46,10 +46,6 @@ class BearerAuthenticator(BaseModel): parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters") -class BlockSimultaneousSyncsAction(BaseModel): - type: Literal["BlockSimultaneousSyncsAction"] - - class DynamicStreamCheckConfig(BaseModel): type: Literal["DynamicStreamCheckConfig"] dynamic_stream_name: str = Field( @@ -3067,6 +3063,10 @@ class AsyncRetriever(BaseModel): parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters") +class BlockSimultaneousSyncsAction(BaseModel): + type: Literal["BlockSimultaneousSyncsAction"] + + class StreamGroup(BaseModel): streams: List[str] = Field( ..., From e1bd626e6eae4c436674d950ecad2ca3e8b10c81 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Fri, 6 Mar 2026 22:28:09 +0000 Subject: [PATCH 29/31] fix: add default HTTP request timeout to prevent indefinite hangs Add default timeout of (30s connect, 300s read) to HttpClient.send_request(). When no explicit timeout is provided in request_kwargs, the default is injected before sending the request. This prevents requests.Session.send() from blocking indefinitely when a server stalls mid-response (e.g. after a 500 error retry). ConnectTimeout and ReadTimeout are already in TRANSIENT_EXCEPTIONS, so timeouts trigger automatic retries with exponential backoff. 
Co-Authored-By: alfredo.garcia@airbyte.io --- .../sources/streams/http/http_client.py | 12 ++++- .../sources/streams/http/test_http_client.py | 49 +++++++++++++++++++ 2 files changed, 59 insertions(+), 2 deletions(-) diff --git a/airbyte_cdk/sources/streams/http/http_client.py b/airbyte_cdk/sources/streams/http/http_client.py index 3a0a62739..a2dd67c46 100644 --- a/airbyte_cdk/sources/streams/http/http_client.py +++ b/airbyte_cdk/sources/streams/http/http_client.py @@ -85,6 +85,8 @@ def monkey_patched_get_item(self, key): # type: ignore # this interface is a co class HttpClient: _DEFAULT_MAX_RETRY: int = 5 _DEFAULT_MAX_TIME: int = 60 * 10 + _DEFAULT_CONNECT_TIMEOUT: int = 30 + _DEFAULT_READ_TIMEOUT: int = 300 _ACTIONS_TO_RETRY_ON = { ResponseAction.RETRY, ResponseAction.RATE_LIMITED, @@ -586,11 +588,17 @@ def send_request( verify=request_kwargs.get("verify"), cert=request_kwargs.get("cert"), ) - request_kwargs = {**request_kwargs, **env_settings} + mutable_request_kwargs: Dict[str, Any] = {**request_kwargs, **env_settings} + + if "timeout" not in mutable_request_kwargs: + mutable_request_kwargs["timeout"] = ( + self._DEFAULT_CONNECT_TIMEOUT, + self._DEFAULT_READ_TIMEOUT, + ) response: requests.Response = self._send_with_retry( request=request, - request_kwargs=request_kwargs, + request_kwargs=mutable_request_kwargs, log_formatter=log_formatter, exit_on_rate_limit=exit_on_rate_limit, ) diff --git a/unit_tests/sources/streams/http/test_http_client.py b/unit_tests/sources/streams/http/test_http_client.py index ea245c2fb..1d30aa21c 100644 --- a/unit_tests/sources/streams/http/test_http_client.py +++ b/unit_tests/sources/streams/http/test_http_client.py @@ -1059,3 +1059,52 @@ def update_response(*args, **kwargs): assert mock_authenticator.access_token == "new_refreshed_token" assert returned_response == valid_response assert call_count == 2 + + +def test_send_request_applies_default_timeout_when_not_provided(mocker): + http_client = test_http_client() + mocked_response 
= MagicMock(spec=requests.Response) + mocked_response.status_code = 200 + mocked_response.headers = {} + mock_send = mocker.patch.object(requests.Session, "send", return_value=mocked_response) + + http_client.send_request( + http_method="get", + url="https://test_base_url.com/v1/endpoint", + request_kwargs={}, + ) + + assert mock_send.call_count == 1 + call_kwargs = mock_send.call_args + # The timeout should be passed as part of the keyword arguments to session.send() + # session.send(request, **request_kwargs) unpacks request_kwargs, so timeout appears as a kwarg + assert call_kwargs.kwargs.get("timeout") == ( + HttpClient._DEFAULT_CONNECT_TIMEOUT, + HttpClient._DEFAULT_READ_TIMEOUT, + ) or call_kwargs[1].get("timeout") == ( + HttpClient._DEFAULT_CONNECT_TIMEOUT, + HttpClient._DEFAULT_READ_TIMEOUT, + ) + + +def test_send_request_respects_explicit_timeout(mocker): + http_client = test_http_client() + mocked_response = MagicMock(spec=requests.Response) + mocked_response.status_code = 200 + mocked_response.headers = {} + mock_send = mocker.patch.object(requests.Session, "send", return_value=mocked_response) + + custom_timeout = (10, 60) + http_client.send_request( + http_method="get", + url="https://test_base_url.com/v1/endpoint", + request_kwargs={"timeout": custom_timeout}, + ) + + assert mock_send.call_count == 1 + call_kwargs = mock_send.call_args + # The explicit timeout should be preserved, not overridden by the default + assert ( + call_kwargs.kwargs.get("timeout") == custom_timeout + or call_kwargs[1].get("timeout") == custom_timeout + ) From 29951b22be767748311af973d4a75bf3d7074d35 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Fri, 6 Mar 2026 22:44:17 +0000 Subject: [PATCH 30/31] fix: match test assertions against AirbyteTracedException message field The pytest.raises(match=...) regex matches against the exception's string representation, which uses the 'message' field, not 'internal_message'. 
Updated the two is_done safety check tests to match against the correct message text. Co-Authored-By: alfredo.garcia@airbyte.io --- .../streams/concurrent/test_concurrent_read_processor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py index acfa03129..e2df66b37 100644 --- a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +++ b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py @@ -1441,7 +1441,7 @@ def test_is_done_raises_when_partition_generation_queue_not_empty(): # Artificially mark the stream as done without removing it from the partition generation queue handler._streams_done.add("stuck_stream") - with pytest.raises(AirbyteTracedException, match="remained in the partition generation queue"): + with pytest.raises(AirbyteTracedException, match="Partition generation queue is not empty"): handler.is_done() @@ -1480,7 +1480,7 @@ def test_is_done_raises_when_active_groups_not_empty(): handler._active_groups["my_group"] = {"stuck_stream"} with pytest.raises( - AirbyteTracedException, match="still active after all streams were marked done" + AirbyteTracedException, match="Active stream groups are not empty" ): handler.is_done() From b0537da95b9e25a70bcbf6e3cafe2d7b88e23907 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Fri, 6 Mar 2026 22:46:01 +0000 Subject: [PATCH 31/31] style: fix ruff format in test_concurrent_read_processor.py Co-Authored-By: alfredo.garcia@airbyte.io --- .../streams/concurrent/test_concurrent_read_processor.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py index e2df66b37..d4755c5a0 100644 --- 
a/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +++ b/unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py @@ -1479,9 +1479,7 @@ def test_is_done_raises_when_active_groups_not_empty(): handler._stream_instances_to_start_partition_generation.clear() handler._active_groups["my_group"] = {"stuck_stream"} - with pytest.raises( - AirbyteTracedException, match="Active stream groups are not empty" - ): + with pytest.raises(AirbyteTracedException, match="Active stream groups are not empty"): handler.is_done()