From 2bb4e8eee2d7eb1440632bdf2a3c93d395ef97bc Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Thu, 18 Jun 2026 07:09:05 -0400 Subject: [PATCH 1/5] Fix IndexError in changefeed when listing directory marker blobs --- .../CHANGELOG.md | 5 +++++ .../azure/storage/blob/changefeed/_models.py | 22 ++++++++++++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/sdk/storage/azure-storage-blob-changefeed/CHANGELOG.md b/sdk/storage/azure-storage-blob-changefeed/CHANGELOG.md index a4c79c82550c..7f7946025f93 100644 --- a/sdk/storage/azure-storage-blob-changefeed/CHANGELOG.md +++ b/sdk/storage/azure-storage-blob-changefeed/CHANGELOG.md @@ -6,6 +6,11 @@ This version and all future versions will require Python 3.9+. Python 3.8 is no ### Features Added +### Bugs Fixed +- Fixed an `IndexError` that occurred when listing change feed events on accounts where the +`$blobchangefeed/idx/segments/` hierarchy contains directory marker blobs (e.g. +`idx/segments/2026/02/20`). Such non-segment paths are now skipped instead of being parsed +as segment files. ## 12.0.0b5 (2024-04-16) diff --git a/sdk/storage/azure-storage-blob-changefeed/azure/storage/blob/changefeed/_models.py b/sdk/storage/azure-storage-blob-changefeed/azure/storage/blob/changefeed/_models.py index fe3fc1b4378e..dd8a5a5da31a 100644 --- a/sdk/storage/azure-storage-blob-changefeed/azure/storage/blob/changefeed/_models.py +++ b/sdk/storage/azure-storage-blob-changefeed/azure/storage/blob/changefeed/_models.py @@ -282,7 +282,10 @@ def _get_segment_paths(self, start_year=""): while not start_year or start_year <= cur_year: paths = self.client.list_blobs(name_starts_with=SEGMENT_COMMON_PATH + str(start_year)) for path in paths: - yield path.name + # Skip directory marker blobs that does not conform to the expected segment path shape. + # Azure Storage can return zero-length directory markers that are not real segment files. 
+ if self._is_valid_segment_path(path.name): + yield path.name # if not searching by prefix, all paths would have been iterated already, so it"s time to yield None if not start_year: @@ -291,6 +294,23 @@ def _get_segment_paths(self, start_year=""): start_year += 1 yield None + @staticmethod + def _is_valid_segment_path(segment_path): + # A valid segment path is of the form "idx/segments/YYYY/MM/DD/HHMM/". + # Directory marker blobs (e.g. "idx/segments/2026/02/20") have too few tokens to + # represent a segment and must be skipped to avoid an IndexError while parsing. + path_tokens = segment_path.split(PATH_DELIMITER) + if len(path_tokens) < 6: + return False + try: + int(path_tokens[2]) # year + int(path_tokens[3]) # month + int(path_tokens[4]) # day + int(path_tokens[5][:2]) # hour (from HHMM) + except (ValueError, IndexError): + return False + return True + @staticmethod def _parse_datetime_from_segment_path(segment_path): path_tokens = segment_path.split("/") From 082ebb58df388aa3fce1e5ecfed28341933d1e8d Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Thu, 18 Jun 2026 15:30:31 -0400 Subject: [PATCH 2/5] Copilot feedback --- .../azure/storage/blob/changefeed/_models.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/sdk/storage/azure-storage-blob-changefeed/azure/storage/blob/changefeed/_models.py b/sdk/storage/azure-storage-blob-changefeed/azure/storage/blob/changefeed/_models.py index dd8a5a5da31a..390195e1666a 100644 --- a/sdk/storage/azure-storage-blob-changefeed/azure/storage/blob/changefeed/_models.py +++ b/sdk/storage/azure-storage-blob-changefeed/azure/storage/blob/changefeed/_models.py @@ -282,7 +282,7 @@ def _get_segment_paths(self, start_year=""): while not start_year or start_year <= cur_year: paths = self.client.list_blobs(name_starts_with=SEGMENT_COMMON_PATH + str(start_year)) for path in paths: - # Skip directory marker blobs that does not conform to the expected segment path shape. 
+ # Skip directory marker blobs that do not conform to the expected segment path shape. # Azure Storage can return zero-length directory markers that are not real segment files. if self._is_valid_segment_path(path.name): yield path.name @@ -296,11 +296,10 @@ def _get_segment_paths(self, start_year=""): @staticmethod def _is_valid_segment_path(segment_path): - # A valid segment path is of the form "idx/segments/YYYY/MM/DD/HHMM/". - # Directory marker blobs (e.g. "idx/segments/2026/02/20") have too few tokens to - # represent a segment and must be skipped to avoid an IndexError while parsing. path_tokens = segment_path.split(PATH_DELIMITER) - if len(path_tokens) < 6: + + # Expected: idx/segments/YYYY/MM/DD/HHMM/ + if len(path_tokens) < 7 or not path_tokens[6]: return False try: int(path_tokens[2]) # year From fbfd2a881adc23daa060b4db1d46a2337f2b7696 Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Thu, 18 Jun 2026 15:39:45 -0400 Subject: [PATCH 3/5] Test - in progress --- .../tests/test_change_feed.py | 71 +++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/sdk/storage/azure-storage-blob-changefeed/tests/test_change_feed.py b/sdk/storage/azure-storage-blob-changefeed/tests/test_change_feed.py index f3102b49057d..8e96391d4fe3 100644 --- a/sdk/storage/azure-storage-blob-changefeed/tests/test_change_feed.py +++ b/sdk/storage/azure-storage-blob-changefeed/tests/test_change_feed.py @@ -8,6 +8,8 @@ from datetime import datetime, timedelta from math import ceil from time import sleep +from types import SimpleNamespace +from unittest.mock import Mock import pytest @@ -23,11 +25,80 @@ # Then uncomment this import and comment out the other. 
# from changefeed import ChangeFeedClient from azure.storage.blob.changefeed import ChangeFeedClient +from azure.storage.blob.changefeed._models import ChangeFeed + + +def _container_client_returning(blob_names): + """A mock ContainerClient whose list_blobs() yields blobs with the given names.""" + client = Mock() + client.list_blobs.return_value = [SimpleNamespace(name=name) for name in blob_names] + return client @pytest.mark.playback_test_only class TestStorageChangeFeed(StorageRecordedTestCase): + @pytest.mark.parametrize( + "segment_path", + [ + "idx/segments/2026/02/20/0000/meta.json", + "idx/segments/2022/11/28/2300/meta.json", + "idx/segments/1601/01/01/0000/meta.json", + ], + ) + def test_valid_segment_path_is_accepted(self, segment_path): + assert ChangeFeed._is_valid_segment_path(segment_path) is True + + @pytest.mark.parametrize( + "segment_path", + [ + "idx/segments/2026/02/20", # day-level directory marker (the reported crash) + "idx/segments/2026/02/20/0000", # minute-level directory marker + "idx/segments/2026/02", # month-level directory marker + "idx/segments/2026", # year-level directory marker + "idx/segments", # prefix only + "idx/segments/2026/02/20/0000/", # trailing slash -> empty file token + "idx/segments/abcd/02/20/0000/meta.json", # non-numeric year + ], + ) + def test_directory_marker_or_malformed_path_is_rejected(self, segment_path): + assert ChangeFeed._is_valid_segment_path(segment_path) is False + + def test_parse_datetime_from_valid_segment_path(self): + assert ChangeFeed._parse_datetime_from_segment_path( + "idx/segments/2026/02/20/0000/meta.json" + ) == datetime(2026, 2, 20, 0) + + def test_get_segment_paths_skips_directory_markers(self): + blob_names = [ + "idx/segments/2026/02/20", # day-level marker + "idx/segments/2026/02/20/0000", # minute-level marker + "idx/segments/2026/02/20/0000/meta.json", # real segment + "idx/segments/2026/02/20/0100/meta.json", # real segment + ] + change_feed = ChangeFeed.__new__(ChangeFeed) + 
change_feed.client = _container_client_returning(blob_names) + change_feed.end_time = None + + results = list(change_feed._get_segment_paths(start_year="")) + + # The generator yields a trailing None sentinel to signal "no more segments". + assert results[-1] is None + yielded_segments = [path for path in results if path is not None] + assert yielded_segments == [ + "idx/segments/2026/02/20/0000/meta.json", + "idx/segments/2026/02/20/0100/meta.json", + ] + + def test_get_segment_paths_does_not_raise_on_directory_markers(self): + blob_names = ["idx/segments/2026/02/20", "idx/segments/2026/02/20/0000/meta.json"] + change_feed = ChangeFeed.__new__(ChangeFeed) + change_feed.client = _container_client_returning(blob_names) + change_feed.end_time = None + + yielded_segments = [path for path in change_feed._get_segment_paths(start_year="") if path] + assert yielded_segments == ["idx/segments/2026/02/20/0000/meta.json"] + # --Test cases for change feed ----------------------------------------- @ChangeFeedPreparer() @recorded_by_proxy From f541f80b6c54eec950a2042342f8782a479893bd Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Thu, 18 Jun 2026 15:49:20 -0400 Subject: [PATCH 4/5] Tests --- .../tests/test_change_feed.py | 25 +++++++++---------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/sdk/storage/azure-storage-blob-changefeed/tests/test_change_feed.py b/sdk/storage/azure-storage-blob-changefeed/tests/test_change_feed.py index 8e96391d4fe3..7affbb71d13b 100644 --- a/sdk/storage/azure-storage-blob-changefeed/tests/test_change_feed.py +++ b/sdk/storage/azure-storage-blob-changefeed/tests/test_change_feed.py @@ -8,8 +8,7 @@ from datetime import datetime, timedelta from math import ceil from time import sleep -from types import SimpleNamespace -from unittest.mock import Mock +from unittest.mock import Mock, patch import pytest @@ -28,11 +27,15 @@ from azure.storage.blob.changefeed._models import ChangeFeed -def _container_client_returning(blob_names): - """A 
mock ContainerClient whose list_blobs() yields blobs with the given names.""" - client = Mock() - client.list_blobs.return_value = [SimpleNamespace(name=name) for name in blob_names] - return client +def _build_change_feed(blob_names): + """Build a ChangeFeed backed by a mock client, skipping the network-bound _initialize().""" + blobs = [Mock() for _ in blob_names] + for blob, name in zip(blobs, blob_names): + blob.name = name + client = Mock() + client.list_blobs.return_value = blobs + with patch.object(ChangeFeed, "_initialize"): + return ChangeFeed(client, page_size=100) @pytest.mark.playback_test_only @@ -76,9 +79,7 @@ def test_get_segment_paths_skips_directory_markers(self): "idx/segments/2026/02/20/0000/meta.json", # real segment "idx/segments/2026/02/20/0100/meta.json", # real segment ] - change_feed = ChangeFeed.__new__(ChangeFeed) - change_feed.client = _container_client_returning(blob_names) - change_feed.end_time = None + change_feed = _build_change_feed(blob_names) results = list(change_feed._get_segment_paths(start_year="")) @@ -92,9 +93,7 @@ def test_get_segment_paths_skips_directory_markers(self): def test_get_segment_paths_does_not_raise_on_directory_markers(self): blob_names = ["idx/segments/2026/02/20", "idx/segments/2026/02/20/0000/meta.json"] - change_feed = ChangeFeed.__new__(ChangeFeed) - change_feed.client = _container_client_returning(blob_names) - change_feed.end_time = None + change_feed = _build_change_feed(blob_names) yielded_segments = [path for path in change_feed._get_segment_paths(start_year="") if path] assert yielded_segments == ["idx/segments/2026/02/20/0000/meta.json"] From 66f027c0fd83b9644f5abc34591be3e6ca333445 Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Thu, 18 Jun 2026 19:17:18 -0400 Subject: [PATCH 5/5] PR feedback --- .../azure/storage/blob/changefeed/_models.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/sdk/storage/azure-storage-blob-changefeed/azure/storage/blob/changefeed/_models.py 
b/sdk/storage/azure-storage-blob-changefeed/azure/storage/blob/changefeed/_models.py
index 390195e1666a..1baacbbd99ae 100644
--- a/sdk/storage/azure-storage-blob-changefeed/azure/storage/blob/changefeed/_models.py
+++ b/sdk/storage/azure-storage-blob-changefeed/azure/storage/blob/changefeed/_models.py
@@ -299,16 +299,7 @@ def _is_valid_segment_path(segment_path):
         path_tokens = segment_path.split(PATH_DELIMITER)
 
         # Expected: idx/segments/YYYY/MM/DD/HHMM/
-        if len(path_tokens) < 7 or not path_tokens[6]:
-            return False
-        try:
-            int(path_tokens[2])  # year
-            int(path_tokens[3])  # month
-            int(path_tokens[4])  # day
-            int(path_tokens[5][:2])  # hour (from HHMM)
-        except (ValueError, IndexError):
-            return False
-        return True
+        return len(path_tokens) >= 7 and bool(path_tokens[6])  # strict bool, not the file-name token
 
     @staticmethod
     def _parse_datetime_from_segment_path(segment_path):