1515"""
1616
1717import pytest
18+ import pyarrow as pa
1819from unittest .mock import MagicMock , Mock , patch
1920from pyiceberg .table import DataScan
2021from pyiceberg .expressions import AlwaysTrue
22+ from pyiceberg .schema import Schema
23+ from pyiceberg .types import NestedField , StringType , IntegerType , BooleanType
2124
2225
2326class DummyFile :
@@ -60,6 +63,7 @@ def test_count_basic():
6063 """
6164 # Create a mock table with the necessary attributes
6265 scan = Mock (spec = DataScan )
66+ scan .limit = None # Add the limit attribute for our fix
6367
6468 # Mock the plan_files method to return our dummy task
6569 task = DummyTask (42 , residual = AlwaysTrue (), delete_files = [])
@@ -87,6 +91,7 @@ def test_count_empty():
8791 """
8892 # Create a mock table with the necessary attributes
8993 scan = Mock (spec = DataScan )
94+ scan .limit = None # Add the limit attribute for our fix
9095
9196 # Mock the plan_files method to return no tasks
9297 scan .plan_files = MagicMock (return_value = [])
@@ -114,6 +119,7 @@ def test_count_large():
114119 """
115120 # Create a mock table with the necessary attributes
116121 scan = Mock (spec = DataScan )
122+ scan .limit = None # Add the limit attribute for our fix
117123
118124 # Mock the plan_files method to return multiple tasks
119125 tasks = [
@@ -126,4 +132,123 @@ def test_count_large():
126132 from pyiceberg .table import DataScan as ActualDataScan
127133 scan .count = ActualDataScan .count .__get__ (scan , ActualDataScan )
128134
129- assert scan .count () == 1000000
135+ assert scan .count () == 1000000
136+
137+
def test_count_with_limit_mock():
    """
    Check that count() honours the scan limit, using mocked data only.

    The real ``DataScan.count`` is bound to a mocked scan and exercised in
    three scenarios: a limit below the total record count, a limit above
    it, and a limit exactly equal to it. No integration services are
    required.
    """
    from pyiceberg.table import DataScan as ActualDataScan

    def limited_count(limit, record_counts):
        # Build a mocked scan with the given limit whose planned tasks carry
        # the given record counts, then run the real count() against it.
        mocked = Mock(spec=DataScan)
        mocked.limit = limit
        planned = [
            DummyTask(records, residual=AlwaysTrue(), delete_files=[])
            for records in record_counts
        ]
        mocked.plan_files = MagicMock(return_value=planned)
        mocked.count = ActualDataScan.count.__get__(mocked, ActualDataScan)
        return mocked.count()

    # Scenario 1: limit (5) is smaller than the 9 records available.
    below_total = limited_count(5, (3, 4, 2))
    assert below_total == 5, f"Expected count to respect limit=5, got {below_total}"

    # Scenario 2: limit (15) is larger than the 5 records available.
    above_total = limited_count(15, (3, 2))
    assert above_total == 5, f"Expected count=5 (all available), got {above_total} with limit=15"

    # Scenario 3: limit (7) matches the 7 records available exactly.
    exact_total = limited_count(7, (4, 3))
    assert exact_total == 7, f"Expected count=7 (exact limit), got {exact_total}"
190+
def test_datascan_count_respects_limit(session_catalog):
    """
    Test that DataScan.count() respects the limit parameter.

    This test verifies the fix for issue #2121 where count() was ignoring
    the limit and returning the total table row count instead of being
    bounded by the scan limit.

    A uniquely named table is created through ``session_catalog``, filled
    with 10 rows, and counted with limits of 3, 20, none, and 1. The table
    is dropped again afterwards, whether or not the assertions pass.
    """
    import uuid

    # Create a simple schema
    schema = Schema(
        NestedField(1, "str", StringType(), required=False),
        NestedField(2, "int", IntegerType(), required=False),
        NestedField(3, "bool", BooleanType(), required=False),
    )

    # Use a unique table name to avoid conflicts
    table_name = f"default.test_limit_{uuid.uuid4().hex[:8]}"

    def drop_table_quietly():
        # Best-effort drop: the table may not exist, and a failed drop must
        # not mask the real test outcome. Only Exception is caught so that
        # KeyboardInterrupt / SystemExit still propagate (a bare ``except``
        # would swallow them).
        try:
            session_catalog.drop_table(table_name)
        except Exception:
            pass

    try:
        # Drop any leftover table with the same name before creating it
        drop_table_quietly()

        # Create a table with more rows than our test limits
        table = session_catalog.create_table(table_name, schema=schema)

        # Add 10 rows to ensure we have enough data
        records = [{"str": f"foo{i}", "int": i, "bool": True} for i in range(10)]
        table.append(pa.Table.from_pylist(records, schema=table.schema().as_arrow()))

        # Test Case 1: Basic limit functionality
        count_3 = table.scan(limit=3).count()
        assert count_3 == 3, f"Expected count to respect limit=3, got {count_3}"

        # Test Case 2: Limit larger than table size
        count_20 = table.scan(limit=20).count()
        assert count_20 == 10, f"Expected count=10 (all rows), got {count_20} with limit=20"

        # Test Case 3: No limit should return all rows
        count_all = table.scan().count()
        assert count_all == 10, f"Expected count=10 (all rows), got {count_all} without limit"

        # Test Case 4: Edge case - limit of 1
        count_1 = table.scan(limit=1).count()
        assert count_1 == 1, f"Expected count to respect limit=1, got {count_1}"

    finally:
        # Clean up the test table
        drop_table_quietly()
0 commit comments