@@ -309,3 +309,75 @@ TEST(TensorCUDATest, TrilBf16) {
309309 exp = {1 , 1 , 0 , 0 , 1 , 1 , 1 , 0 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 };
310310 tensor_is_close<bfloat16>(diag.span (), std::span (exp));
311311}
312+
313+ TEST (TensorCUDATest, SliceBf16FirstDim) {
314+ SKIP_IF_NO_GPU ();
315+ // Tensor shape {4, 3}: 4 rows, 3 cols
316+ // Data: row0=[1,2,3], row1=[4,5,6], row2=[7,8,9], row3=[10,11,12]
317+ Tensor<bfloat16, CPU> tensor ({4 , 3 });
318+ for (int i = 0 ; i < 12 ; ++i) {
319+ tensor.set_ (i, bfloat16 (i + 1 ));
320+ }
321+
322+ auto gpu_tensor = tensor.cuda ();
323+
324+ // Slice rows 1 to 3 (exclusive), so rows 1 and 2
325+ Tensor<bfloat16, CUDA> result = slice (gpu_tensor.view (), 0 , 1 , 3 );
326+
327+ auto result_cpu = result.cpu ();
328+
329+ Shape expected_shape = {2 , 3 };
330+ EXPECT_EQ (result_cpu.shape (), expected_shape);
331+
332+ // Expected: row1=[4,5,6], row2=[7,8,9]
333+ std::vector<bfloat16> exp = {4 , 5 , 6 , 7 , 8 , 9 };
334+ tensor_is_close<bfloat16>(result_cpu.span (), std::span (exp));
335+ }
336+
337+ TEST (TensorCUDATest, SliceBf16LastDim) {
338+ SKIP_IF_NO_GPU ();
339+ // Tensor shape {2, 6}
340+ // Data: row0=[1,2,3,4,5,6], row1=[7,8,9,10,11,12]
341+ Tensor<bfloat16, CPU> tensor ({2 , 6 });
342+ for (int i = 0 ; i < 12 ; ++i) {
343+ tensor.set_ (i, bfloat16 (i + 1 ));
344+ }
345+
346+ auto gpu_tensor = tensor.cuda ();
347+
348+ // Slice cols 2 to 5 (exclusive), so cols 2, 3, 4
349+ Tensor<bfloat16, CUDA> result = slice (gpu_tensor.view (), 1 , 2 , 5 );
350+
351+ auto result_cpu = result.cpu ();
352+
353+ Shape expected_shape = {2 , 3 };
354+ EXPECT_EQ (result_cpu.shape (), expected_shape);
355+
356+ // Expected: row0=[3,4,5], row1=[9,10,11]
357+ std::vector<bfloat16> exp = {3 , 4 , 5 , 9 , 10 , 11 };
358+ tensor_is_close<bfloat16>(result_cpu.span (), std::span (exp));
359+ }
360+
361+ TEST (TensorCUDATest, SliceBf16MiddleDim) {
362+ SKIP_IF_NO_GPU ();
363+ // Tensor shape {2, 4, 3}: 2 batches, 4 rows, 3 cols
364+ Tensor<bfloat16, CPU> tensor ({2 , 4 , 3 });
365+ for (int i = 0 ; i < 24 ; ++i) {
366+ tensor.set_ (i, bfloat16 (i + 1 ));
367+ }
368+
369+ auto gpu_tensor = tensor.cuda ();
370+
371+ // Slice dim 1 (rows) from 1 to 3, keeping 2 rows
372+ Tensor<bfloat16, CUDA> result = slice (gpu_tensor.view (), 1 , 1 , 3 );
373+
374+ auto result_cpu = result.cpu ();
375+
376+ Shape expected_shape = {2 , 2 , 3 };
377+ EXPECT_EQ (result_cpu.shape (), expected_shape);
378+
379+ // Batch 0: rows 1-2 = [4,5,6, 7,8,9]
380+ // Batch 1: rows 1-2 = [16,17,18, 19,20,21]
381+ std::vector<bfloat16> exp = {4 , 5 , 6 , 7 , 8 , 9 , 16 , 17 , 18 , 19 , 20 , 21 };
382+ tensor_is_close<bfloat16>(result_cpu.span (), std::span (exp));
383+ }
0 commit comments