From 37f7f4572ef8ca7864760826de9596e0cdb5d59d Mon Sep 17 00:00:00 2001
From: Yufeng He <40085740+he-yufeng@users.noreply.github.com>
Date: Wed, 13 May 2026 14:21:27 +0800
Subject: [PATCH] fix: count text chunker orphan glue by tokens

---
Notes (review; this region is not part of the commit message):

  * TextChunker.cs, ProcessParagraphs hunk. The removed lines decided
    whether to merge the last paragraph into the second-to-last one by
    adding the two whitespace-split word counts and comparing the sum
    with adjustedMaxTokensPerParagraph. The added lines first join both
    word arrays with single spaces into combinedParagraph, then compare
    GetTokenCount(combinedParagraph, tokenCounter) with the same limit,
    and store that same joined string when the merge happens.

  * TextChunkerTests.cs hunk. The new test passes two lines (14 'a'
    characters and "bbb"), a limit of 16 and a tokenCounter that returns
    the string length. Joined with a space the two lines would be 18
    characters long, so the test expects two paragraphs, each at most
    16 characters.

  * NOTE(review): Concat(...) on the two string arrays presumably relies
    on System.Linq already being imported by TextChunker.cs; this patch
    adds no using directive -- confirm against the target file.

  * NOTE(review): the line breaks and indentation below were restored
    from a whitespace-collapsed copy of this patch. Indentation follows
    the 4-space nesting implied by the code and has not been checked
    against the target files; confirm it, or apply with
    `git apply --ignore-whitespace`.

 dotnet/src/SemanticKernel.Core/Text/TextChunker.cs    | 10 +++-------
 .../SemanticKernel.UnitTests/Text/TextChunkerTests.cs | 11 +++++++++++
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/dotnet/src/SemanticKernel.Core/Text/TextChunker.cs b/dotnet/src/SemanticKernel.Core/Text/TextChunker.cs
index d8f4a32b4e3c..c4cf1ea9383c 100644
--- a/dotnet/src/SemanticKernel.Core/Text/TextChunker.cs
+++ b/dotnet/src/SemanticKernel.Core/Text/TextChunker.cs
@@ -195,15 +195,11 @@ private static List ProcessParagraphs(List paragraphs, int adjus
                 var lastParagraphTokens = lastParagraph.Split(s_spaceChar, StringSplitOptions.RemoveEmptyEntries);
                 var secondLastParagraphTokens = secondLastParagraph.Split(s_spaceChar, StringSplitOptions.RemoveEmptyEntries);
 
-                var lastParagraphTokensCount = lastParagraphTokens.Length;
-                var secondLastParagraphTokensCount = secondLastParagraphTokens.Length;
+                var combinedParagraph = string.Join(" ", secondLastParagraphTokens.Concat(lastParagraphTokens));
 
-                if (lastParagraphTokensCount + secondLastParagraphTokensCount <= adjustedMaxTokensPerParagraph)
+                if (GetTokenCount(combinedParagraph, tokenCounter) <= adjustedMaxTokensPerParagraph)
                 {
-                    var newSecondLastParagraph = string.Join(" ", secondLastParagraphTokens);
-                    var newLastParagraph = string.Join(" ", lastParagraphTokens);
-
-                    paragraphs[paragraphs.Count - 2] = $"{newSecondLastParagraph} {newLastParagraph}";
+                    paragraphs[paragraphs.Count - 2] = combinedParagraph;
                     paragraphs.RemoveAt(paragraphs.Count - 1);
                 }
             }
diff --git a/dotnet/src/SemanticKernel.UnitTests/Text/TextChunkerTests.cs b/dotnet/src/SemanticKernel.UnitTests/Text/TextChunkerTests.cs
index a31f077eef66..2098158e0299 100644
--- a/dotnet/src/SemanticKernel.UnitTests/Text/TextChunkerTests.cs
+++ b/dotnet/src/SemanticKernel.UnitTests/Text/TextChunkerTests.cs
@@ -819,4 +819,15 @@ public void SplitPlainTextParagraphsSplitsWhenExceedingTokenLimit()
         Assert.Contains("Second line", combined);
         Assert.Contains("Third line", combined);
     }
+
+    [Fact]
+    public void SplitPlainTextParagraphsDoesNotGlueLastParagraphPastTokenLimit()
+    {
+        var lines = new[] { new string('a', 14), "bbb" };
+
+        var result = TextChunker.SplitPlainTextParagraphs(lines, 16, tokenCounter: input => input.Length);
+
+        Assert.Equal(2, result.Count);
+        Assert.All(result, paragraph => Assert.True(paragraph.Length <= 16));
+    }
 }