Skip to content

Commit 319bdd4

Browse files
JacobCoffee and claude authored
feat: implement smart text chunking for embed fields (#141)
* feat: implement smart text chunking for embed fields Resolves GH #75 - Don't break markdown links in Ruff output. - Add smart_chunk_text() that respects markdown structures - Preserves markdown links, inline code, and code blocks - Prefers natural split points (paragraphs > sentences > newlines) - Update Ruff command to use smart chunking - Add 15 tests for the new chunking function Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> * fix: address PR review for smart chunking - Add max_size validation to prevent infinite loop (raises ValueError if <= 0) - Improve code block test to verify fences stay in same chunk - Add test for max_size validation Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 3a14d45 commit 319bdd4

3 files changed

Lines changed: 252 additions & 4 deletions

File tree

services/bot/src/byte_bot/lib/utils.py

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
"query_all_peps",
3838
"query_all_ruff_rules",
3939
"run_ruff_format",
40+
"smart_chunk_text",
4041
)
4142

4243

@@ -153,6 +154,94 @@ def chunk_sequence[T](sequence: Iterable[T], size: int) -> Iterable[tuple[T, ...
153154
yield chunk
154155

155156

157+
def _find_protected_regions(text: str) -> list[tuple[int, int]]:
158+
"""Find markdown structures that should not be split."""
159+
pattern = re.compile(
160+
r"```[\s\S]*?```" # code blocks
161+
r"|`[^`\n]+`" # inline code
162+
r"|\[[^\]]*\]\([^)]*\)" # markdown links
163+
)
164+
return [(m.start(), m.end()) for m in pattern.finditer(text)]
165+
166+
167+
def _is_position_protected(pos: int, regions: list[tuple[int, int]]) -> bool:
168+
"""Check if a position falls within a protected region."""
169+
return any(start <= pos < end for start, end in regions)
170+
171+
172+
def _find_split_point(
173+
text_segment: str, start_offset: int, max_size: int, protected_regions: list[tuple[int, int]]
174+
) -> int:
175+
"""Find the best split point within max_size that respects protected regions."""
176+
search_start = max(0, max_size - 200)
177+
178+
for sep in ["\n\n", ". ", "! ", "? ", "\n", " "]:
179+
pos = max_size
180+
while pos > search_start:
181+
idx = text_segment.rfind(sep, search_start, pos)
182+
if idx == -1:
183+
break
184+
if not _is_position_protected(start_offset + idx, protected_regions):
185+
return idx + len(sep)
186+
pos = idx
187+
188+
for i in range(max_size, search_start, -1):
189+
if not _is_position_protected(start_offset + i, protected_regions):
190+
return i
191+
192+
return max_size
193+
194+
195+
def smart_chunk_text(text: str, max_size: int = 1000) -> list[str]:
    """Break ``text`` into pieces while keeping markdown structures together.

    Links, inline code and fenced code blocks are treated as protected when a
    split position is chosen. Split positions are preferred in this order:
    paragraph breaks, sentence ends, newlines, then spaces.

    Trailing whitespace is stripped from every chunk except the last, and any
    spaces or newlines directly after a split are skipped, so whitespace at a
    chunk boundary is not preserved. A chunk that is empty after stripping is
    dropped.

    Args:
        text: The text to chunk.
        max_size: Maximum characters per chunk (must be > 0).

    Returns:
        List of text chunks; empty when ``text`` is empty, and a single-element
        list when ``text`` already fits within ``max_size``.

    Raises:
        ValueError: If max_size is not positive.
    """
    if max_size <= 0:
        msg = f"max_size must be positive, got {max_size}"
        raise ValueError(msg)

    if not text:
        return []

    total = len(text)
    if total <= max_size:
        return [text]

    # Region bounds are computed once, in full-text coordinates.
    regions = _find_protected_regions(text)
    pieces: list[str] = []
    cursor = 0

    while cursor < total:
        tail = text[cursor:]

        # Whatever is left fits in one chunk: emit it untouched and stop.
        if len(tail) <= max_size:
            pieces.append(tail)
            break

        cut = _find_split_point(tail, cursor, max_size, regions)
        piece = tail[:cut].rstrip()
        if piece:
            pieces.append(piece)

        # Advance past the chunk, then past any spaces/newlines that follow it
        # so the next chunk does not start with them.
        cursor += cut
        while cursor < total and text[cursor] in " \n":
            cursor += 1

    return pieces
243+
244+
156245
def format_resolution_link(resolution: str | None) -> str:
157246
"""Formats the resolution URL into a markdown link.
158247

services/bot/src/byte_bot/plugins/astral.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
from byte_bot.lib.common.assets import ruff_logo
1313
from byte_bot.lib.common.colors import astral_purple, astral_yellow
14-
from byte_bot.lib.utils import chunk_sequence, format_ruff_rule, query_all_ruff_rules
14+
from byte_bot.lib.utils import format_ruff_rule, query_all_ruff_rules, smart_chunk_text
1515
from byte_bot.views.astral import RuffView
1616

1717
if TYPE_CHECKING:
@@ -69,9 +69,8 @@ async def ruff_rule(self, interaction: Interaction, rule: str) -> None:
6969
embed = Embed(title=f"Ruff Rule: {formatted_rule_details['name']}", color=astral_yellow)
7070
embed.add_field(name="Summary", value=formatted_rule_details["summary"], inline=False)
7171

72-
# TODO: Better chunking
73-
for idx, chunk in enumerate(chunk_sequence(formatted_rule_details["explanation"], 1000)):
74-
embed.add_field(name="" if idx else "Explanation", value="".join(chunk), inline=False)
72+
for idx, chunk in enumerate(smart_chunk_text(formatted_rule_details["explanation"], 1000)):
73+
embed.add_field(name="" if idx else "Explanation", value=chunk, inline=False)
7574

7675
embed.add_field(name="Fix", value=formatted_rule_details["fix"], inline=False)
7776
embed.add_field(name="Documentation", value=docs_field, inline=False)

tests/unit/bot/lib/test_utils.py

Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
query_all_peps,
2222
query_all_ruff_rules,
2323
run_ruff_format,
24+
smart_chunk_text,
2425
)
2526

2627

@@ -952,3 +953,162 @@ def test_get_next_friday_with_zero_delay(self) -> None:
952953

953954
# Should be same Friday
954955
assert start_dt_no_delay.day == start_dt_zero_delay.day
956+
957+
958+
class TestSmartChunkText:
    """Behavioural checks for ``smart_chunk_text``."""

    def test_empty_text(self) -> None:
        """An empty string yields no chunks."""
        assert smart_chunk_text("") == []

    def test_invalid_max_size_raises_error(self) -> None:
        """Zero and negative ``max_size`` values raise ValueError."""
        import pytest

        for bad_size in (0, -1):
            with pytest.raises(ValueError, match="max_size must be positive"):
                smart_chunk_text("test", max_size=bad_size)

    def test_text_within_limit(self) -> None:
        """Text shorter than ``max_size`` comes back as a single chunk."""
        chunks = smart_chunk_text("Short text", 100)
        assert chunks == ["Short text"]

    def test_splits_at_paragraph_boundary(self) -> None:
        """A blank line is the preferred place to split."""
        chunks = smart_chunk_text("First paragraph.\n\nSecond paragraph.", 25)
        assert chunks == ["First paragraph.", "Second paragraph."]

    def test_splits_at_sentence_boundary(self) -> None:
        """Without paragraph breaks, sentence ends are used."""
        chunks = smart_chunk_text("First sentence. Second sentence. Third sentence.", 35)
        assert len(chunks) >= 2
        assert max(map(len, chunks)) <= 35
        assert "First sentence." in chunks[0]

    def test_splits_at_newline(self) -> None:
        """Without sentence ends, single newlines are used."""
        chunks = smart_chunk_text("Line one\nLine two\nLine three", 15)
        assert len(chunks) >= 2
        assert max(map(len, chunks)) <= 15

    def test_splits_at_word_boundary(self) -> None:
        """Without newlines, spaces between words are used."""
        chunks = smart_chunk_text("word1 word2 word3 word4 word5", 12)
        assert len(chunks) >= 2
        assert max(map(len, chunks)) <= 12

    def test_preserves_markdown_links(self) -> None:
        """A markdown link that fits within ``max_size`` is not split."""
        full_link = "[this link](https://example.com/path)"
        text = (
            "First paragraph with some text here.\n\n"
            "Check [this link](https://example.com/path) for more info.\n\n"
            "Third paragraph with more content."
        )
        chunks = smart_chunk_text(text, 80)

        # First chunk holding the link label, if any chunk does.
        holder = next((chunk for chunk in chunks if "[this link]" in chunk), None)
        if holder is not None:
            assert full_link in holder
        else:
            assert full_link in " ".join(chunks)

    def test_preserves_inline_code(self) -> None:
        """Inline code that fits within ``max_size`` is not split."""
        snippet = "`my_function()`"
        text = "First sentence here.\n\nUse the `my_function()` method here.\n\nMore text follows."
        chunks = smart_chunk_text(text, 50)

        if not any(snippet in chunk for chunk in chunks):
            assert snippet in " ".join(chunks)

    def test_preserves_code_blocks(self) -> None:
        """A code block that fits within ``max_size`` stays in one chunk."""
        text = "Example:\n\n```python\ndef foo():\n pass\n```\n\nEnd of content."
        chunks = smart_chunk_text(text, 60)

        chunks_with_block = 0
        for chunk in chunks:
            fences = chunk.count("```")
            assert fences in (0, 2), "Chunk should not contain unmatched code fence"

            if "```python" not in chunk:
                continue
            assert "def foo():" in chunk, "Code block content should stay with opening fence"
            assert chunk.count("```") == 2, "Opening and closing fences should be in same chunk"
            chunks_with_block += 1

        assert chunks_with_block > 0, "Should find a chunk containing the code block"

    def test_chunks_do_not_exceed_max_size(self) -> None:
        """No chunk is longer than ``max_size``."""
        text = " ".join(letter * 500 for letter in "ABC")
        chunks = smart_chunk_text(text, 600)
        assert all(len(chunk) <= 600 for chunk in chunks)

    def test_multiple_markdown_links(self) -> None:
        """Several links in one text all survive chunking."""
        text = (
            "See [link1](https://example.com/1) and [link2](https://example.com/2) "
            "for more details on [link3](https://example.com/3)."
        )
        rejoined = " ".join(smart_chunk_text(text, 60))
        for link in (
            "[link1](https://example.com/1)",
            "[link2](https://example.com/2)",
            "[link3](https://example.com/3)",
        ):
            assert link in rejoined

    def test_mixed_protected_regions(self) -> None:
        """Inline code and a link in the same text both survive chunking."""
        text = "Use `code` and check [docs](https://docs.example.com) for help."
        rejoined = " ".join(smart_chunk_text(text, 40))
        for fragment in ("`code`", "[docs](https://docs.example.com)"):
            assert fragment in rejoined

    def test_long_code_block_preserved(self) -> None:
        """A multi-line code block is still present after chunking."""
        code_block = "```\nline1\nline2\nline3\nline4\n```"
        chunks = smart_chunk_text(f"Before\n\n{code_block}\n\nAfter", 50)
        assert code_block in "".join(chunks)

    def test_default_max_size(self) -> None:
        """With no explicit limit, chunks are capped at 1000 characters."""
        chunks = smart_chunk_text("A" * 1500)
        assert len(chunks) >= 2
        assert len(chunks[0]) <= 1000

    def test_real_world_ruff_explanation(self) -> None:
        """Text shaped like a Ruff rule explanation keeps its link and code fence."""
        text = (
            "**Why is this bad?**\n\n"
            "Long lines make code hard to read. See [PEP 8](https://pep8.org) for guidelines.\n\n"
            "**Example**\n\n"
            "```python\n"
            "x = 'very long string'\n"
            "```\n\n"
            "**Fix**\n\n"
            "Break the line using parentheses."
        )
        rejoined = "".join(smart_chunk_text(text, 100))
        for fragment in ("[PEP 8](https://pep8.org)", "```python", "```"):
            assert fragment in rejoined

0 commit comments

Comments
 (0)