|
21 | 21 | query_all_peps, |
22 | 22 | query_all_ruff_rules, |
23 | 23 | run_ruff_format, |
| 24 | + smart_chunk_text, |
24 | 25 | ) |
25 | 26 |
|
26 | 27 |
|
@@ -952,3 +953,162 @@ def test_get_next_friday_with_zero_delay(self) -> None: |
952 | 953 |
|
953 | 954 | # Should be same Friday |
954 | 955 | assert start_dt_no_delay.day == start_dt_zero_delay.day |
| 956 | + |
| 957 | + |
class TestSmartChunkText:
    """Tests for smart_chunk_text function.

    The tests cover input validation, the size limit on every chunk, the
    boundaries at which text is expected to be split (paragraph, sentence,
    newline, word), and that markdown links, inline code and fenced code
    blocks are not torn apart when they fit within ``max_size``.
    """

    def test_empty_text(self) -> None:
        """Empty input produces an empty list of chunks."""
        assert smart_chunk_text("") == []

    def test_invalid_max_size_raises_error(self) -> None:
        """Zero and negative max_size values raise ValueError."""
        import pytest

        for bad_size in (0, -1):
            with pytest.raises(ValueError, match="max_size must be positive"):
                smart_chunk_text("test", max_size=bad_size)

    def test_text_within_limit(self) -> None:
        """Text shorter than max_size is returned as a single chunk."""
        text = "Short text"
        assert smart_chunk_text(text, 100) == ["Short text"]

    def test_splits_at_paragraph_boundary(self) -> None:
        """Two paragraphs that do not fit together are split at the blank line."""
        text = "First paragraph.\n\nSecond paragraph."
        result = smart_chunk_text(text, 25)
        # Exact equality also pins that the separator is not kept in either chunk.
        assert result == ["First paragraph.", "Second paragraph."]

    def test_splits_at_sentence_boundary(self) -> None:
        """Without paragraph breaks, the first chunk keeps its sentence whole."""
        limit = 35
        text = "First sentence. Second sentence. Third sentence."
        result = smart_chunk_text(text, limit)
        assert len(result) >= 2
        assert all(len(chunk) <= limit for chunk in result)
        assert "First sentence." in result[0]

    def test_splits_at_newline(self) -> None:
        """Newline-separated lines are split into chunks within the limit."""
        limit = 15
        text = "Line one\nLine two\nLine three"
        result = smart_chunk_text(text, limit)
        assert len(result) >= 2
        assert all(len(chunk) <= limit for chunk in result)

    def test_splits_at_word_boundary(self) -> None:
        """Space-separated words are split into chunks within the limit."""
        limit = 12
        text = "word1 word2 word3 word4 word5"
        result = smart_chunk_text(text, limit)
        assert len(result) >= 2
        assert all(len(chunk) <= limit for chunk in result)

    def test_preserves_markdown_links(self) -> None:
        """A markdown link that fits within max_size stays inside one chunk."""
        link = "[this link](https://example.com/path)"
        text = (
            "First paragraph with some text here.\n\n"
            "Check [this link](https://example.com/path) for more info.\n\n"
            "Third paragraph with more content."
        )
        result = smart_chunk_text(text, 80)
        # The link text contains a space, so checking a space-joined string
        # would hide a split at that space; require the link in a single chunk.
        assert any(link in chunk for chunk in result), "Markdown link should not be split across chunks"

    def test_preserves_inline_code(self) -> None:
        """Inline code that fits within max_size stays inside one chunk."""
        text = "First sentence here.\n\nUse the `my_function()` method here.\n\nMore text follows."
        result = smart_chunk_text(text, 50)
        assert any("`my_function()`" in chunk for chunk in result), "Inline code should not be split across chunks"

    def test_preserves_code_blocks(self) -> None:
        """A fenced code block that fits within max_size stays inside one chunk."""
        text = "Example:\n\n```python\ndef foo():\n pass\n```\n\nEnd of content."
        result = smart_chunk_text(text, 60)

        block_chunk_found = False
        for chunk in result:
            fence_count = chunk.count("```")
            # A chunk holds either no fence at all or a matched open/close pair.
            assert fence_count in (0, 2), "Chunk should not contain unmatched code fence"

            if "```python" in chunk:
                assert "def foo():" in chunk, "Code block content should stay with opening fence"
                assert fence_count == 2, "Opening and closing fences should be in same chunk"
                block_chunk_found = True

        assert block_chunk_found, "Should find a chunk containing the code block"

    def test_chunks_do_not_exceed_max_size(self) -> None:
        """No chunk is longer than max_size, even with very long words."""
        limit = 600
        text = "A" * 500 + " " + "B" * 500 + " " + "C" * 500
        result = smart_chunk_text(text, limit)
        assert all(len(chunk) <= limit for chunk in result)

    def test_multiple_markdown_links(self) -> None:
        """Every link in a multi-link sentence survives chunking intact."""
        text = (
            "See [link1](https://example.com/1) and [link2](https://example.com/2) "
            "for more details on [link3](https://example.com/3)."
        )
        result = smart_chunk_text(text, 60)
        combined = " ".join(result)
        # None of these links contains a space, so finding one in the
        # space-joined text means it sits whole inside a single chunk.
        for link in (
            "[link1](https://example.com/1)",
            "[link2](https://example.com/2)",
            "[link3](https://example.com/3)",
        ):
            assert link in combined

    def test_mixed_protected_regions(self) -> None:
        """Inline code and a markdown link in the same sentence both survive."""
        text = "Use `code` and check [docs](https://docs.example.com) for help."
        result = smart_chunk_text(text, 40)
        combined = " ".join(result)
        assert "`code`" in combined
        assert "[docs](https://docs.example.com)" in combined

    def test_long_code_block_preserved(self) -> None:
        """A multi-line fenced block appears verbatim in the concatenated chunks."""
        code_block = "```\nline1\nline2\nline3\nline4\n```"
        text = f"Before\n\n{code_block}\n\nAfter"
        result = smart_chunk_text(text, 50)
        # Joined without a separator: the block must come back byte-for-byte.
        combined = "".join(result)
        assert code_block in combined

    def test_default_max_size(self) -> None:
        """Calling without max_size splits 1500 characters into chunks of at most 1000."""
        text = "A" * 1500
        result = smart_chunk_text(text)
        assert len(result) >= 2
        assert all(len(chunk) <= 1000 for chunk in result)

    def test_real_world_ruff_explanation(self) -> None:
        """Content shaped like a ruff rule explanation keeps its link and code fences."""
        text = (
            "**Why is this bad?**\n\n"
            "Long lines make code hard to read. See [PEP 8](https://pep8.org) for guidelines.\n\n"
            "**Example**\n\n"
            "```python\n"
            "x = 'very long string'\n"
            "```\n\n"
            "**Fix**\n\n"
            "Break the line using parentheses."
        )
        result = smart_chunk_text(text, 100)
        combined = "".join(result)
        assert "[PEP 8](https://pep8.org)" in combined
        assert "```python" in combined
        # Opening and closing fence must both still be present.
        assert combined.count("```") >= 2
0 commit comments