From 36aa8805c7243f4be60671d48679567abb00b713 Mon Sep 17 00:00:00 2001 From: Arvind Kandpal Date: Mon, 18 May 2026 14:55:03 +0530 Subject: [PATCH 1/3] CASSANDRA-21381: Fix control character corruption during CSV export in cqlsh --- pylib/cqlshlib/copyutil.py | 3 ++- pylib/cqlshlib/formatting.py | 41 ++++++++++++++++++++++-------------- 2 files changed, 27 insertions(+), 17 deletions(-) diff --git a/pylib/cqlshlib/copyutil.py b/pylib/cqlshlib/copyutil.py index 9586486af1cd..8b3b8cb19ced 100644 --- a/pylib/cqlshlib/copyutil.py +++ b/pylib/cqlshlib/copyutil.py @@ -1753,7 +1753,8 @@ def format_value(self, val, cqltype): encoding=self.encoding, colormap=NO_COLOR_MAP, date_time_format=self.date_time_format, float_precision=cqltype.precision, nullval=self.nullval, quote=False, decimal_sep=self.decimal_sep, thousands_sep=self.thousands_sep, - boolean_styles=self.boolean_styles) + boolean_styles=self.boolean_styles, + escape_control_chars=False) return formatted def close(self): diff --git a/pylib/cqlshlib/formatting.py b/pylib/cqlshlib/formatting.py index cdf36e0c5308..7798950cb963 100644 --- a/pylib/cqlshlib/formatting.py +++ b/pylib/cqlshlib/formatting.py @@ -61,7 +61,8 @@ def _turn_bits_red(match): def format_by_type(val, cqltype, encoding, colormap=None, addcolor=False, nullval=None, date_time_format=None, float_precision=None, - decimal_sep=None, thousands_sep=None, boolean_styles=None): + decimal_sep=None, thousands_sep=None, boolean_styles=None, + escape_control_chars=True): if nullval is None: nullval = default_null_placeholder if val is None: @@ -77,7 +78,7 @@ def format_by_type(val, cqltype, encoding, colormap=None, addcolor=False, return format_value(val, cqltype=cqltype, encoding=encoding, colormap=colormap, date_time_format=date_time_format, float_precision=float_precision, nullval=nullval, decimal_sep=decimal_sep, thousands_sep=thousands_sep, - boolean_styles=boolean_styles) + boolean_styles=boolean_styles, escape_control_chars=escape_control_chars) def 
color_text(bval, colormap, displaywidth=None): @@ -477,11 +478,12 @@ def decode_zig_zag_64(n): @formatter_for('str') -def format_value_text(val, encoding, colormap, quote=False, **_): +def format_value_text(val, encoding, colormap, quote=False, escape_control_chars=True, **_): escapedval = val.replace('\\', '\\\\') if quote: escapedval = escapedval.replace("'", "''") - escapedval = UNICODE_CONTROLCHARS_RE.sub(_show_control_chars, escapedval) + if escape_control_chars: + escapedval = UNICODE_CONTROLCHARS_RE.sub(_show_control_chars, escapedval) bval = escapedval if quote: bval = "'{}'".format(bval) @@ -496,11 +498,13 @@ def format_value_text(val, encoding, colormap, quote=False, **_): def format_simple_collection(val, cqltype, lbracket, rbracket, encoding, colormap, date_time_format, float_precision, nullval, - decimal_sep, thousands_sep, boolean_styles): + decimal_sep, thousands_sep, boolean_styles, + escape_control_chars=True): subs = [format_value(sval, cqltype=stype, encoding=encoding, colormap=colormap, date_time_format=date_time_format, float_precision=float_precision, nullval=nullval, quote=True, decimal_sep=decimal_sep, - thousands_sep=thousands_sep, boolean_styles=boolean_styles) + thousands_sep=thousands_sep, boolean_styles=boolean_styles, + escape_control_chars=escape_control_chars) for sval, stype in zip(val, cqltype.get_n_sub_types(len(val)))] bval = lbracket + ', '.join(get_str(sval) for sval in subs) + rbracket if colormap is NO_COLOR_MAP: @@ -515,26 +519,29 @@ def format_simple_collection(val, cqltype, lbracket, rbracket, encoding, @formatter_for('list') def format_value_list(val, cqltype, encoding, colormap, date_time_format, float_precision, nullval, - decimal_sep, thousands_sep, boolean_styles, **_): + decimal_sep, thousands_sep, boolean_styles, escape_control_chars=True, **_): return format_simple_collection(val, cqltype, '[', ']', encoding, colormap, date_time_format, float_precision, nullval, - decimal_sep, thousands_sep, boolean_styles) + 
decimal_sep, thousands_sep, boolean_styles, + escape_control_chars=escape_control_chars) @formatter_for('tuple') def format_value_tuple(val, cqltype, encoding, colormap, date_time_format, float_precision, nullval, - decimal_sep, thousands_sep, boolean_styles, **_): + decimal_sep, thousands_sep, boolean_styles, escape_control_chars=True, **_): return format_simple_collection(val, cqltype, '(', ')', encoding, colormap, date_time_format, float_precision, nullval, - decimal_sep, thousands_sep, boolean_styles) + decimal_sep, thousands_sep, boolean_styles, + escape_control_chars=escape_control_chars) @formatter_for('set') def format_value_set(val, cqltype, encoding, colormap, date_time_format, float_precision, nullval, - decimal_sep, thousands_sep, boolean_styles, **_): + decimal_sep, thousands_sep, boolean_styles, escape_control_chars=True, **_): return format_simple_collection(val, cqltype, '{', '}', encoding, colormap, date_time_format, float_precision, nullval, - decimal_sep, thousands_sep, boolean_styles) + decimal_sep, thousands_sep, boolean_styles, + escape_control_chars=escape_control_chars) formatter_for('frozenset')(format_value_set) @@ -544,12 +551,13 @@ def format_value_set(val, cqltype, encoding, colormap, date_time_format, float_p @formatter_for('dict') def format_value_map(val, cqltype, encoding, colormap, date_time_format, float_precision, nullval, - decimal_sep, thousands_sep, boolean_styles, **_): + decimal_sep, thousands_sep, boolean_styles, escape_control_chars=True, **_): def subformat(v, t): return format_value(v, cqltype=t, encoding=encoding, colormap=colormap, date_time_format=date_time_format, float_precision=float_precision, nullval=nullval, quote=True, decimal_sep=decimal_sep, - thousands_sep=thousands_sep, boolean_styles=boolean_styles) + thousands_sep=thousands_sep, boolean_styles=boolean_styles, + escape_control_chars=escape_control_chars) subs = [(subformat(k, cqltype.sub_types[0]), subformat(v, cqltype.sub_types[1])) for (k, v) in 
sorted(val.items())] bval = '{' + ', '.join(get_str(k) + ': ' + get_str(v) for (k, v) in subs) + '}' @@ -572,14 +580,15 @@ def subformat(v, t): def format_value_utype(val, cqltype, encoding, colormap, date_time_format, float_precision, nullval, - decimal_sep, thousands_sep, boolean_styles, **_): + decimal_sep, thousands_sep, boolean_styles, escape_control_chars=True, **_): def format_field_value(v, t): if v is None: return colorme(nullval, colormap, 'error') return format_value(v, cqltype=t, encoding=encoding, colormap=colormap, date_time_format=date_time_format, float_precision=float_precision, nullval=nullval, quote=True, decimal_sep=decimal_sep, - thousands_sep=thousands_sep, boolean_styles=boolean_styles) + thousands_sep=thousands_sep, boolean_styles=boolean_styles, + escape_control_chars=escape_control_chars) def format_field_name(name): return format_value_text(name, encoding=encoding, colormap=colormap, quote=False) From fb55b2ec14dc47eefd8bc8e32d0efc109f52c508 Mon Sep 17 00:00:00 2001 From: Arvind Kandpal Date: Tue, 19 May 2026 11:09:10 +0530 Subject: [PATCH 2/3] Address PR review: fix UDT field name formatting and add comprehensive unit tests for control characters --- pylib/cqlshlib/formatting.py | 3 +- pylib/cqlshlib/test/test_formatting.py | 189 +++++++++++++++++++++++++ 2 files changed, 191 insertions(+), 1 deletion(-) create mode 100644 pylib/cqlshlib/test/test_formatting.py diff --git a/pylib/cqlshlib/formatting.py b/pylib/cqlshlib/formatting.py index 7798950cb963..c77c1b04e7ca 100644 --- a/pylib/cqlshlib/formatting.py +++ b/pylib/cqlshlib/formatting.py @@ -591,7 +591,8 @@ def format_field_value(v, t): escape_control_chars=escape_control_chars) def format_field_name(name): - return format_value_text(name, encoding=encoding, colormap=colormap, quote=False) + return format_value_text(name, encoding=encoding, colormap=colormap, quote=False, + escape_control_chars=escape_control_chars) subs = [(format_field_name(k), format_field_value(v, t)) for ((k, v), 
t) in zip(list(val._asdict().items()), cqltype.sub_types)] diff --git a/pylib/cqlshlib/test/test_formatting.py b/pylib/cqlshlib/test/test_formatting.py new file mode 100644 index 000000000000..ce3eb6734cdc --- /dev/null +++ b/pylib/cqlshlib/test/test_formatting.py @@ -0,0 +1,189 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +from collections import OrderedDict + +from cqlshlib.displaying import NO_COLOR_MAP +from cqlshlib.formatting import ( + format_value_text, + format_value_list, + format_value_set, + format_value_tuple, + format_value_map, + format_value_utype, + CqlType +) + + +class _MockUDT: + """ Mimics the driver's UDT shape (exposes _asdict()) without the + identifier restrictions Python's namedtuple imposes on field names. 
""" + def __init__(self, items): + self._items = items + + def _asdict(self): + return OrderedDict(self._items) + + +class TestFormatting(unittest.TestCase): + + def setUp(self): + self.fmt_kwargs = { + 'encoding': 'utf-8', + 'colormap': NO_COLOR_MAP, + 'date_time_format': None, + 'float_precision': 3, + 'nullval': 'null', + 'decimal_sep': '.', + 'thousands_sep': ',', + 'boolean_styles': None + } + + def test_format_value_text_control_chars(self): + """ + Test that control chars are escaped for terminal display (default), + but preserved when escape_control_chars=False is passed (for CSV export). + """ + self.assertEqual( + format_value_text("Hello World", encoding='utf-8', colormap=NO_COLOR_MAP), + "Hello World" + ) + + test_string = "Hello\nWorld\x00\tTest\r" + + terminal_output = format_value_text(test_string, encoding='utf-8', colormap=NO_COLOR_MAP) + self.assertEqual(terminal_output, "Hello\\nWorld\\x00\\tTest\\r") + + csv_output = format_value_text(test_string, encoding='utf-8', colormap=NO_COLOR_MAP, escape_control_chars=False) + self.assertEqual(csv_output, test_string) + + def test_format_value_list_control_chars(self): + """ Test control character propagation in lists """ + list_val = ["line1\nline2", "null\x00byte"] + cql_type = CqlType('list<text>') + + terminal_output = format_value_list(list_val, cqltype=cql_type, **self.fmt_kwargs) + self.assertEqual(terminal_output, "['line1\\nline2', 'null\\x00byte']") + + csv_output = format_value_list(list_val, cqltype=cql_type, escape_control_chars=False, **self.fmt_kwargs) + self.assertEqual(csv_output, "['line1\nline2', 'null\x00byte']") + + def test_format_value_map_control_chars(self): + """ Test control character propagation in map keys and values """ + map_val = {"key\n1": "val\x001"} + cql_type = CqlType('map<text, text>') + + terminal_output = format_value_map(map_val, cqltype=cql_type, **self.fmt_kwargs) + self.assertEqual(terminal_output, "{'key\\n1': 'val\\x001'}") + + csv_output = format_value_map(map_val, 
cqltype=cql_type, escape_control_chars=False, **self.fmt_kwargs) + self.assertEqual(csv_output, "{'key\n1': 'val\x001'}") + + def test_udt_field_name_and_value_control_chars(self): + """ Test control character propagation in UDT field names and values """ + # The driver exposes UDT instances via an _asdict() shape; namedtuple + # cannot be used here because UDT field names may contain characters + # (e.g. '\n') that are not valid Python identifiers. + udt_val = _MockUDT([('field_a\n', 'val\n1'), ('field_b', 'val\x002')]) + + cql_type = CqlType('text') + cql_type.sub_types = [CqlType('text'), CqlType('text')] + + terminal_output = format_value_utype(udt_val, cqltype=cql_type, **self.fmt_kwargs) + self.assertEqual(terminal_output, "{field_a\\n: 'val\\n1', field_b: 'val\\x002'}") + + csv_output = format_value_utype(udt_val, cqltype=cql_type, escape_control_chars=False, **self.fmt_kwargs) + self.assertEqual(csv_output, "{field_a\n: 'val\n1', field_b: 'val\x002'}") + + def test_format_value_text_empty_string(self): + """ Empty strings pass through cleanly in both modes (no spurious + characters introduced by the regex sub or the escape pipeline). """ + self.assertEqual( + format_value_text("", encoding='utf-8', colormap=NO_COLOR_MAP), + "" + ) + self.assertEqual( + format_value_text("", encoding='utf-8', colormap=NO_COLOR_MAP, escape_control_chars=False), + "" + ) + + def test_format_value_text_latin1_and_del_control_chars(self): + """ UNICODE_CONTROLCHARS_RE matches [\\x00-\\x1f\\x7f-\\xa0]: in addition + to the common C0 controls, DEL (\\x7f), C1 controls (e.g. \\x80) and + NBSP (\\xa0) must also be escaped on terminals and preserved for CSV. 
""" + test_string = "del\x7fmid\x80end\xa0nbsp" + + terminal_output = format_value_text(test_string, encoding='utf-8', colormap=NO_COLOR_MAP) + self.assertEqual(terminal_output, "del\\x7fmid\\x80end\\xa0nbsp") + + csv_output = format_value_text(test_string, encoding='utf-8', colormap=NO_COLOR_MAP, + escape_control_chars=False) + self.assertEqual(csv_output, test_string) + + def test_format_value_text_consecutive_control_chars(self): + """ A run of adjacent control chars must be escaped/preserved + character-by-character, not collapsed. """ + test_string = "a\n\n\x00\x00b" + + terminal_output = format_value_text(test_string, encoding='utf-8', colormap=NO_COLOR_MAP) + self.assertEqual(terminal_output, "a\\n\\n\\x00\\x00b") + + csv_output = format_value_text(test_string, encoding='utf-8', colormap=NO_COLOR_MAP, + escape_control_chars=False) + self.assertEqual(csv_output, test_string) + + def test_format_value_tuple_control_chars(self): + """ format_value_tuple delegates to format_simple_collection; verify + the flag propagates to its element formatters. """ + tuple_val = ("a\n", "b\x00") + cql_type = CqlType('tuple<text, text>') + + terminal_output = format_value_tuple(tuple_val, cqltype=cql_type, **self.fmt_kwargs) + self.assertEqual(terminal_output, "('a\\n', 'b\\x00')") + + csv_output = format_value_tuple(tuple_val, cqltype=cql_type, escape_control_chars=False, + **self.fmt_kwargs) + self.assertEqual(csv_output, "('a\n', 'b\x00')") + + def test_format_value_set_control_chars(self): + """ format_value_set delegates to format_simple_collection. A list is + passed here because format_simple_collection just iterates val and + CPython set iteration order depends on PYTHONHASHSEED. 
""" + set_val = ["a\n", "b\x00"] + cql_type = CqlType('set<text>') + + terminal_output = format_value_set(set_val, cqltype=cql_type, **self.fmt_kwargs) + self.assertEqual(terminal_output, "{'a\\n', 'b\\x00'}") + + csv_output = format_value_set(set_val, cqltype=cql_type, escape_control_chars=False, + **self.fmt_kwargs) + self.assertEqual(csv_output, "{'a\n', 'b\x00'}") + + def test_nested_map_of_list_control_chars(self): + """ Two-level nesting (map<text, list<text>>): the flag must propagate + through the outer map's subformat() into the inner list's element + formatters as well. Guards against regressions where the flag is + forwarded at one level but dropped at the next. """ + nested_val = {"key\n1": ["v\x001", "v\n2"]} + cql_type = CqlType('map<text, list<text>>') + + terminal_output = format_value_map(nested_val, cqltype=cql_type, **self.fmt_kwargs) + self.assertEqual(terminal_output, "{'key\\n1': ['v\\x001', 'v\\n2']}") + + csv_output = format_value_map(nested_val, cqltype=cql_type, escape_control_chars=False, + **self.fmt_kwargs) + self.assertEqual(csv_output, "{'key\n1': ['v\x001', 'v\n2']}") \ No newline at end of file From 7599bb00701f0d4ae5fa8945bf6d18b1da00b523 Mon Sep 17 00:00:00 2001 From: Arvind Kandpal Date: Wed, 20 May 2026 12:43:26 +0530 Subject: [PATCH 3/3] CASSANDRA-21381: Fix conditional backslash doubling in format_value_text and update regression tests --- pylib/cqlshlib/formatting.py | 6 +++++- pylib/cqlshlib/test/test_formatting.py | 15 ++++++++++----- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/pylib/cqlshlib/formatting.py b/pylib/cqlshlib/formatting.py index c77c1b04e7ca..e19b944c7b71 100644 --- a/pylib/cqlshlib/formatting.py +++ b/pylib/cqlshlib/formatting.py @@ -479,7 +479,11 @@ def decode_zig_zag_64(n): @formatter_for('str') def format_value_text(val, encoding, colormap, quote=False, escape_control_chars=True, **_): - escapedval = val.replace('\\', '\\\\') + if escape_control_chars: + escapedval = val.replace('\\', '\\\\') + else: + escapedval = val + if 
quote: escapedval = escapedval.replace("'", "''") if escape_control_chars: diff --git a/pylib/cqlshlib/test/test_formatting.py b/pylib/cqlshlib/test/test_formatting.py index ce3eb6734cdc..a246a0371c6e 100644 --- a/pylib/cqlshlib/test/test_formatting.py +++ b/pylib/cqlshlib/test/test_formatting.py @@ -55,22 +55,27 @@ def setUp(self): def test_format_value_text_control_chars(self): """ - Test that control chars are escaped for terminal display (default), - but preserved when escape_control_chars=False is passed (for CSV export). + Test that control chars AND literal backslashes are escaped for terminal + display (default), but BOTH are preserved verbatim when + escape_control_chars=False is passed (for CSV export). """ self.assertEqual( format_value_text("Hello World", encoding='utf-8', colormap=NO_COLOR_MAP), "Hello World" ) - test_string = "Hello\nWorld\x00\tTest\r" + test_string = "C:\\Users\\alice\nHello\x00" terminal_output = format_value_text(test_string, encoding='utf-8', colormap=NO_COLOR_MAP) - self.assertEqual(terminal_output, "Hello\\nWorld\\x00\\tTest\\r") + self.assertEqual(terminal_output, "C:\\\\Users\\\\alice\\nHello\\x00") - csv_output = format_value_text(test_string, encoding='utf-8', colormap=NO_COLOR_MAP, escape_control_chars=False) + csv_output = format_value_text(test_string, encoding='utf-8', colormap=NO_COLOR_MAP, + escape_control_chars=False) self.assertEqual(csv_output, test_string) + self.assertIn('C:\\Users', csv_output) + self.assertNotIn('C:\\\\Users', csv_output) + def test_format_value_list_control_chars(self): """ Test control character propagation in lists """ list_val = ["line1\nline2", "null\x00byte"]