From e56e685484064ea98cf5225a389f75b48af99d07 Mon Sep 17 00:00:00 2001 From: Jens Geyer Date: Sat, 16 May 2026 00:44:41 +0200 Subject: [PATCH] CASSANDRA-21131: Fix CSV COPY TO/FROM corrupting text values containing backslashes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit format_value_text in formatting.py doubles backslashes for terminal display (so SELECT output renders them visibly). When used via ExportProcess.format_value for COPY TO, this pre-escaping is applied before csv.writer runs its own backslash escaping (escapechar='\\'), resulting in quadrupled backslashes in the CSV file. On COPY FROM the csv.reader unescapes once, leaving doubled backslashes in Cassandra — data corruption that compounds on every round-trip. The fix adds an escape_backslash parameter (default True, preserving existing terminal display behaviour) and passes escape_backslash=False from the CSV export path in ExportProcess.format_value. The parameter is propagated through format_simple_collection, format_value_list/set/tuple/map, and format_value_utype so that collection types (list, set, tuple, map, UDTs) are covered as well. 
Generated-by: Claude Sonnet 4.6 (Anthropic) with human review and direction --- pylib/cqlshlib/copyutil.py | 1 + pylib/cqlshlib/formatting.py | 41 +++++++++++++++++++++--------------- 2 files changed, 25 insertions(+), 17 deletions(-) diff --git a/pylib/cqlshlib/copyutil.py b/pylib/cqlshlib/copyutil.py index 9586486af1cd..6b9d77bd9009 100644 --- a/pylib/cqlshlib/copyutil.py +++ b/pylib/cqlshlib/copyutil.py @@ -1752,6 +1752,7 @@ def format_value(self, val, cqltype): formatted = formatter(val, cqltype=cqltype, encoding=self.encoding, colormap=NO_COLOR_MAP, date_time_format=self.date_time_format, float_precision=cqltype.precision, nullval=self.nullval, quote=False, + escape_backslash=False, decimal_sep=self.decimal_sep, thousands_sep=self.thousands_sep, boolean_styles=self.boolean_styles) return formatted diff --git a/pylib/cqlshlib/formatting.py b/pylib/cqlshlib/formatting.py index cdf36e0c5308..436b8882e965 100644 --- a/pylib/cqlshlib/formatting.py +++ b/pylib/cqlshlib/formatting.py @@ -477,8 +477,8 @@ def decode_zig_zag_64(n): @formatter_for('str') -def format_value_text(val, encoding, colormap, quote=False, **_): - escapedval = val.replace('\\', '\\\\') +def format_value_text(val, encoding, colormap, quote=False, escape_backslash=True, **_): + escapedval = val.replace('\\', '\\\\') if escape_backslash else val if quote: escapedval = escapedval.replace("'", "''") escapedval = UNICODE_CONTROLCHARS_RE.sub(_show_control_chars, escapedval) @@ -496,11 +496,13 @@ def format_value_text(val, encoding, colormap, quote=False, **_): def format_simple_collection(val, cqltype, lbracket, rbracket, encoding, colormap, date_time_format, float_precision, nullval, - decimal_sep, thousands_sep, boolean_styles): + decimal_sep, thousands_sep, boolean_styles, + escape_backslash=True): subs = [format_value(sval, cqltype=stype, encoding=encoding, colormap=colormap, date_time_format=date_time_format, float_precision=float_precision, - nullval=nullval, quote=True, decimal_sep=decimal_sep, - 
thousands_sep=thousands_sep, boolean_styles=boolean_styles) + nullval=nullval, quote=True, escape_backslash=escape_backslash, + decimal_sep=decimal_sep, thousands_sep=thousands_sep, + boolean_styles=boolean_styles) for sval, stype in zip(val, cqltype.get_n_sub_types(len(val)))] bval = lbracket + ', '.join(get_str(sval) for sval in subs) + rbracket if colormap is NO_COLOR_MAP: @@ -515,26 +517,29 @@ def format_simple_collection(val, cqltype, lbracket, rbracket, encoding, @formatter_for('list') def format_value_list(val, cqltype, encoding, colormap, date_time_format, float_precision, nullval, - decimal_sep, thousands_sep, boolean_styles, **_): + decimal_sep, thousands_sep, boolean_styles, escape_backslash=True, **_): return format_simple_collection(val, cqltype, '[', ']', encoding, colormap, date_time_format, float_precision, nullval, - decimal_sep, thousands_sep, boolean_styles) + decimal_sep, thousands_sep, boolean_styles, + escape_backslash=escape_backslash) @formatter_for('tuple') def format_value_tuple(val, cqltype, encoding, colormap, date_time_format, float_precision, nullval, - decimal_sep, thousands_sep, boolean_styles, **_): + decimal_sep, thousands_sep, boolean_styles, escape_backslash=True, **_): return format_simple_collection(val, cqltype, '(', ')', encoding, colormap, date_time_format, float_precision, nullval, - decimal_sep, thousands_sep, boolean_styles) + decimal_sep, thousands_sep, boolean_styles, + escape_backslash=escape_backslash) @formatter_for('set') def format_value_set(val, cqltype, encoding, colormap, date_time_format, float_precision, nullval, - decimal_sep, thousands_sep, boolean_styles, **_): + decimal_sep, thousands_sep, boolean_styles, escape_backslash=True, **_): return format_simple_collection(val, cqltype, '{', '}', encoding, colormap, date_time_format, float_precision, nullval, - decimal_sep, thousands_sep, boolean_styles) + decimal_sep, thousands_sep, boolean_styles, + escape_backslash=escape_backslash) 
formatter_for('frozenset')(format_value_set) @@ -544,12 +549,13 @@ def format_value_set(val, cqltype, encoding, colormap, date_time_format, float_p @formatter_for('dict') def format_value_map(val, cqltype, encoding, colormap, date_time_format, float_precision, nullval, - decimal_sep, thousands_sep, boolean_styles, **_): + decimal_sep, thousands_sep, boolean_styles, escape_backslash=True, **_): def subformat(v, t): return format_value(v, cqltype=t, encoding=encoding, colormap=colormap, date_time_format=date_time_format, float_precision=float_precision, - nullval=nullval, quote=True, decimal_sep=decimal_sep, - thousands_sep=thousands_sep, boolean_styles=boolean_styles) + nullval=nullval, quote=True, escape_backslash=escape_backslash, + decimal_sep=decimal_sep, thousands_sep=thousands_sep, + boolean_styles=boolean_styles) subs = [(subformat(k, cqltype.sub_types[0]), subformat(v, cqltype.sub_types[1])) for (k, v) in sorted(val.items())] bval = '{' + ', '.join(get_str(k) + ': ' + get_str(v) for (k, v) in subs) + '}' @@ -572,14 +578,15 @@ def subformat(v, t): def format_value_utype(val, cqltype, encoding, colormap, date_time_format, float_precision, nullval, - decimal_sep, thousands_sep, boolean_styles, **_): + decimal_sep, thousands_sep, boolean_styles, escape_backslash=True, **_): def format_field_value(v, t): if v is None: return colorme(nullval, colormap, 'error') return format_value(v, cqltype=t, encoding=encoding, colormap=colormap, date_time_format=date_time_format, float_precision=float_precision, - nullval=nullval, quote=True, decimal_sep=decimal_sep, - thousands_sep=thousands_sep, boolean_styles=boolean_styles) + nullval=nullval, quote=True, escape_backslash=escape_backslash, + decimal_sep=decimal_sep, thousands_sep=thousands_sep, + boolean_styles=boolean_styles) def format_field_name(name): return format_value_text(name, encoding=encoding, colormap=colormap, quote=False)