(improvement) metadata: single-pass column classification in _build_table_columns

mykaul · mykaul · commit d937ce84e121 · 2026-03-14T10:59:55.000+02:00
Rewrite _build_table_columns to classify columns by kind in a single
pass instead of iterating col_rows three times with list comprehensions.
This also fixes a bug where the third pass excluded rows of kind
'clustering_key' instead of 'clustering', so clustering columns passed
the filter and were processed a second time by the regular-column loop.

Additionally, use in-place sort() instead of sorted() to avoid creating
intermediate list copies, and append the already-built column_meta
object to partition_key/clustering_key instead of re-looking it up
from meta.columns by name.

Combined benchmark results for the full optimization series (A-F), after vs before:
  Row creation + access: 323 ns/row vs 485 ns/row (1.50x faster)
  _build_table_columns:  9.0 us/table vs 9.9 us/table (1.10x faster)
  Full pipeline (100 tables x 20 cols): 0.79 ms vs 1.57 ms (1.98x faster)
  Memory per row: 48 bytes vs 272 bytes (5.7x reduction)
  __slots__ per instance: 80 bytes (saves ~104 bytes __dict__ overhead)
diff --git a/cassandra/metadata.py b/cassandra/metadata.py
@@ -2673,31 +2673,40 @@ def _build_table_options(self, row):
         return dict((o, row.get(o)) for o in self.recognized_table_options if o in row)
 
     def _build_table_columns(self, meta, col_rows, compact_static=False, is_dense=False, virtual=False):
-        # partition key
-        partition_rows = [r for r in col_rows
-                          if r.get('kind', None) == "partition_key"]
+        # Single-pass classification of column rows by kind
+        partition_rows = []
+        clustering_rows = []
+        other_rows = []
+        for r in col_rows:
+            kind = r.get('kind', None)
+            if kind == "partition_key":
+                partition_rows.append(r)
+            elif kind == "clustering":
+                if not compact_static:
+                    clustering_rows.append(r)
+                # else: skip clustering rows entirely for compact_static tables
+            else:
+                other_rows.append(r)
+
+        # partition key — must be inserted first into meta.columns for CQL export ordering
         if len(partition_rows) > 1:
-            partition_rows = sorted(partition_rows, key=lambda row: row.get('position'))
+            partition_rows.sort(key=lambda row: row.get('position'))
         for r in partition_rows:
-            # we have to add meta here (and not in the later loop) because TableMetadata.columns
-            # assumes keys are inserted first, in order, when exporting CQL
             column_meta = self._build_column_metadata(meta, r)
             meta.columns[column_meta.name] = column_meta
-            meta.partition_key.append(meta.columns[r.get('column_name')])
+            meta.partition_key.append(column_meta)
 
         # clustering key
-        if not compact_static:
-            clustering_rows = [r for r in col_rows
-                               if r.get('kind', None) == "clustering"]
+        if clustering_rows:
             if len(clustering_rows) > 1:
-                clustering_rows = sorted(clustering_rows, key=lambda row: row.get('position'))
+                clustering_rows.sort(key=lambda row: row.get('position'))
             for r in clustering_rows:
                 column_meta = self._build_column_metadata(meta, r)
                 meta.columns[column_meta.name] = column_meta
-                meta.clustering_key.append(meta.columns[r.get('column_name')])
+                meta.clustering_key.append(column_meta)
 
-        for col_row in (r for r in col_rows
-                        if r.get('kind', None) not in ('partition_key', 'clustering_key')):
+        # remaining columns (static, regular, etc.)
+        for col_row in other_rows:
             column_meta = self._build_column_metadata(meta, col_row)
             if is_dense and column_meta.cql_type == types.cql_empty_type:
                 continue