Added C-Top2Vec

x-tabdeveloping · x-tabdeveloping · commit 2146f68f76fa · 2026-04-02T12:38:42.000+02:00
diff --git a/turftopic/__init__.py b/turftopic/__init__.py
@@ -2,7 +2,12 @@
 from turftopic._datamapplot import build_datamapplot
 from turftopic.base import ContextualModel
 from turftopic.error import NotInstalled
-from turftopic.models.cluster import BERTopic, ClusteringTopicModel, Top2Vec
+from turftopic.models.cluster import (
+    BERTopic,
+    ClusteringTopicModel,
+    CTop2Vec,
+    Top2Vec,
+)
 from turftopic.models.cvp import ConceptVectorProjection
 from turftopic.models.decomp import S3, SemanticSignalSeparation
 from turftopic.models.fastopic import FASTopic
@@ -29,6 +34,7 @@
     "ContextualModel",
     "FASTopic",
     "Top2Vec",
+    "CTop2Vec",
     "BERTopic",
     "load_model",
     "build_datamapplot",
diff --git a/turftopic/models/cluster.py b/turftopic/models/cluster.py
@@ -5,7 +5,7 @@
 import webbrowser
 from datetime import datetime
 from pathlib import Path
-from typing import Any, Iterable, Literal, Optional, Sequence, Union
+from typing import Any, Callable, Iterable, Literal, Optional, Sequence, Union
 
 import numpy as np
 from rich.console import Console
@@ -30,6 +30,7 @@
     npmi,
     soft_ctf_idf,
 )
+from turftopic.late import LateSentenceTransformer, LateWrapper
 from turftopic.models._hierarchical_clusters import (
     VALID_LINKAGE_METHODS,
     ClusterNode,
@@ -43,6 +44,7 @@
 )
 from turftopic.types import VALID_DISTANCE_METRICS, DistanceMetric
 from turftopic.utils import safe_binarize
+from turftopic.vectorizers import PhraseVectorizer
 from turftopic.vectorizers.default import default_vectorizer
 
 integer_message = """
@@ -865,3 +867,96 @@ def __init__(
             reduction_distance_metric=reduction_distance_metric,
             reduction_topic_representation=reduction_topic_representation,
         )
+
+
+class CTop2Vec(LateWrapper):
+    """Convenience function to construct a CTop2Vec model in Turftopic.
+    The model is essentially the same as ClusteringTopicModel in a Late Wrapper
+    with defaults that resemble CTop2Vec. This includes:
+
+    1. A late interaction embedding model, with windowed aggregation
+    2. UMAP reduction
+    3. HDBSCAN clustering
+    4. Centroid term importance
+    5. Phrase vectorizer
+
+    ```bash
+    pip install turftopic[umap-learn]
+    ```
+
+    ```python
+    from turftopic import CTop2Vec
+
+    corpus: list[str] = ["some text", "more text", ...]
+
+    model = CTop2Vec().fit(corpus)
+    model.print_topics()
+    ```
+    """
+
+    def __init__(
+        self,
+        encoder: Union[
+            Encoder, str, MultimodalEncoder
+        ] = "sentence-transformers/all-MiniLM-L6-v2",
+        vectorizer: Optional[CountVectorizer] = None,
+        dimensionality_reduction: Optional[TransformerMixin] = None,
+        clustering: Optional[ClusterMixin] = None,
+        feature_importance: WordImportance = "centroid",
+        n_reduce_to: Optional[int] = None,
+        reduction_method: LinkageMethod = "smallest",
+        reduction_distance_metric: DistanceMetric = "cosine",
+        reduction_topic_representation: TopicRepresentation = "centroid",
+        window_size: Optional[int] = 50,
+        step_size: Optional[int] = 40,
+        pooling: Optional[Callable] = np.mean,
+        random_state: Optional[int] = None,
+    ):
+        if dimensionality_reduction is None:
+            try:
+                from umap import UMAP
+            except ModuleNotFoundError as e:
+                raise ModuleNotFoundError(
+                    "UMAP is not installed in your environment, but Top2Vec requires it."
+                ) from e
+            dimensionality_reduction = UMAP(
+                n_neighbors=15,
+                n_components=5,
+                min_dist=0.0,
+                metric="cosine",
+                random_state=random_state,
+            )
+        if clustering is None:
+            clustering = HDBSCAN(
+                min_cluster_size=15,
+                metric="euclidean",
+                cluster_selection_method="eom",
+            )
+        self.encoder = encoder
+        self.vectorizer = vectorizer
+        self.dimensionality_reduction = dimensionality_reduction
+        self.clustering = clustering
+        self.feature_importance = feature_importance
+        self.n_reduce_to = n_reduce_to
+        self.reduction_method = reduction_method
+        self.reduction_distance_metric = reduction_distance_metric
+        self.reduction_topic_representation = reduction_topic_representation
+        self.random_state = random_state
+        self.model = ClusteringTopicModel(
+            encoder=encoder,
+            vectorizer=vectorizer,
+            dimensionality_reduction=dimensionality_reduction,
+            clustering=clustering,
+            n_reduce_to=n_reduce_to,
+            random_state=random_state,
+            feature_importance=feature_importance,
+            reduction_method=reduction_method,
+            reduction_distance_metric=reduction_distance_metric,
+            reduction_topic_representation=reduction_topic_representation,
+        )
+        super().__init__(
+            self.model,
+            window_size=self.window_size,
+            step_size=self.step_size,
+            pooling=self.pooling,
+        )