Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
169 changes: 84 additions & 85 deletions bertopic/_bertopic.py

Large diffs are not rendered by default.

11 changes: 4 additions & 7 deletions bertopic/_save_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,6 @@
except ImportError:
_has_hf_hub = False

# Typing
from typing import Union

# Pytorch check
try:
import torch
Expand Down Expand Up @@ -113,7 +110,7 @@ def push_to_hf_hub(
create_pr: bool = False,
model_card: bool = True,
serialization: str = "safetensors",
save_embedding_model: Union[str, bool] = True,
save_embedding_model: str | bool = True,
save_ctfidf: bool = False,
):
"""Push your BERTopic model to a HuggingFace Hub.
Expand Down Expand Up @@ -450,9 +447,9 @@ def save_topics(model, path: str):
json.dump(topics, f, indent=2, cls=NumpyEncoder)


def load_cfg_from_json(json_file: Union[str, os.PathLike]):
def load_cfg_from_json(json_file: str | os.PathLike):
"""Load configuration from json."""
with open(json_file, "r", encoding="utf-8") as reader:
with open(json_file, encoding="utf-8") as reader:
text = reader.read()
return json.loads(text)

Expand All @@ -463,7 +460,7 @@ def default(self, obj):
return int(obj)
if isinstance(obj, np.floating):
return float(obj)
return super(NumpyEncoder, self).default(obj)
return super().default(obj)


def get_package_versions():
Expand Down
12 changes: 6 additions & 6 deletions bertopic/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from collections.abc import Iterable
from scipy.sparse import csr_matrix
from scipy.spatial.distance import squareform
from typing import Optional, Union, Tuple, Any
from typing import Any


class MyLogger:
Expand Down Expand Up @@ -142,7 +142,7 @@ def validate_distance_matrix(X, n_samples):
"distance matrix of shape (n*(n-1)/2,) or a "
"2-D square distance matrix of shape (n, n)."
"where n is the number of documents."
"Got a distance matrix of shape %s" % str(s)
f"Got a distance matrix of shape {s}"
)

# Make sure its entries are non-negative
Expand Down Expand Up @@ -177,11 +177,11 @@ def get_unique_distances(dists: np.array, noise_max=1e-7) -> np.array:


def select_topic_representation(
ctfidf_embeddings: Optional[Union[np.ndarray, csr_matrix]] = None,
embeddings: Optional[Union[np.ndarray, csr_matrix]] = None,
ctfidf_embeddings: np.ndarray | csr_matrix | None = None,
embeddings: np.ndarray | csr_matrix | None = None,
use_ctfidf: bool = True,
output_ndarray: bool = False,
) -> Tuple[np.ndarray, bool]:
) -> tuple[np.ndarray, bool]:
"""Select the topic representation.

Arguments:
Expand All @@ -199,7 +199,7 @@ def select_topic_representation(
The selected topic representation and a boolean indicating whether it is c-TF-IDF.
"""

def to_ndarray(array: Union[np.ndarray, csr_matrix]) -> np.ndarray:
def to_ndarray(array: np.ndarray | csr_matrix) -> np.ndarray:
if isinstance(array, csr_matrix):
return array.toarray()
return array
Expand Down
7 changes: 3 additions & 4 deletions bertopic/backend/_base.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import numpy as np
from typing import List


class BaseEmbedder:
Expand All @@ -18,7 +17,7 @@ def __init__(self, embedding_model=None, word_embedding_model=None):
self.embedding_model = embedding_model
self.word_embedding_model = word_embedding_model

def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
def embed(self, documents: list[str], verbose: bool = False) -> np.ndarray:
"""Embed a list of n documents/words into an n-dimensional
matrix of embeddings.

Expand All @@ -32,7 +31,7 @@ def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
"""
pass

def embed_words(self, words: List[str], verbose: bool = False) -> np.ndarray:
def embed_words(self, words: list[str], verbose: bool = False) -> np.ndarray:
"""Embed a list of n words into an n-dimensional
matrix of embeddings.

Expand All @@ -47,7 +46,7 @@ def embed_words(self, words: List[str], verbose: bool = False) -> np.ndarray:
"""
return self.embed(words, verbose)

def embed_documents(self, document: List[str], verbose: bool = False) -> np.ndarray:
def embed_documents(self, document: list[str], verbose: bool = False) -> np.ndarray:
"""Embed a list of n words into an n-dimensional
matrix of embeddings.

Expand Down
5 changes: 3 additions & 2 deletions bertopic/backend/_cohere.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import time
import numpy as np
from tqdm import tqdm
from typing import Any, List, Mapping
from typing import Any
from collections.abc import Mapping
from bertopic.backend import BaseEmbedder


Expand Down Expand Up @@ -60,7 +61,7 @@ def __init__(
else:
self.embed_kwargs["model"] = self.embedding_model

def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
def embed(self, documents: list[str], verbose: bool = False) -> np.ndarray:
"""Embed a list of n documents/words into an n-dimensional
matrix of embeddings.

Expand Down
3 changes: 1 addition & 2 deletions bertopic/backend/_fastembed.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import numpy as np
from typing import List
from fastembed import TextEmbedding

from bertopic.backend import BaseEmbedder
Expand Down Expand Up @@ -38,7 +37,7 @@ def __init__(self, embedding_model: str = "BAAI/bge-small-en-v1.5"):
"The supported TextEmbedding model list is here: https://qdrant.github.io/fastembed/examples/Supported_Models/"
)

def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
def embed(self, documents: list[str], verbose: bool = False) -> np.ndarray:
"""Embed a list of n documents/words into an n-dimensional
matrix of embeddings.

Expand Down
5 changes: 2 additions & 3 deletions bertopic/backend/_flair.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import numpy as np
from tqdm import tqdm
from typing import Union, List
from flair.data import Sentence
from flair.embeddings import DocumentEmbeddings, TokenEmbeddings, DocumentPoolEmbeddings

Expand Down Expand Up @@ -30,7 +29,7 @@ class FlairBackend(BaseEmbedder):
```
"""

def __init__(self, embedding_model: Union[TokenEmbeddings, DocumentEmbeddings]):
def __init__(self, embedding_model: TokenEmbeddings | DocumentEmbeddings):
super().__init__()

# Flair word embeddings
Expand All @@ -52,7 +51,7 @@ def __init__(self, embedding_model: Union[TokenEmbeddings, DocumentEmbeddings]):
"`roberta = TransformerDocumentEmbeddings('roberta-base')`"
)

def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
def embed(self, documents: list[str], verbose: bool = False) -> np.ndarray:
"""Embed a list of n documents/words into an n-dimensional
matrix of embeddings.

Expand Down
3 changes: 1 addition & 2 deletions bertopic/backend/_gensim.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import numpy as np
from tqdm import tqdm
from typing import List
from bertopic.backend import BaseEmbedder
from gensim.models.keyedvectors import Word2VecKeyedVectors

Expand Down Expand Up @@ -36,7 +35,7 @@ def __init__(self, embedding_model: Word2VecKeyedVectors):
"`ft = api.load('fasttext-wiki-news-subwords-300')`"
)

def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
def embed(self, documents: list[str], verbose: bool = False) -> np.ndarray:
"""Embed a list of n documents/words into an n-dimensional
matrix of embeddings.

Expand Down
3 changes: 1 addition & 2 deletions bertopic/backend/_hftransformers.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import numpy as np

from tqdm import tqdm
from typing import List
from torch.utils.data import Dataset
from sklearn.preprocessing import normalize
from transformers.pipelines import Pipeline
Expand Down Expand Up @@ -42,7 +41,7 @@ def __init__(self, embedding_model: Pipeline):
"pipeline('feature-extraction', model='distilbert-base-cased', device=0)"
)

def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
def embed(self, documents: list[str], verbose: bool = False) -> np.ndarray:
"""Embed a list of n documents/words into an n-dimensional
matrix of embeddings.

Expand Down
4 changes: 1 addition & 3 deletions bertopic/backend/_langchain.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
from typing import List

import numpy as np
from bertopic.backend import BaseEmbedder
from langchain_core.embeddings import Embeddings
Expand All @@ -25,7 +23,7 @@ class LangChainBackend(BaseEmbedder):
def __init__(self, embedding_model: Embeddings):
    """Wrap a LangChain embeddings object as a BERTopic backend.

    Arguments:
        embedding_model: The LangChain `Embeddings` instance to store on
                         this backend as `self.embedding_model`.
    """
    # Run the base initializer first so attributes it defines exist on this
    # backend too, then store the model.
    super().__init__()
    self.embedding_model = embedding_model

def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
def embed(self, documents: list[str], verbose: bool = False) -> np.ndarray:
"""Embed a list of n documents/words into an n-dimensional
matrix of embeddings.

Expand Down
5 changes: 2 additions & 3 deletions bertopic/backend/_model2vec.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import numpy as np
from typing import List, Union
from model2vec import StaticModel
from sklearn.feature_extraction.text import CountVectorizer

Expand Down Expand Up @@ -53,7 +52,7 @@ class Model2VecBackend(BaseEmbedder):

def __init__(
self,
embedding_model: Union[str, StaticModel],
embedding_model: str | StaticModel,
distill: bool = False,
distill_kwargs: dict = {},
distill_vectorizer: str | None = None,
Expand Down Expand Up @@ -87,7 +86,7 @@ def __init__(
"`model = StaticModel.from_pretrained('minishlab/potion-base-8M')`"
)

def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
def embed(self, documents: list[str], verbose: bool = False) -> np.ndarray:
"""Embed a list of n documents/words into an n-dimensional
matrix of embeddings.

Expand Down
11 changes: 5 additions & 6 deletions bertopic/backend/_multimodal.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import numpy as np
from PIL import Image
from tqdm import tqdm
from typing import List, Union
from sentence_transformers import SentenceTransformer

from bertopic.backend import BaseEmbedder
Expand Down Expand Up @@ -44,8 +43,8 @@ class MultiModalBackend(BaseEmbedder):

def __init__(
self,
embedding_model: Union[str, SentenceTransformer],
image_model: Union[str, SentenceTransformer] = None,
embedding_model: str | SentenceTransformer,
image_model: str | SentenceTransformer = None,
batch_size: int = 32,
):
super().__init__()
Expand Down Expand Up @@ -84,7 +83,7 @@ def __init__(
except: # noqa: E722
self.tokenizer = None

def embed(self, documents: List[str], images: List[str] | None = None, verbose: bool = False) -> np.ndarray:
def embed(self, documents: list[str], images: list[str] | None = None, verbose: bool = False) -> np.ndarray:
"""Embed a list of n documents/words or images into an n-dimensional
matrix of embeddings.

Expand Down Expand Up @@ -122,7 +121,7 @@ def embed(self, documents: List[str], images: List[str] | None = None, verbose:
elif image_embeddings is not None:
return image_embeddings

def embed_documents(self, documents: List[str], verbose: bool = False) -> np.ndarray:
def embed_documents(self, documents: list[str], verbose: bool = False) -> np.ndarray:
"""Embed a list of n documents/words into an n-dimensional
matrix of embeddings.

Expand All @@ -138,7 +137,7 @@ def embed_documents(self, documents: List[str], verbose: bool = False) -> np.nda
embeddings = self.embedding_model.encode(truncated_docs, show_progress_bar=verbose)
return embeddings

def embed_words(self, words: List[str], verbose: bool = False) -> np.ndarray:
def embed_words(self, words: list[str], verbose: bool = False) -> np.ndarray:
"""Embed a list of n words into an n-dimensional
matrix of embeddings.

Expand Down
5 changes: 3 additions & 2 deletions bertopic/backend/_openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
import openai
import numpy as np
from tqdm import tqdm
from typing import List, Mapping, Any
from typing import Any
from collections.abc import Mapping
from bertopic.backend import BaseEmbedder


Expand Down Expand Up @@ -51,7 +52,7 @@ def __init__(
elif not self.generator_kwargs.get("engine"):
self.generator_kwargs["model"] = self.embedding_model

def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
def embed(self, documents: list[str], verbose: bool = False) -> np.ndarray:
"""Embed a list of n documents/words into an n-dimensional
matrix of embeddings.

Expand Down
5 changes: 2 additions & 3 deletions bertopic/backend/_sentencetransformers.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import numpy as np
from typing import List, Union
from sentence_transformers import SentenceTransformer
from sentence_transformers.models import StaticEmbedding

Expand Down Expand Up @@ -50,7 +49,7 @@ class SentenceTransformerBackend(BaseEmbedder):
```
"""

def __init__(self, embedding_model: Union[str, SentenceTransformer], model2vec: bool = False):
def __init__(self, embedding_model: str | SentenceTransformer, model2vec: bool = False):
super().__init__()

self._hf_model = None
Expand All @@ -69,7 +68,7 @@ def __init__(self, embedding_model: Union[str, SentenceTransformer], model2vec:
"`model = SentenceTransformer('all-MiniLM-L6-v2')`"
)

def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
def embed(self, documents: list[str], verbose: bool = False) -> np.ndarray:
"""Embed a list of n documents/words into an n-dimensional
matrix of embeddings.

Expand Down
3 changes: 1 addition & 2 deletions bertopic/backend/_spacy.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import numpy as np
from tqdm import tqdm
from typing import List
from bertopic.backend import BaseEmbedder


Expand Down Expand Up @@ -61,7 +60,7 @@ def __init__(self, embedding_model):
"or create a nlp model using: `nlp = spacy.load('en_core_web_md')"
)

def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
def embed(self, documents: list[str], verbose: bool = False) -> np.ndarray:
"""Embed a list of n documents/words into an n-dimensional
matrix of embeddings.

Expand Down
3 changes: 1 addition & 2 deletions bertopic/backend/_use.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import numpy as np
from tqdm import tqdm
from typing import List

from bertopic.backend import BaseEmbedder

Expand Down Expand Up @@ -37,7 +36,7 @@ def __init__(self, embedding_model):
"`embedding_model = tensorflow_hub.load(path_to_model)`"
)

def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
def embed(self, documents: list[str], verbose: bool = False) -> np.ndarray:
"""Embed a list of n documents/words into an n-dimensional
matrix of embeddings.

Expand Down
5 changes: 2 additions & 3 deletions bertopic/backend/_word_doc.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import numpy as np
from typing import List
from bertopic.backend._base import BaseEmbedder
from bertopic.backend._utils import select_backend

Expand All @@ -13,7 +12,7 @@ def __init__(self, embedding_model, word_embedding_model):
self.embedding_model = select_backend(embedding_model)
self.word_embedding_model = select_backend(word_embedding_model)

def embed_words(self, words: List[str], verbose: bool = False) -> np.ndarray:
def embed_words(self, words: list[str], verbose: bool = False) -> np.ndarray:
"""Embed a list of n words into an n-dimensional
matrix of embeddings.

Expand All @@ -28,7 +27,7 @@ def embed_words(self, words: List[str], verbose: bool = False) -> np.ndarray:
"""
return self.word_embedding_model.embed(words, verbose)

def embed_documents(self, document: List[str], verbose: bool = False) -> np.ndarray:
def embed_documents(self, document: list[str], verbose: bool = False) -> np.ndarray:
"""Embed a list of n words into an n-dimensional
matrix of embeddings.

Expand Down
Loading