Skip to content

Commit 73545e5

Browse files
committed
fix: Handle duplicate texts correctly in embed_stream
Addresses Copilot review comment: duplicate texts caused incorrect embedding index assignment. Previously, when batch_texts contained duplicate texts, all embeddings for those duplicates would be assigned the same index (the index of the first occurrence) because list.index() always returns the first match. Now the code tracks used indices and assigns each embedding to the next unused occurrence of its text in the batch, ensuring correct index assignment even with duplicate texts. Example: texts = ['hello', 'world', 'hello']. Before: indices would be [0, 1, 0] (wrong). After: indices are [0, 1, 2] (correct).
1 parent 7c198ea commit 73545e5

1 file changed

Lines changed: 16 additions & 4 deletions

File tree

src/cohere/base_client.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1209,7 +1209,7 @@ def embed_stream(
12091209
for batch_start in range(0, len(texts_list), batch_size):
12101210
batch_end = min(batch_start + batch_size, len(texts_list))
12111211
batch_texts = texts_list[batch_start:batch_end]
1212-
1212+
12131213
# Get response for this batch
12141214
response = self._raw_client.embed(
12151215
texts=batch_texts,
@@ -1219,15 +1219,27 @@ def embed_stream(
12191219
truncate=truncate,
12201220
request_options=request_options,
12211221
)
1222-
1222+
12231223
# Parse embeddings from response incrementally
12241224
parser = StreamingEmbedParser(response._response, batch_texts)
1225+
# Track used indices to handle duplicate texts correctly
1226+
used_batch_indices = set()
1227+
12251228
for embedding in parser.iter_embeddings():
12261229
# The parser sets embedding.text correctly for multiple embedding types
12271230
# Adjust the global index based on text position in batch
12281231
if embedding.text and embedding.text in batch_texts:
1229-
text_idx_in_batch = batch_texts.index(embedding.text)
1230-
embedding.index = batch_start + text_idx_in_batch
1232+
# Find the next unused occurrence of this text in the batch
1233+
# This handles duplicate texts correctly
1234+
text_idx_in_batch = None
1235+
for idx, text in enumerate(batch_texts):
1236+
if text == embedding.text and idx not in used_batch_indices:
1237+
text_idx_in_batch = idx
1238+
used_batch_indices.add(idx)
1239+
break
1240+
1241+
if text_idx_in_batch is not None:
1242+
embedding.index = batch_start + text_idx_in_batch
12311243
yield embedding
12321244

12331245
def rerank(

0 commit comments

Comments
 (0)