diff --git a/FlagEmbedding/abc/evaluation/arguments.py b/FlagEmbedding/abc/evaluation/arguments.py index c7795cbc0..a8da21ebf 100644 --- a/FlagEmbedding/abc/evaluation/arguments.py +++ b/FlagEmbedding/abc/evaluation/arguments.py @@ -87,7 +87,7 @@ class AbsEvalModelArgs: metadata={"help": "The embedder name or path.", "required": True} ) embedder_model_class: Optional[str] = field( - default=None, metadata={"help": "The embedder model class. Available classes: ['encoder-only-base', 'encoder-only-m3', 'decoder-only-base', 'decoder-only-icl']. Default: None. For the custom model, you need to specifiy the model class.", "choices": ["encoder-only-base", "encoder-only-m3", "decoder-only-base", "decoder-only-icl"]} + default=None, metadata={"help": "The embedder model class. Available classes: ['encoder-only-base', 'encoder-only-m3', 'decoder-only-base', 'decoder-only-icl', 'decoder-only-pseudo_moe']. Default: None. For the custom model, you need to specifiy the model class.", "choices": ["encoder-only-base", "encoder-only-m3", "decoder-only-base", "decoder-only-icl", "decoder-only-pseudo_moe"]} ) normalize_embeddings: bool = field( default=True, metadata={"help": "whether to normalize the embeddings"} @@ -143,6 +143,9 @@ class AbsEvalModelArgs: cache_dir: str = field( default=None, metadata={"help": "Cache directory for models."} ) + domain_for_pseudo_moe: Optional[str] = field( + default=None, metadata={"help": "Domain used by decoder-only-pseudo_moe model, e.g. 
general/coding/reasoning."} + ) # ================ for inference =============== embedder_batch_size: int = field( default=3000, metadata={"help": "Batch size for inference."} diff --git a/FlagEmbedding/abc/evaluation/runner.py b/FlagEmbedding/abc/evaluation/runner.py index be2aa98ac..bb58ab26f 100644 --- a/FlagEmbedding/abc/evaluation/runner.py +++ b/FlagEmbedding/abc/evaluation/runner.py @@ -58,6 +58,7 @@ def get_models(model_args: AbsEvalModelArgs) -> Tuple[AbsEmbedder, Union[AbsRera examples_instruction_format=model_args.examples_instruction_format, trust_remote_code=model_args.trust_remote_code, cache_dir=model_args.cache_dir, + domain_for_pseudo_moe=model_args.domain_for_pseudo_moe, batch_size=model_args.embedder_batch_size, query_max_length=model_args.embedder_query_max_length, passage_max_length=model_args.embedder_passage_max_length, diff --git a/FlagEmbedding/inference/__init__.py b/FlagEmbedding/inference/__init__.py index a81c9b4cc..ef33b9343 100644 --- a/FlagEmbedding/inference/__init__.py +++ b/FlagEmbedding/inference/__init__.py @@ -2,7 +2,7 @@ from .auto_reranker import FlagAutoReranker from .embedder import ( FlagModel, BGEM3FlagModel, - FlagICLModel, FlagLLMModel, + FlagICLModel, FlagLLMModel, FlagPseudoMoEModel, EmbedderModelClass ) from .reranker import ( @@ -21,6 +21,7 @@ "BGEM3FlagModel", "FlagICLModel", "FlagLLMModel", + "FlagPseudoMoEModel", "FlagReranker", "FlagLLMReranker", "LayerWiseFlagLLMReranker", diff --git a/FlagEmbedding/inference/embedder/__init__.py b/FlagEmbedding/inference/embedder/__init__.py index ca95bd993..65e5d9596 100644 --- a/FlagEmbedding/inference/embedder/__init__.py +++ b/FlagEmbedding/inference/embedder/__init__.py @@ -1,5 +1,5 @@ from .encoder_only import FlagModel, BGEM3FlagModel -from .decoder_only import FlagICLModel, FlagLLMModel +from .decoder_only import FlagICLModel, FlagLLMModel, FlagPseudoMoEModel from .model_mapping import EmbedderModelClass __all__ = [ @@ -7,5 +7,6 @@ "BGEM3FlagModel", "FlagICLModel", 
class PseudoMoELLMEmbedder(BaseLLMEmbedder):
    """Decoder-only embedder for pseudo MoE checkpoints.

    This class follows the same behavior as :class:`BaseLLMEmbedder`, but supports
    selecting an active domain (e.g. ``general``, ``coding``, ``reasoning``) during
    inference. The domain is applied through ``model.set_domain`` when the loaded
    model exposes that method, and it is also offered to the forward pass as a
    ``domain`` keyword; if the forward pass rejects the keyword with a
    ``TypeError`` the model is called without it.

    Args:
        model_name_or_path (str): If it's a path to a local model, it loads the model from the path. Otherwise tries to download and
            load a model from HuggingFace Hub with the name.
        normalize_embeddings (bool, optional): If True, normalize the embedding vector. Defaults to :data:`True`.
        use_fp16 (bool, optional): Forwarded to :class:`BaseLLMEmbedder` (half-precision inference flag).
            Defaults to :data:`False`.
        use_bf16 (bool, optional): Forwarded to :class:`BaseLLMEmbedder` (bfloat16 inference flag).
            Defaults to :data:`True`.
        query_instruction_for_retrieval (Optional[str], optional): Query instruction for retrieval tasks, which will be used
            with :attr:`query_instruction_format`. Defaults to :data:`None`.
        query_instruction_format (str, optional): The template for :attr:`query_instruction_for_retrieval`.
            Defaults to :data:`"Instruct: {}\\nQuery: {}"`.
        devices (Optional[Union[str, List[str]]], optional): Devices to use for model inference. Defaults to :data:`None`.
        trust_remote_code (bool, optional): trust_remote_code for HF datasets or models. Defaults to :data:`True`.
        cache_dir (Optional[str], optional): Cache directory for the model. Defaults to :data:`None`.
        batch_size (int, optional): Batch size for inference. Defaults to :data:`256`.
        query_max_length (int, optional): Maximum length for query. Defaults to :data:`512`.
        passage_max_length (int, optional): Maximum length for passage. Defaults to :data:`512`.
        convert_to_numpy (bool, optional): If True, the output embedding will be a Numpy array. Otherwise, it will be a Torch Tensor.
            Defaults to :data:`True`.
        truncate_dim (Optional[int], optional): Forwarded to :class:`BaseLLMEmbedder`; embeddings are passed through
            ``self._truncate_embeddings`` before normalization. Defaults to :data:`None`.
        domain_for_pseudo_moe (Optional[str], optional): Default active domain for the pseudo-MoE model
            (e.g., "general", "coding", or "reasoning"). When :data:`None`, no domain is set or passed unless an
            encode call supplies one. Defaults to :data:`None`.
        **kwargs: Extra keyword arguments forwarded unchanged to :class:`BaseLLMEmbedder`.

    Attributes:
        DEFAULT_POOLING_METHOD: The default pooling method when running the model.
    """
    DEFAULT_POOLING_METHOD = "last_token"

    def __init__(
        self,
        model_name_or_path: str,
        normalize_embeddings: bool = True,
        use_fp16: bool = False,
        use_bf16: bool = True,
        query_instruction_for_retrieval: Optional[str] = None,
        query_instruction_format: str = "Instruct: {}\nQuery: {}",
        devices: Optional[Union[str, List[str]]] = None,
        trust_remote_code: bool = True,
        cache_dir: Optional[str] = None,
        batch_size: int = 256,
        query_max_length: int = 512,
        passage_max_length: int = 512,
        convert_to_numpy: bool = True,
        truncate_dim: Optional[int] = None,
        domain_for_pseudo_moe: Optional[str] = None,
        **kwargs: Any,
    ):
        # Instance-level default domain; individual encode calls may override it
        # (see _resolve_domain).
        self.domain_for_pseudo_moe = domain_for_pseudo_moe
        super().__init__(
            model_name_or_path=model_name_or_path,
            normalize_embeddings=normalize_embeddings,
            use_fp16=use_fp16,
            use_bf16=use_bf16,
            query_instruction_for_retrieval=query_instruction_for_retrieval,
            query_instruction_format=query_instruction_format,
            devices=devices,
            trust_remote_code=trust_remote_code,
            cache_dir=cache_dir,
            batch_size=batch_size,
            query_max_length=query_max_length,
            passage_max_length=passage_max_length,
            convert_to_numpy=convert_to_numpy,
            truncate_dim=truncate_dim,
            **kwargs,
        )

    def _resolve_domain(self, kwargs: Any) -> Optional[str]:
        """Pick the domain for one encode call and strip domain keys from ``kwargs``.

        Precedence: ``kwargs["domain_for_pseudo_moe"]``, then ``kwargs["domain"]``,
        then the instance default :attr:`domain_for_pseudo_moe`.

        Args:
            kwargs: The encode call's keyword-argument dict. It is mutated in place:
                both ``"domain_for_pseudo_moe"`` and ``"domain"`` are always removed,
                because the remaining entries are later forwarded to the tokenizer
                and neither key is a tokenizer argument.

        Returns:
            Optional[str]: The selected domain, or :data:`None` when none was given.
        """
        # Pop both keys unconditionally so that neither leaks into tokenizer calls,
        # even when the caller supplied both.
        explicit = kwargs.pop("domain_for_pseudo_moe", None)
        alias = kwargs.pop("domain", None)
        if explicit is not None:
            return explicit
        if alias is not None:
            return alias
        return self.domain_for_pseudo_moe

    def _forward_last_hidden_state(self, inputs_batch: Any, forward_kwargs: dict) -> torch.Tensor:
        """Run the model on one padded batch and return ``last_hidden_state``.

        If the forward pass raises ``TypeError`` while a ``domain`` keyword is being
        passed, the keyword is dropped from ``forward_kwargs`` (in place) and the
        call is retried once. Dropping it in place means later batches do not
        repeat the failing attempt. A ``TypeError`` raised when no ``domain``
        keyword is involved is propagated unchanged.
        """
        try:
            return self.model(**inputs_batch, **forward_kwargs).last_hidden_state
        except TypeError:
            if "domain" not in forward_kwargs:
                raise
            forward_kwargs.pop("domain")
            return self.model(**inputs_batch, **forward_kwargs).last_hidden_state

    @torch.no_grad()
    def encode_single_device(
        self,
        sentences: Union[List[str], str],
        batch_size: int = 256,
        max_length: int = 512,
        convert_to_numpy: bool = True,
        device: Optional[str] = None,
        **kwargs: Any
    ) -> Union[np.ndarray, torch.Tensor]:
        """Encode sentences into embeddings on a single device.

        Args:
            sentences (Union[List[str], str]): A sentence or a list of sentences to encode.
            batch_size (int, optional): Initial batch size. It is reduced to 3/4 of its
                value each time the warm-up forward pass raises ``RuntimeError``.
                Defaults to :data:`256`.
            max_length (int, optional): Truncation length passed to the tokenizer.
                Defaults to :data:`512`.
            convert_to_numpy (bool, optional): If True, return a Numpy array; otherwise a
                Torch tensor. Defaults to :data:`True`.
            device (Optional[str], optional): Target device. When :data:`None`, the first
                entry of ``self.target_devices`` is used. Defaults to :data:`None`.
            **kwargs: ``domain_for_pseudo_moe`` / ``domain`` select the active domain for
                this call and are removed; everything else is forwarded to the tokenizer
                call and to ``tokenizer.pad``.

        Returns:
            Union[np.ndarray, torch.Tensor]: One embedding per input sentence, in the
            input order. When ``sentences`` is a single string, only that sentence's
            embedding (the first row) is returned.

        Raises:
            RuntimeError: If the warm-up forward pass still fails once the batch size
                can no longer be reduced.
        """
        if device is None:
            device = self.target_devices[0]

        if device == "cpu":
            # Cast the model to float32 when running on CPU.
            self.model.float()

        self.model.to(device)
        self.model.eval()

        input_was_string = False
        if isinstance(sentences, str):
            sentences = [sentences]
            input_was_string = True

        # Must run before any tokenizer call: it removes the domain keys from kwargs.
        domain = self._resolve_domain(kwargs)
        if domain is not None and hasattr(self.model, "set_domain"):
            self.model.set_domain(domain)

        model_forward_kwargs = {"return_dict": True}
        if domain is not None:
            model_forward_kwargs["domain"] = domain

        # tokenize without padding to get the correct length
        all_inputs = []
        for start_index in range(0, len(sentences), batch_size):
            sentences_batch = sentences[start_index:start_index + batch_size]
            inputs_batch = self.tokenizer(
                sentences_batch,
                truncation=True,
                max_length=max_length,
                **kwargs
            )
            # Split the batched encoding into one dict per sentence.
            inputs_batch = [{
                k: inputs_batch[k][i] for k in inputs_batch.keys()
            } for i in range(len(sentences_batch))]
            all_inputs.extend(inputs_batch)

        # sort by length for less padding
        length_sorted_idx = np.argsort([-len(x['input_ids']) for x in all_inputs])
        all_inputs_sorted = [all_inputs[i] for i in length_sorted_idx]

        # adjust batch size: probe with the longest sequences (sorted first) and
        # shrink the batch until one forward pass succeeds
        while True:
            try:
                inputs_batch = self.tokenizer.pad(
                    all_inputs_sorted[: batch_size],
                    padding=True,
                    return_tensors='pt',
                    **kwargs
                ).to(device)
                last_hidden_state = self._forward_last_hidden_state(inputs_batch, model_forward_kwargs)
                _ = last_token_pool(last_hidden_state, inputs_batch['attention_mask'])
                break
            except RuntimeError:
                # torch.cuda.OutOfMemoryError is a RuntimeError subclass, so this one
                # handler covers it. Once the batch cannot shrink further, surface the
                # real error instead of looping on to an empty batch.
                if batch_size <= 1:
                    raise
                batch_size = batch_size * 3 // 4

        # encode
        all_embeddings = []
        for start_index in range(0, len(sentences), batch_size):
            inputs_batch = all_inputs_sorted[start_index:start_index + batch_size]
            inputs_batch = self.tokenizer.pad(
                inputs_batch,
                padding=True,
                return_tensors='pt',
                **kwargs
            ).to(device)
            last_hidden_state = self._forward_last_hidden_state(inputs_batch, model_forward_kwargs)
            embeddings = last_token_pool(last_hidden_state, inputs_batch['attention_mask'])
            embeddings = self._truncate_embeddings(embeddings)
            # Replace NaN with 0 and +/-inf with +/-1e4 before normalization.
            embeddings = torch.nan_to_num(embeddings, nan=0.0, posinf=1e4, neginf=-1e4)
            if self.normalize_embeddings:
                embeddings = torch.nn.functional.normalize(embeddings.float(), dim=-1)
            embeddings = cast(torch.Tensor, embeddings)

            if convert_to_numpy:
                embeddings = self._convert_to_numpy(embeddings, device=device)
            all_embeddings.append(embeddings)

        if convert_to_numpy:
            all_embeddings = np.concatenate(all_embeddings, axis=0)
        else:
            all_embeddings = torch.cat(all_embeddings, dim=0)

        # adjust the order of embeddings: undo the length sort
        all_embeddings = all_embeddings[np.argsort(length_sorted_idx)]

        if input_was_string:
            return all_embeddings[0]
        return all_embeddings
def test_auto_pseudo_moe_multi_devices():
    """Encode sample queries and passages on two devices and print their scores.

    The checkpoint is loaded through ``FlagAutoModel.from_finetuned`` with the
    ``decoder-only-pseudo_moe`` model class and the ``coding`` domain. The
    top-left 2x2 corner of the query/passage score matrix is printed.
    """
    checkpoint = "geevec-ai/geevec-embeddings-1.0-lite"

    load_options = dict(
        model_class="decoder-only-pseudo_moe",
        query_instruction_for_retrieval="Given a question, retrieve passages that answer the question.",
        query_instruction_format="Instruct: {}\nQuery: {}",
        domain_for_pseudo_moe="coding",
        use_fp16=False,
        use_bf16=True,
        trust_remote_code=True,
        # if you don't have GPUs, you can use ["cpu", "cpu"]
        devices=["cuda:0", "cuda:1"],
        cache_dir=os.getenv("HF_HUB_CACHE", None),
    )
    embedder = FlagAutoModel.from_finetuned(checkpoint, **load_options)

    query_texts = 100 * [
        "how much protein should a female eat",
        "summit define",
    ]
    passage_texts = 100 * [
        "As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day.",
        "Definition of summit for English Language Learners: the highest point of a mountain; the highest level; a meeting between leaders.",
    ]

    query_vectors = embedder.encode_queries(query_texts)
    passage_vectors = embedder.encode_corpus(passage_texts)

    score_matrix = query_vectors @ passage_vectors.T
    print(score_matrix[:2, :2])


if __name__ == "__main__":
    test_auto_pseudo_moe_multi_devices()

    # Reference values to compare against the matrix printed above.
    for line in (
        "--------------------------------",
        "Expected Output:",
        "[[0.700 0.246]\n [0.158 0.654]]",
    ):
        print(line)
def test_auto_pseudo_moe_single_device():
    """Encode sample queries and passages on one device and print their scores.

    The checkpoint is loaded through ``FlagAutoModel.from_finetuned`` with the
    ``decoder-only-pseudo_moe`` model class and the ``reasoning`` domain. The
    top-left 2x2 corner of the query/passage score matrix is printed.
    """
    checkpoint = "geevec-ai/geevec-embeddings-1.0-lite"
    instruction = "Given a question, retrieve passages that answer the question."

    embedder = FlagAutoModel.from_finetuned(
        checkpoint,
        model_class="decoder-only-pseudo_moe",
        query_instruction_for_retrieval=instruction,
        query_instruction_format="Instruct: {}\nQuery: {}",
        domain_for_pseudo_moe="reasoning",
        use_fp16=False,
        use_bf16=True,
        trust_remote_code=True,
        devices="cuda:0",  # if you don't have a GPU, you can use "cpu"
        cache_dir=os.getenv("HF_HUB_CACHE", None),
    )

    base_queries = [
        "how much protein should a female eat",
        "summit define",
    ]
    base_passages = [
        "As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day.",
        "Definition of summit for English Language Learners: the highest point of a mountain; the highest level; a meeting between leaders.",
    ]

    # Repeat the two samples ten times each to form the batches.
    query_vectors = embedder.encode_queries(base_queries * 10)
    passage_vectors = embedder.encode_corpus(base_passages * 10)

    score_matrix = query_vectors @ passage_vectors.T
    print(score_matrix[:2, :2])


if __name__ == "__main__":
    test_auto_pseudo_moe_single_device()

    # Reference values to compare against the matrix printed above.
    for line in (
        "--------------------------------",
        "Expected Output:",
        "[[0.844 0.466 ]\n [0.395 0.684 ]]",
    ):
        print(line)
def test_pseudo_moe_multi_devices():
    """Encode sample queries and passages on two devices and print their scores.

    The checkpoint is loaded directly with ``FlagPseudoMoEModel`` using the
    ``reasoning`` domain. The top-left 2x2 corner of the query/passage score
    matrix is printed.
    """
    checkpoint = "geevec-ai/geevec-embeddings-1.0-lite"

    model_options = dict(
        query_instruction_for_retrieval="Given a question, retrieve passages that answer the question.",
        query_instruction_format="Instruct: {}\nQuery: {}",
        domain_for_pseudo_moe="reasoning",
        use_fp16=False,
        use_bf16=True,
        trust_remote_code=True,
        # if you don't have GPUs, you can use ["cpu", "cpu"]
        devices=["cuda:0", "cuda:1"],
        cache_dir=os.getenv("HF_HUB_CACHE", None),
    )
    embedder = FlagPseudoMoEModel(checkpoint, **model_options)

    query_texts = 100 * [
        "how much protein should a female eat",
        "summit define",
    ]
    passage_texts = 100 * [
        "As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day.",
        "Definition of summit for English Language Learners: the highest point of a mountain; the highest level; a meeting between leaders.",
    ]

    query_vectors = embedder.encode_queries(query_texts)
    passage_vectors = embedder.encode_corpus(passage_texts)

    score_matrix = query_vectors @ passage_vectors.T
    print(score_matrix[:2, :2])


if __name__ == "__main__":
    test_pseudo_moe_multi_devices()

    # Reference values to compare against the matrix printed above.
    for line in (
        "--------------------------------",
        "Expected Output:",
        "[[0.844 0.466 ]\n [0.395 0.684 ]]",
    ):
        print(line)
def test_pseudo_moe_single_device():
    """Encode sample queries and passages on one device and print their scores.

    The checkpoint is loaded directly with ``FlagPseudoMoEModel`` using the
    ``coding`` domain. The top-left 2x2 corner of the query/passage score
    matrix is printed.
    """
    checkpoint = "geevec-ai/geevec-embeddings-1.0-lite"
    instruction = "Given a question, retrieve passages that answer the question."

    embedder = FlagPseudoMoEModel(
        checkpoint,
        query_instruction_for_retrieval=instruction,
        query_instruction_format="Instruct: {}\nQuery: {}",
        domain_for_pseudo_moe="coding",
        use_fp16=False,
        use_bf16=True,
        trust_remote_code=True,
        devices="cuda:0",  # if you don't have a GPU, you can use "cpu"
        cache_dir=os.getenv("HF_HUB_CACHE", None),
    )

    base_queries = [
        "how much protein should a female eat",
        "summit define",
    ]
    base_passages = [
        "As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day.",
        "Definition of summit for English Language Learners: the highest point of a mountain; the highest level; a meeting between leaders.",
    ]

    # Repeat the two samples ten times each to form the batches.
    query_vectors = embedder.encode_queries(base_queries * 10)
    passage_vectors = embedder.encode_corpus(base_passages * 10)

    score_matrix = query_vectors @ passage_vectors.T
    print(score_matrix[:2, :2])


if __name__ == "__main__":
    test_pseudo_moe_single_device()

    # Reference values to compare against the matrix printed above.
    for line in (
        "--------------------------------",
        "Expected Output:",
        "[[0.700 0.246]\n [0.158 0.654]]",
    ):
        print(line)
devices=model_args.devices, + examples_for_task=model_args.examples_for_task, + examples_instruction_format=model_args.examples_instruction_format, + trust_remote_code=model_args.trust_remote_code, + cache_dir=model_args.cache_dir, + batch_size=model_args.embedder_batch_size, + query_max_length=model_args.embedder_query_max_length, + passage_max_length=model_args.embedder_passage_max_length, + domain_for_pseudo_moe=model_args.domain_for_pseudo_moe, + ) else: raise ValueError(f"Invalid model class: {model_args.embedder_model_class}") embedder.model.config._name_or_path = model_args.embedder_name_or_path