From 474ae691a28ca1b6809616f1e2d10a934d2b2f04 Mon Sep 17 00:00:00 2001
From: Bryce Meyer
Date: Thu, 27 Jun 2024 23:19:57 +0200
Subject: [PATCH 1/7] added model to model list

---
 transformer_lens/loading_from_pretrained.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/transformer_lens/loading_from_pretrained.py b/transformer_lens/loading_from_pretrained.py
index 15fc5e7b2..aa53190fc 100644
--- a/transformer_lens/loading_from_pretrained.py
+++ b/transformer_lens/loading_from_pretrained.py
@@ -193,6 +193,7 @@
     "google-t5/t5-base",
     "google-t5/t5-large",
     "ai-forever/mGPT",
+    "baichuan-inc/Baichuan-13B-Base",
 ]
 """Official model names for models on HuggingFace."""
 
@@ -1204,6 +1205,22 @@ def convert_hf_model_config(model_name: str, **kwargs):
             "use_attn_scale": False,
             "tie_word_embeddings": hf_config.tie_word_embeddings,
         }
+    elif "Baichuan-13B" in official_model_name:
+        cfg_dict = {
+            "d_model": hf_config.hidden_size,
+            "d_head": hf_config.hidden_size // hf_config.num_attention_heads,
+            "n_heads": hf_config.num_attention_heads,
+            "d_mlp": hf_config.intermediate_size,
+            "n_layers": hf_config.num_hidden_layers,
+            "n_ctx": 2048,  # Capped due to HF Tokenizer Constraints
+            "d_vocab": hf_config.vocab_size,
+            "eps": hf_config.rms_norm_eps,
+            "act_fn": hf_config.hidden_act,
+            "initializer_range": hf_config.initializer_range,
+            "normalization_type": "RMS",
+            "post_embedding_ln": True,
+            "positional_embedding_type": "alibi",
+        }
     else:
         raise NotImplementedError(f"{architecture} is not currently supported.")
     # All of these models use LayerNorm

From 3630de18847bf7b240749466b18a1ff26924f3c7 Mon Sep 17 00:00:00 2001
From: Bryce Meyer
Date: Fri, 28 Jun 2024 00:01:28 +0200
Subject: [PATCH 2/7] added alias

---
 transformer_lens/loading_from_pretrained.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/transformer_lens/loading_from_pretrained.py b/transformer_lens/loading_from_pretrained.py
index aa53190fc..13d972199 100644
--- a/transformer_lens/loading_from_pretrained.py
+++ b/transformer_lens/loading_from_pretrained.py
@@ -612,6 +612,7 @@
     "google-t5/t5-base": ["t5-base"],
     "google-t5/t5-large": ["t5-large"],
     "ai-forever/mGPT": ["mGPT"],
+    "baichuan-inc/Baichuan-13B-Base": ["Baichuan-13B-Base"],
 }
 """Model aliases for models on HuggingFace."""
 
@@ -1220,6 +1221,7 @@ def convert_hf_model_config(model_name: str, **kwargs):
             "normalization_type": "RMS",
             "post_embedding_ln": True,
             "positional_embedding_type": "alibi",
+            "tie_word_embeddings": hf_config.tie_word_embeddings,
         }
     else:
         raise NotImplementedError(f"{architecture} is not currently supported.")
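With the model name and alias registered, resolution should work end to end before any weights are touched. A minimal smoke test for these two patches (a sketch, assuming this branch is installed; alias lookup is a pure table lookup, so it runs offline):

    import transformer_lens.loading_from_pretrained as loading

    # Resolve the short alias added above to the official HF name.
    name = loading.get_official_model_name("Baichuan-13B-Base")
    assert name == "baichuan-inc/Baichuan-13B-Base"
    assert name in loading.OFFICIAL_MODEL_NAMES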
From 5ef03f628c3ec1038d137a05fa087dbdf2011ce5 Mon Sep 17 00:00:00 2001
From: Bryce Meyer
Date: Fri, 28 Jun 2024 01:20:09 +0200
Subject: [PATCH 3/7] added baichuan weights

---
 transformer_lens/HookedTransformer.py         | 20 +++---
 transformer_lens/loading_from_pretrained.py   |  8 ++-
 .../pretrained/weight_conversions/__init__.py |  1 +
 .../pretrained/weight_conversions/baichuan.py | 63 +++++++++++++++++++
 4 files changed, 82 insertions(+), 10 deletions(-)
 create mode 100644 transformer_lens/pretrained/weight_conversions/baichuan.py

diff --git a/transformer_lens/HookedTransformer.py b/transformer_lens/HookedTransformer.py
index 35d32c327..d484f1236 100644
--- a/transformer_lens/HookedTransformer.py
+++ b/transformer_lens/HookedTransformer.py
@@ -137,19 +137,21 @@ def __init__(
             else:
                 # Hugging Face defaults to use_fast to True
                 use_fast = True
-            # Phi model's fast tokenizer does not support adding a BOS token, use_fast
+            # The Phi & Baichuan fast tokenizers do not support adding a BOS token, so use_fast
             # should be False
-            if "phi" in self.cfg.tokenizer_name.lower():
+            tokenizer_name = self.cfg.tokenizer_name.lower()
+            if "phi" in tokenizer_name or "bai" in tokenizer_name:
                 use_fast = False
             huggingface_token = os.environ.get("HF_TOKEN", None)
+            tokenizer = AutoTokenizer.from_pretrained(
+                self.cfg.tokenizer_name,
+                # add_bos_token=True,
+                trust_remote_code=self.cfg.trust_remote_code,
+                use_fast=use_fast,
+                token=huggingface_token,
+            )
             self.set_tokenizer(
-                AutoTokenizer.from_pretrained(
-                    self.cfg.tokenizer_name,
-                    add_bos_token=True,
-                    trust_remote_code=self.cfg.trust_remote_code,
-                    use_fast=use_fast,
-                    token=huggingface_token,
-                ),
+                tokenizer,
                 default_padding_side=default_padding_side,
             )
         else:
diff --git a/transformer_lens/loading_from_pretrained.py b/transformer_lens/loading_from_pretrained.py
index 1174b7299..ddc0f3124 100644
--- a/transformer_lens/loading_from_pretrained.py
+++ b/transformer_lens/loading_from_pretrained.py
@@ -22,6 +22,7 @@
 import transformer_lens.utils as utils
 from transformer_lens.HookedTransformerConfig import HookedTransformerConfig
 from transformer_lens.pretrained.weight_conversions import (
+    convert_baichuan_weights,
     convert_bert_weights,
     convert_bloom_weights,
     convert_coder_weights,
@@ -214,6 +215,7 @@
     "google-t5/t5-base",
     "google-t5/t5-large",
     "ai-forever/mGPT",
     "baichuan-inc/Baichuan-13B-Base",
+    "baichuan-inc/Baichuan-13B-Chat",
 ]
 """Official model names for models on HuggingFace."""
@@ -633,6 +635,7 @@
     "google-t5/t5-base": ["t5-base"],
     "google-t5/t5-large": ["t5-large"],
     "ai-forever/mGPT": ["mGPT"],
     "baichuan-inc/Baichuan-13B-Base": ["Baichuan-13B-Base"],
+    "baichuan-inc/Baichuan-13B-Chat": ["Baichuan-13B-Chat"],
 }
 """Model aliases for models on HuggingFace."""
@@ -1226,7 +1229,7 @@ def convert_hf_model_config(model_name: str, **kwargs):
             "use_attn_scale": False,
             "tie_word_embeddings": hf_config.tie_word_embeddings,
         }
-    elif "Baichuan-13B" in official_model_name:
+    elif architecture == "BaichuanForCausalLM":
         cfg_dict = {
             "d_model": hf_config.hidden_size,
             "d_head": hf_config.hidden_size // hf_config.num_attention_heads,
@@ -1236,6 +1239,7 @@ def convert_hf_model_config(model_name: str, **kwargs):
             "n_ctx": 2048,  # Capped due to HF Tokenizer Constraints
             "d_vocab": hf_config.vocab_size,
             "eps": hf_config.rms_norm_eps,
+            "trust_remote_code": True,
             "act_fn": hf_config.hidden_act,
             "initializer_range": hf_config.initializer_range,
             "normalization_type": "RMS",
             "post_embedding_ln": True,
             "positional_embedding_type": "alibi",
@@ -1604,6 +1608,8 @@ def get_pretrained_state_dict(
             state_dict = convert_neox_weights(hf_model, cfg)
         elif cfg.original_architecture == "LlamaForCausalLM":
             state_dict = convert_llama_weights(hf_model, cfg)
+        elif cfg.original_architecture == "BaichuanForCausalLM":
+            state_dict = convert_baichuan_weights(hf_model, cfg)
         elif cfg.original_architecture == "BertForMaskedLM":
             state_dict = convert_bert_weights(hf_model, cfg)
         elif cfg.original_architecture == "T5ForConditionalGeneration":
diff --git a/transformer_lens/pretrained/weight_conversions/__init__.py b/transformer_lens/pretrained/weight_conversions/__init__.py
index b13850ee0..6541fde6c 100644
--- a/transformer_lens/pretrained/weight_conversions/__init__.py
+++ b/transformer_lens/pretrained/weight_conversions/__init__.py
@@ -7,6 +7,7 @@
 from .bert import convert_bert_weights
 from .mistral import convert_mistral_weights
 from .mixtral import convert_mixtral_weights
+from .baichuan import convert_baichuan_weights
 from .bloom import convert_bloom_weights
 from .coder import convert_coder_weights
 from .qwen import convert_qwen_weights
diff --git a/transformer_lens/pretrained/weight_conversions/baichuan.py b/transformer_lens/pretrained/weight_conversions/baichuan.py
new file mode 100644
index 000000000..64ed49bd0
--- /dev/null
+++ b/transformer_lens/pretrained/weight_conversions/baichuan.py
@@ -0,0 +1,63 @@
+import einops
+import torch
+from transformer_lens.HookedTransformerConfig import HookedTransformerConfig
+def convert_baichuan_weights(baichuan, cfg: HookedTransformerConfig):
+    state_dict = {}
+
+    state_dict["embed.W_E"] = baichuan.model.embed_tokens.weight
+
+    assert cfg.d_mlp is not None  # keep mypy happy
+
+    for l in range(cfg.n_layers):
+        state_dict[f"blocks.{l}.ln1.w"] = baichuan.model.layers[l].input_layernorm.weight
+
+        W = baichuan.model.layers[l].self_attn.W_pack.weight
+
+        W_split = W.T.reshape(cfg.d_model, 3, cfg.n_heads, cfg.d_head)  # W_pack stacks [Q; K; V] along its output dim
+
+        W_Q, W_K, W_V = W_split[:, 0], W_split[:, 1], W_split[:, 2]
+        W_Q = einops.rearrange(W_Q, "m n h -> n m h", n=cfg.n_heads)
+        W_K = einops.rearrange(W_K, "m n h -> n m h", n=cfg.n_heads)
+        W_V = einops.rearrange(W_V, "m n h -> n m h", n=cfg.n_heads)
+        state_dict[f"blocks.{l}.attn.W_Q"] = W_Q
+        state_dict[f"blocks.{l}.attn.W_K"] = W_K
+        state_dict[f"blocks.{l}.attn.W_V"] = W_V
+
+        state_dict[f"blocks.{l}.attn.b_Q"] = torch.zeros(
+            cfg.n_heads, cfg.d_head, dtype=cfg.dtype, device=W_Q.device
+        )
+        state_dict[f"blocks.{l}.attn.b_K"] = torch.zeros(
+            cfg.n_heads,
+            cfg.d_head,
+            dtype=cfg.dtype,
+            device=W_Q.device,
+        )
+        state_dict[f"blocks.{l}.attn.b_V"] = torch.zeros(
+            cfg.n_heads,
+            cfg.d_head,
+            dtype=cfg.dtype,
+            device=W_Q.device,
+        )
+
+        W_O = baichuan.model.layers[l].self_attn.o_proj.weight
+        W_O = einops.rearrange(W_O, "m (n h) -> n h m", n=cfg.n_heads)
+        state_dict[f"blocks.{l}.attn.W_O"] = W_O
+        state_dict[f"blocks.{l}.attn.b_O"] = torch.zeros(
+            cfg.d_model, dtype=cfg.dtype, device=W_O.device
+        )
+
+        state_dict[f"blocks.{l}.ln2.w"] = baichuan.model.layers[l].post_attention_layernorm.weight
+
+        state_dict[f"blocks.{l}.mlp.W_in"] = baichuan.model.layers[l].mlp.up_proj.weight.T
+        state_dict[f"blocks.{l}.mlp.W_gate"] = baichuan.model.layers[l].mlp.gate_proj.weight.T
+        state_dict[f"blocks.{l}.mlp.b_in"] = torch.zeros(cfg.d_mlp, dtype=W_O.dtype)
+
+        state_dict[f"blocks.{l}.mlp.W_out"] = baichuan.model.layers[l].mlp.down_proj.weight.T
+        state_dict[f"blocks.{l}.mlp.b_out"] = torch.zeros(cfg.d_model, dtype=W_O.dtype)
+
+    state_dict["ln_final.w"] = baichuan.model.norm.weight
+    state_dict["unembed.W_U"] = baichuan.lm_head.weight.T
+    state_dict["unembed.b_U"] = torch.zeros(cfg.d_vocab, dtype=W_O.dtype)
+
+    return state_dict
\ No newline at end of file
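The heart of the conversion is un-fusing W_pack: Baichuan stacks Q, K and V along the output dimension of a single linear layer (its modeling code unflattens the fused axis as (3, hidden_size)), so transposing and factoring that axis as (3, n_heads, d_head) recovers the per-head projections. A self-contained sketch on dummy tensors (tensor names here are illustrative, not from the PR):

    import einops
    import torch

    d_model, n_heads = 8, 2
    d_head = d_model // n_heads

    # Build a fused weight the way W_pack stores it: Q, K and V stacked
    # along the output dimension, shape (3 * d_model, d_model).
    W_q, W_k, W_v = (torch.randn(d_model, d_model) for _ in range(3))
    W_pack = torch.cat([W_q, W_k, W_v], dim=0)

    # Transpose to (d_model, 3 * d_model), then factor the fused axis as
    # (3, n_heads, d_head) so that index 0/1/2 selects Q/K/V.
    W_split = W_pack.T.reshape(d_model, 3, n_heads, d_head)
    assert torch.allclose(W_split[:, 0].reshape(d_model, d_model), W_q.T)

    # TransformerLens stores each projection as (n_heads, d_model, d_head).
    W_Q = einops.rearrange(W_split[:, 0], "m n h -> n m h")
    assert W_Q.shape == (n_heads, d_model, d_head)

The ordering of that factorization matters: putting n_heads before the Q/K/V axis would silently scramble heads across the three projections, so the assert above is a cheap way to validate the reshape.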
From c4c5776d391590c4f1c53ef7da1aede68fe916d4 Mon Sep 17 00:00:00 2001
From: Bryce Meyer
Date: Fri, 28 Jun 2024 01:43:35 +0200
Subject: [PATCH 4/7] updated architecture selection

---
 transformer_lens/loading_from_pretrained.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/transformer_lens/loading_from_pretrained.py b/transformer_lens/loading_from_pretrained.py
index ddc0f3124..a5b36b98a 100644
--- a/transformer_lens/loading_from_pretrained.py
+++ b/transformer_lens/loading_from_pretrained.py
@@ -214,6 +214,7 @@
     "google-t5/t5-base",
     "google-t5/t5-large",
     "ai-forever/mGPT",
+    "baichuan-inc/Baichuan-7B",
     "baichuan-inc/Baichuan-13B-Base",
     "baichuan-inc/Baichuan-13B-Chat",
 ]
 """Official model names for models on HuggingFace."""
@@ -634,6 +635,7 @@
     "google-t5/t5-base": ["t5-base"],
     "google-t5/t5-large": ["t5-large"],
     "ai-forever/mGPT": ["mGPT"],
+    "baichuan-inc/Baichuan-7B": ["Baichuan-7B"],
     "baichuan-inc/Baichuan-13B-Base": ["Baichuan-13B-Base"],
     "baichuan-inc/Baichuan-13B-Chat": ["Baichuan-13B-Chat"],
 }
 """Model aliases for models on HuggingFace."""
@@ -1229,7 +1231,7 @@ def convert_hf_model_config(model_name: str, **kwargs):
             "use_attn_scale": False,
             "tie_word_embeddings": hf_config.tie_word_embeddings,
         }
-    elif architecture == "BaichuanForCausalLM":
+    elif architecture.startswith("Bai"):  # match both Baichuan architecture spellings
         cfg_dict = {
             "d_model": hf_config.hidden_size,
             "d_head": hf_config.hidden_size // hf_config.num_attention_heads,
@@ -1608,7 +1610,7 @@ def get_pretrained_state_dict(
             state_dict = convert_neox_weights(hf_model, cfg)
         elif cfg.original_architecture == "LlamaForCausalLM":
             state_dict = convert_llama_weights(hf_model, cfg)
-        elif cfg.original_architecture == "BaichuanForCausalLM":
+        elif cfg.original_architecture.startswith("Bai"):
             state_dict = convert_baichuan_weights(hf_model, cfg)
         elif cfg.original_architecture == "BertForMaskedLM":
             state_dict = convert_bert_weights(hf_model, cfg)
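The switch from an exact match to a prefix match is needed because the 7B and 13B checkpoints appear to spell the architecture differently in their HF configs ("BaiChuanForCausalLM" vs. "BaichuanForCausalLM") — an assumption worth re-verifying against the upstream config.json files. Illustratively:

    # Both capitalizations pass the prefix test; an exact comparison
    # against "BaichuanForCausalLM" would miss the 7B spelling.
    for arch in ("BaiChuanForCausalLM", "BaichuanForCausalLM"):
        assert arch.startswith("Bai")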
"baichuan-inc/Baichuan-13B-Chat": ["Baichuan-13B-Chat"], } @@ -1229,7 +1231,7 @@ def convert_hf_model_config(model_name: str, **kwargs): "use_attn_scale": False, "tie_word_embeddings": hf_config.tie_word_embeddings, } - elif architecture == "BaichuanForCausalLM": + elif architecture.startswith("Bai"): cfg_dict = { "d_model": hf_config.hidden_size, "d_head": hf_config.hidden_size // hf_config.num_attention_heads, @@ -1608,7 +1610,7 @@ def get_pretrained_state_dict( state_dict = convert_neox_weights(hf_model, cfg) elif cfg.original_architecture == "LlamaForCausalLM": state_dict = convert_llama_weights(hf_model, cfg) - elif cfg.original_architecture == "BaichuanForCausalLM": + elif cfg.original_architecture.startswith("Bai"): state_dict = convert_baichuan_weights(hf_model, cfg) elif cfg.original_architecture == "BertForMaskedLM": state_dict = convert_bert_weights(hf_model, cfg) From beec40acf8cef2de430b4a2a3833eb6a5ea17d84 Mon Sep 17 00:00:00 2001 From: Bryce Meyer Date: Fri, 28 Jun 2024 02:32:51 +0200 Subject: [PATCH 5/7] removed fast flag --- transformer_lens/HookedTransformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transformer_lens/HookedTransformer.py b/transformer_lens/HookedTransformer.py index d484f1236..bdc9d9533 100644 --- a/transformer_lens/HookedTransformer.py +++ b/transformer_lens/HookedTransformer.py @@ -140,7 +140,7 @@ def __init__( # Phi & Baichuan model's fast tokenizer does not support adding a BOS token, use_fast # should be False tokenizer_name = self.cfg.tokenizer_name.lower() - if "phi" in tokenizer_name or "bai" in tokenizer_name: + if "phi" in tokenizer_name: use_fast = False huggingface_token = os.environ.get("HF_TOKEN", None) tokenizer = AutoTokenizer.from_pretrained( From fad4f34ca7b92abd7bf858eb65b9113722c89240 Mon Sep 17 00:00:00 2001 From: Bryce Meyer Date: Sat, 6 Jul 2024 01:20:05 +0200 Subject: [PATCH 6/7] ran format --- .../pretrained/weight_conversions/baichuan.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/transformer_lens/pretrained/weight_conversions/baichuan.py b/transformer_lens/pretrained/weight_conversions/baichuan.py index 64ed49bd0..b85ef191b 100644 --- a/transformer_lens/pretrained/weight_conversions/baichuan.py +++ b/transformer_lens/pretrained/weight_conversions/baichuan.py @@ -1,12 +1,14 @@ import einops import torch -from transformer_lens.HookedTransformerConfig import HookedTransformerConfig + +from transformer_lens.HookedTransformerConfig import HookedTransformerConfig + def convert_baichuan_weights(baichuan, cfg: HookedTransformerConfig): state_dict = {} state_dict["embed.W_E"] = baichuan.model.embed_tokens.weight - + assert cfg.d_mlp is not None # keep mypy happy for l in range(cfg.n_layers): @@ -23,7 +25,7 @@ def convert_baichuan_weights(baichuan, cfg: HookedTransformerConfig): state_dict[f"blocks.{l}.attn.W_Q"] = W_Q state_dict[f"blocks.{l}.attn.W_K"] = W_K state_dict[f"blocks.{l}.attn.W_V"] = W_V - + state_dict[f"blocks.{l}.attn.b_Q"] = torch.zeros( cfg.n_heads, cfg.d_head, dtype=cfg.dtype, device=W_Q.device ) @@ -39,7 +41,7 @@ def convert_baichuan_weights(baichuan, cfg: HookedTransformerConfig): dtype=cfg.dtype, device=W_Q.device, ) - + W_O = baichuan.model.layers[l].self_attn.o_proj.weight W_O = einops.rearrange(W_O, "m (n h)->n h m", n=cfg.n_heads) state_dict[f"blocks.{l}.attn.W_O"] = W_O @@ -48,16 +50,16 @@ def convert_baichuan_weights(baichuan, cfg: HookedTransformerConfig): ) state_dict[f"blocks.{l}.ln2.w"] = 
From 2f44d5be1b10ee70f321be2631731d0b10f1f9d5 Mon Sep 17 00:00:00 2001
From: Bryce Meyer
Date: Sat, 6 Jul 2024 01:36:45 +0200
Subject: [PATCH 7/7] tried setting pos weight

---
 transformer_lens/pretrained/weight_conversions/baichuan.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/transformer_lens/pretrained/weight_conversions/baichuan.py b/transformer_lens/pretrained/weight_conversions/baichuan.py
index b85ef191b..12637c748 100644
--- a/transformer_lens/pretrained/weight_conversions/baichuan.py
+++ b/transformer_lens/pretrained/weight_conversions/baichuan.py
@@ -59,6 +59,7 @@ def convert_baichuan_weights(baichuan, cfg: HookedTransformerConfig):
         state_dict[f"blocks.{l}.mlp.b_out"] = torch.zeros(cfg.d_model, dtype=W_O.dtype)
 
     state_dict["ln_final.w"] = baichuan.model.norm.weight
+    state_dict["pos_embed.W_pos"] = baichuan.model.transformer.wpe.weight
    state_dict["unembed.W_U"] = baichuan.lm_head.weight.T
    state_dict["unembed.b_U"] = torch.zeros(cfg.d_vocab, dtype=W_O.dtype)
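For context on this last experiment: with "positional_embedding_type": "alibi" there is no learned positional-embedding matrix to copy — ALiBi biases attention scores with fixed per-head slopes instead. A self-contained sketch of the standard slope schedule for power-of-two head counts (from the ALiBi paper, not code from this PR):

    def alibi_slopes(n_heads: int) -> list[float]:
        # Geometric sequence 2^(-8/n), 2^(-16/n), ..., ending at 2^-8.
        start = 2.0 ** (-8.0 / n_heads)
        return [start ** (i + 1) for i in range(n_heads)]

    print(alibi_slopes(8))  # [0.5, 0.25, 0.125, ..., 0.00390625]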