From cb8c146a3a91ae719ce8eb39168118d4871e767d Mon Sep 17 00:00:00 2001
From: medhat
Date: Tue, 3 Mar 2026 17:20:36 +0400
Subject: [PATCH] fix: complete Falcon model support and add Pile dataset

Falcon model:
- Add missing skip_layer_name() (was abstract, caused instantiation failure)
- Fix rotary_emb path: model.model.rotary_emb -> model.transformer.rotary_emb
- Add lm_head to get_layers_except_blocks()
- Read has_bias() from model config instead of hardcoding False
- Use model_config.new_decoder_architecture instead of fragile
  architectures[0] check
- Return both layernorms for old arch non-parallel_attn case
- Use getattr with defaults for safer config attribute access

Pile dataset:
- Add 'pile' as calibration dataset (loads mit-han-lab/pile-val-backup)
- Add pile_gptq preprocessor for GPTQ-style calibration sampling
- Add 'pile' to eval_base supported datasets with data loading and encoding
- Fix eval_base error message: self.dataset -> self.eval_dataset_name

Tested with tiiuae/falcon-7b (old arch) and tiiuae/falcon-40b (new arch).

Made-with: Cursor
---
 llmc/data/dataset/base_dataset.py      |  5 ++++
 llmc/data/dataset/specified_preproc.py | 12 +++++++++
 llmc/eval/eval_base.py                 | 11 +++++++-
 llmc/models/falcon.py                  | 36 ++++++++++++++------------
 4 files changed, 47 insertions(+), 17 deletions(-)

diff --git a/llmc/data/dataset/base_dataset.py b/llmc/data/dataset/base_dataset.py
index 7af3de73a..bd773d920 100755
--- a/llmc/data/dataset/base_dataset.py
+++ b/llmc/data/dataset/base_dataset.py
@@ -38,6 +38,7 @@ def __init__(self, tokenizer, calib_cfg, batch_process=None):
         self.seed = calib_cfg['seed']
         self.calib_dataset_field_map = {
             'pileval': 'text',
+            'pile': 'text',
             'c4': 'text',
             'wikitext2': 'text',
             'ptb': 'sentence',
@@ -66,6 +67,10 @@ def build_calib_dataset(self):
             self.calib_dataset = load_dataset(
                 'ptb_text_only', 'penn_treebank', split='train'
             )
+        elif self.calib_dataset_name == 'pile':
+            self.calib_dataset = load_dataset(
+                'mit-han-lab/pile-val-backup', split='validation'
+            )
         elif self.calib_dataset_name == 'ultrachat':
             self.calib_dataset = load_dataset(
                 'HuggingFaceH4/ultrachat_200k', split='train_sft'
diff --git a/llmc/data/dataset/specified_preproc.py b/llmc/data/dataset/specified_preproc.py
index a996fead3..29dae1d5a 100644
--- a/llmc/data/dataset/specified_preproc.py
+++ b/llmc/data/dataset/specified_preproc.py
@@ -47,6 +47,18 @@ def c4_gptq(calib_dataset, tokenizer, n_samples, seq_len):
     return samples
 
 
+@PREPROC_REGISTRY
+def pile_gptq(calib_dataset, tokenizer, n_samples, seq_len):
+    trainenc = tokenizer('\n\n'.join(calib_dataset['text'][:1000]), return_tensors='pt')
+    samples = []
+    for _ in range(n_samples):
+        i = random.randint(0, trainenc.input_ids.shape[1] - seq_len - 1)
+        j = i + seq_len
+        inp = trainenc.input_ids[:, i:j]
+        samples.append(inp)
+    return samples
+
+
 @PREPROC_REGISTRY
 def pileval_awq(calib_dataset, tokenizer, n_samples, seq_len):
     dataset = calib_dataset.shuffle(seed=42)
diff --git a/llmc/eval/eval_base.py b/llmc/eval/eval_base.py
index 7916b2059..60a60589e 100755
--- a/llmc/eval/eval_base.py
+++ b/llmc/eval/eval_base.py
@@ -25,6 +25,7 @@ def __init__(self, model, config):
             'wikitext2',
             'c4',
             'ptb',
+            'pile',
             'custom',
             'human_eval',
             'mme',
@@ -32,7 +33,7 @@
             'custom_gen',
             't2v',
             'i2v',
-        ], f'Not support {self.dataset} dataset now.'
+        ], f'Not support {self.eval_dataset_name} dataset now.'
         self.seq_len = self.eval_cfg.get('seq_len', None)
         self.num_samples = self.eval_cfg.get('num_samples', None)
         self.num_eval_tokens = self.eval_cfg.get('num_eval_tokens', None)
@@ -67,6 +68,10 @@ def build_data(self):
             testdata = load_dataset(
                 'ptb_text_only', 'penn_treebank', split='test'
             )
+        elif self.eval_dataset_name == 'pile':
+            testdata = load_dataset(
+                'mit-han-lab/pile-val-backup', split='validation'
+            )
         else:
             if self.eval_dataset_name in ['custom_gen', 'custom_ppl', 't2v', 'i2v']:
                 testdata = self.get_cutomdata(self.eval_dataset_path)
@@ -91,6 +96,10 @@ def build_data(self):
             testenc = self.tokenizer(
                 ' '.join(testdata['sentence']), return_tensors='pt'
             )
+        elif self.eval_dataset_name == 'pile':
+            testenc = self.tokenizer(
+                '\n\n'.join(testdata['text'][:1000]), return_tensors='pt'
+            )
         elif self.eval_dataset_name == 'custom_ppl':
             testenc = self.tokenizer(
                 '\n'.join([data['question'] + data['answer'] if 'answer' in data else data['question'] for data in testdata]), # noqa
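
[Reviewer note, outside the patch: a self-contained sketch of what the new
'pile' calibration source plus the pile_gptq preprocessor produce, for
sanity-checking without running llmc. The falcon-7b tokenizer and the
n_samples/seq_len values are illustrative, not part of the change.]

    import random

    from datasets import load_dataset
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained('tiiuae/falcon-7b')
    # Same source the patch wires into build_calib_dataset()/build_data().
    val = load_dataset('mit-han-lab/pile-val-backup', split='validation')

    # pile_gptq: join the first 1000 documents, then cut random fixed-length
    # token windows, mirroring the existing c4_gptq-style sampling.
    n_samples, seq_len = 4, 512
    enc = tokenizer('\n\n'.join(val['text'][:1000]), return_tensors='pt')
    samples = []
    for _ in range(n_samples):
        i = random.randint(0, enc.input_ids.shape[1] - seq_len - 1)
        samples.append(enc.input_ids[:, i:i + seq_len])
    print([tuple(s.shape) for s in samples])  # n_samples tensors of [1, seq_len]
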
diff --git a/llmc/models/falcon.py b/llmc/models/falcon.py
index e0823dc22..eadb0ae28 100644
--- a/llmc/models/falcon.py
+++ b/llmc/models/falcon.py
@@ -8,12 +8,15 @@ class Falcon(BaseModel):
     def __init__(self, config, device_map=None, use_cache=False):
         super().__init__(config, device_map, use_cache)
 
+    def _is_new_decoder_architecture(self):
+        return getattr(self.model_config, 'new_decoder_architecture', False)
+
     def find_blocks(self):
         self.blocks = self.model.transformer.h
 
     def find_embed_layers(self):
         self.word_embeddings = self.model.transformer.word_embeddings
-        self.rotary_emb = self.model.model.rotary_emb
+        self.rotary_emb = self.model.transformer.rotary_emb
 
     def find_block_name(self):
         self.block_name_prefix = 'model.transformer.h'
@@ -25,30 +28,31 @@ def get_attention_rotary_layers(self):
         return [self.rotary_emb]
 
     def get_layers_except_blocks(self):
-        return [self.word_embeddings, self.rotary_emb, self.model.transformer.ln_f]
+        return [self.word_embeddings, self.rotary_emb, self.model.transformer.ln_f,
+                self.model.lm_head]
+
+    def skip_layer_name(self):
+        return ['lm_head']
 
     def has_bias(self):
-        return False
+        return getattr(self.model_config, 'bias', False)
 
     def get_layernorms_in_block(self, block):
-        if block.config.architectures[0] == 'RWForCausalLM':
-            new_decoder_architecture = False
-        elif block.config.architectures[0] == 'FalconForCausalLM':
-            new_decoder_architecture = True
-        if new_decoder_architecture:
+        if self._is_new_decoder_architecture():
             return {'ln_attn': block.ln_attn, 'ln_mlp': block.ln_mlp}
         else:
-            if block.config.parallel_attn:
+            if getattr(block.config, 'parallel_attn', False):
                 return {'input_layernorm': block.input_layernorm}
             else:
-                return {'post_attention_layernorm': block.post_attention_layernorm}
+                return {
+                    'input_layernorm': block.input_layernorm,
+                    'post_attention_layernorm': block.post_attention_layernorm,
+                }
 
     def get_subsets_in_block(self, block):
-        if block.config.architectures[0] == 'RWForCausalLM':
-            new_decoder_architecture = False
-        elif block.config.architectures[0] == 'FalconForCausalLM':
-            new_decoder_architecture = True
-        if new_decoder_architecture:
+        new_arch = self._is_new_decoder_architecture()
+
+        if new_arch:
             subset1 = {
                 'layers': {
                     'self_attention.query_key_value': (
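
[Reviewer note, outside the patch: why the switch away from architectures[0]
matters. Original Falcon releases that shipped custom modeling code report
architectures like 'RWForCausalLM', while current checkpoints load as
FalconForCausalLM, so the old string check could leave
new_decoder_architecture unassigned. The config flag distinguishes the two
block layouts directly; a quick check, with the comments reflecting my
understanding of the stock configs:]

    from transformers import AutoConfig

    for name in ('tiiuae/falcon-7b', 'tiiuae/falcon-40b'):
        cfg = AutoConfig.from_pretrained(name)
        # falcon-7b: new_decoder_architecture=False -> single input_layernorm
        # falcon-40b: new_decoder_architecture=True -> separate ln_attn/ln_mlp
        print(name,
              getattr(cfg, 'new_decoder_architecture', False),
              getattr(cfg, 'parallel_attn', False),
              getattr(cfg, 'bias', False))
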
@@ -79,7 +83,7 @@
                 'inspect': block.self_attention.query_key_value,
                 'has_kwargs': False,
             }
-            if block.config.parallel_attn:
+            if getattr(block.config, 'parallel_attn', False):
                 subset3 = {
                     'layers': {'mlp.dense_h_to_4h': block.mlp.dense_h_to_4h},
                     'prev_op': [block.input_layernorm],
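
[Reviewer note, outside the patch: the first commit bullet in miniature.
BaseModel declares skip_layer_name() abstract, so before this patch the
Falcon class could not be instantiated at all; class and method names below
are illustrative, not llmc's real ones:]

    from abc import ABC, abstractmethod

    class Base(ABC):
        @abstractmethod
        def skip_layer_name(self):
            ...

    class FalconBefore(Base):   # no override, like the old Falcon class
        pass

    class FalconAfter(Base):    # with the override this patch adds
        def skip_layer_name(self):
            return ['lm_head']

    FalconAfter()               # constructs fine
    try:
        FalconBefore()
    except TypeError as err:
        # "Can't instantiate abstract class FalconBefore ..."
        print(err)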