Skip to content

Commit e16d915

Browse files
committed
re-quantize: moe models
1 parent da1fca5 commit e16d915

5 files changed

Lines changed: 23 additions & 11 deletions

File tree

src/backend.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ namespace chatllm
2121
tensor *init_tensor(ggml::tensor *tensor,
2222
ggml::type type,
2323
int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3);
24+
tensor *init_tensor(ggml::tensor *tensor, ggml::tensor *like);
2425
void change_type(ggml::tensor *tensor, ggml::type type);
2526

2627
size_t element_size(const ggml::tensor *tensor);

src/chat.cpp

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1163,11 +1163,13 @@ namespace chatllm
11631163
}
11641164

11651165
size_t TensorInfo::read_tensor_data(tokenizer::DataReader *reader, size_t read_offset, size_t write_offset, size_t data_size,
1166-
ggml::type target_type)
1166+
ggml::type target_type, ggml::tensor *src_tensor)
11671167
{
11681168
CHATLLM_CHECK(data) << "backend buffer still not allocated!";
11691169
CHATLLM_CHECK(target_type == ggml::type_of(tensor)) << "tensor type mismatch!";
1170-
CHATLLM_CHECK(data->get_size() >= write_offset + data_size) << "read_tensor_data(" << ggml::get_name(&tensor) << "): write data exceeds tensor data size";
1170+
1171+
const ggml::type original_type = src_tensor ? ggml::type_of(src_tensor) : this->original_type;
1172+
if (nullptr == src_tensor) src_tensor = &tensor;
11711173

11721174
reader->seek(aligned_data_start(read_offset), SEEK_SET);
11731175

@@ -1185,7 +1187,7 @@ namespace chatllm
11851187
std::vector<uint8_t> buf_q;
11861188

11871189
ggml::tensor t;
1188-
ggml::init_tensor(&t, ggml::type::GGML_TYPE_F32, 4, tensor.ne);
1190+
ggml::init_tensor(&t, ggml::type::GGML_TYPE_F32, 4, src_tensor->ne);
11891191
buf.resize(ggml::nbytes(&t));
11901192

11911193
if (ggml::type::GGML_TYPE_F32 == original_type)
@@ -1194,18 +1196,19 @@ namespace chatllm
11941196
}
11951197
else
11961198
{
1197-
ggml::init_tensor(&t, original_type, 4, tensor.ne);
1199+
ggml::init_tensor(&t, original_type, 4, src_tensor->ne);
11981200
buf_q.resize(ggml::nbytes(&t));
11991201
reader->read_buffer(buf_q.data(), buf_q.size());
12001202

12011203
ggml::to_float(original_type, buf_q.data(), (float *)buf.data(), ggml::get_dim(&t, 0), ggml::nrows(&t));
12021204
}
12031205

1204-
ggml::init_tensor(&t, target_type, 4, tensor.ne);
1206+
ggml::init_tensor(&t, target_type, 4, src_tensor->ne);
12051207
buf_q.resize(ggml::nbytes(&t));
12061208
ggml::from_float(target_type, (const float *)buf.data(), (void *)buf_q.data(), ggml::get_dim(&t, 0), ggml::nrows(&t));
12071209

1208-
CHATLLM_CHECK(buf_q.size() == data_size) << "size mismatch? " << buf_q.size() << " : " << data_size;
1210+
if (data_size < buf_q.size())
1211+
CHATLLM_CHECK(buf_q.size() == data_size) << "size mismatch? " << buf_q.size() << " : " << data_size;
12091212

12101213
alloc->get_backend()->write_tensor_data(&tensor, buf_q.data(), write_offset, buf_q.size());
12111214

@@ -1530,12 +1533,14 @@ namespace chatllm
15301533
}
15311534

15321535
size_t size = search->second.get_nbytes();
1533-
t.read_tensor_data(_file.get(), search->second._offset, write_offset, size, tensor->type);
1536+
size = t.read_tensor_data(_file.get(), search->second._offset, write_offset, size, tensor->type, &search->second.tensor);
1537+
1538+
CHATLLM_CHECK(total_size >= size) << "tensor " << name << " too much data: " << total_size << " > " << size;
15341539

15351540
write_offset += size;
15361541
total_size -= size;
15371542
}
1538-
CHATLLM_CHECK(total_size == 0) << "tensor " << name << " not fully loaded.";
1543+
CHATLLM_CHECK(total_size == 0) << "tensor " << name << " not fully loaded, remain = " << total_size;
15391544

15401545
t.assign_to(tensor);
15411546
}

src/chat.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -686,7 +686,7 @@ namespace chatllm
686686

687687
bool load(tokenizer::DataReader *reader, LayerBufAllocator *alloc, ggml::type target_type, size_t override_buffer_size = 0);
688688

689-
size_t read_tensor_data(tokenizer::DataReader *reader, size_t read_offset, size_t write_offset, size_t data_size, ggml::type target_type);
689+
size_t read_tensor_data(tokenizer::DataReader *reader, size_t read_offset, size_t write_offset, size_t data_size, ggml::type target_type, ggml::tensor *src_tensor = nullptr);
690690
size_t read_raw_tensor_data(tokenizer::DataReader *reader, size_t data_size, void *p);
691691

692692
size_t aligned_data_start(size_t offset);

src/layers.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,12 @@ namespace chatllm
228228
return ggml::init_tensor(tensor, type, 4, ne);
229229
}
230230

231+
ggml::tensor *ggml::init_tensor(ggml::tensor *tensor, ggml::tensor *like)
232+
{
233+
int64_t ne[4] = {ggml::get_dim(like, 0), ggml::get_dim(like, 1), ggml::get_dim(like, 2), ggml::get_dim(like, 3)};
234+
return ggml::init_tensor(tensor, ggml::type_of(like), 4, ne);
235+
}
236+
231237
ggml::tensor *ggml::init_tensor(ggml::tensor *tensor,
232238
ggml::type type,
233239
int n_dims,

src/layers.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -572,7 +572,7 @@ namespace chatllm
572572
bias(use_bias ? ggml::new_tensor_1d(ctx, GGML_TYPE_F32, out_features) : nullptr) {}
573573

574574
Linear(InitContext *ctx, int in_features, int out_features, ggml::tensor *weight, bool use_bias = true)
575-
: weight(weight != NULL ? weight : ggml::new_tensor_2d(ctx, ctx->dtype, in_features, out_features)),
575+
: weight(weight != NULL ? weight : ggml::new_tensor_2d(ctx, ggml::type_fallback(ctx->dtype, in_features), in_features, out_features)),
576576
bias(use_bias ? ggml::new_tensor_1d(ctx, GGML_TYPE_F32, out_features) : nullptr) {}
577577

578578
int in_features() const { return (int)weight->ne[0]; }
@@ -604,7 +604,7 @@ namespace chatllm
604604
{}
605605

606606
MultiLinear(InitContext *ctx, int in_features, int out_features, int multi, bool use_bias)
607-
: weight(ggml::new_tensor_3d(ctx, ctx->dtype, in_features, out_features, multi)),
607+
: weight(ggml::new_tensor_3d(ctx, ggml::type_fallback(ctx->dtype, in_features), in_features, out_features, multi)),
608608
bias(use_bias ? ggml::new_tensor_2d(ctx, ggml::type::GGML_TYPE_F32, out_features, multi) : nullptr)
609609
{
610610
}

0 commit comments

Comments (0)