-
Notifications
You must be signed in to change notification settings - Fork 43
Expand file tree
/
Copy pathtiny_shakespeare_dataset.cc
More file actions
119 lines (101 loc) · 4.73 KB
/
tiny_shakespeare_dataset.cc
File metadata and controls
119 lines (101 loc) · 4.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#include "example/common/tiny_shakespeare_dataset.h"

#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <filesystem>
#include <fstream>
#include <functional>
#include <iostream>
#include <memory>
#include <numeric>
#include <string>
#include <type_traits>
#include <unordered_map>
#include <utility>
#include <variant>
#include <vector>

#include "glog/logging.h"

#include "infini_train/include/tensor.h"
namespace {
using DataType = infini_train::DataType;
using TinyShakespeareType = TinyShakespeareDataset::TinyShakespeareType;
using TinyShakespeareFile = TinyShakespeareDataset::TinyShakespeareFile;
// Header magic number -> on-disk token width of the .bin file.
const std::unordered_map<int, TinyShakespeareType> kTypeMap = {
    {20240520, TinyShakespeareType::kUINT16}, // GPT-2
    {20240801, TinyShakespeareType::kUINT32}, // LLaMA 3
};
// On-disk token width -> size of one token in bytes.
const std::unordered_map<TinyShakespeareType, size_t> kTypeToSize = {
    {TinyShakespeareType::kUINT16, 2},
    {TinyShakespeareType::kUINT32, 4},
};
// On-disk token width -> in-memory DataType. Note kUINT32 maps to kINT32:
// the uint32 payload is read through an int32 buffer (see
// ReadTinyShakespeareFile) — presumably token ids always fit in int32.
const std::unordered_map<TinyShakespeareType, DataType> kTypeToDataType = {
    {TinyShakespeareType::kUINT16, DataType::kUINT16},
    {TinyShakespeareType::kUINT32, DataType::kINT32},
};
// Reads `num_bytes` bytes from the current position of `ifs` into a freshly
// allocated byte buffer and returns it. The stream position is advanced; on a
// short read the unread tail of the buffer stays value-initialized (zero).
std::vector<uint8_t> ReadSeveralBytesFromIfstream(size_t num_bytes, std::ifstream *ifs) {
    std::vector<uint8_t> buffer(num_bytes);
    ifs->read(reinterpret_cast<char *>(buffer.data()), static_cast<std::streamsize>(num_bytes));
    return buffer;
}
// Reinterprets sizeof(T) bytes of `bytes` starting at `offset` as a value of
// type T. Copying via memcpy avoids any alignment requirement on the buffer.
template <typename T> T BytesToType(const std::vector<uint8_t> &bytes, size_t offset) {
    static_assert(std::is_trivially_copyable<T>::value, "T must be trivially copyable.");
    T out{};
    std::memcpy(&out, bytes.data() + offset, sizeof(T));
    return out;
}
// Parses a pre-tokenized TinyShakespeare .bin file into an int64 tensor of
// shape (num_sequences, sequence_length), widening the on-disk uint16/uint32
// tokens element-wise. LOG(FATAL)s if the file is missing; throws
// std::out_of_range (via unordered_map::at) on an unknown magic number.
TinyShakespeareFile ReadTinyShakespeareFile(const std::string &path, size_t sequence_length) {
    /*
    HEADER (1024 bytes)                         | DATA (tokens) |
    magic   | version | num_toks | reserved     |               |
    4 bytes | 4 bytes | 4 bytes  | 1012 bytes   | # bytes       |
    */
    if (!std::filesystem::exists(path)) {
        LOG(FATAL) << "File not found: " << path;
    }
    TinyShakespeareFile text_file;
    std::ifstream ifs(path, std::ios::binary);
    // NOTE(review): the stream open itself is never checked; the exists()
    // check above races with deletion — a failed open yields zeroed reads.
    const auto header = ReadSeveralBytesFromIfstream(1024, &ifs);
    const int magic = BytesToType<int32_t>(header, 0);
    // NOTE(review): `version` is decoded but never validated or used below.
    const int version = BytesToType<int32_t>(header, 4);
    const int num_tokens = BytesToType<int32_t>(header, 8);
    text_file.type = kTypeMap.at(magic);
    // Integer division: up to sequence_length - 1 trailing tokens are dropped.
    const int num_sequences = num_tokens / sequence_length;
    text_file.dims.assign({num_sequences, static_cast<int64_t>(sequence_length)});
    const int data_size_in_bytes
        = kTypeToSize.at(text_file.type)
          * std::accumulate(text_file.dims.begin(), text_file.dims.end(), 1, std::multiplies<int>());
    // shape: (num_seq, seq_len), dtype: int64 — tokens are widened regardless
    // of their on-disk width so downstream code sees a single dtype.
    text_file.tensor = infini_train::Tensor(text_file.dims, DataType::kINT64);
    int64_t *dst = static_cast<int64_t *>(text_file.tensor.DataPtr());
    // Staging buffer matching the on-disk token width; exactly one alternative
    // is populated below.
    std::variant<std::vector<uint16_t>, std::vector<int32_t>> buffer;
    if (text_file.type == TinyShakespeareType::kUINT16) {
        // CHECK_LE(sequence_length, 1024); // GPT-2: max_seq_length = 1024
        // NOTE(review): the GPT-2 bound above is disabled; only LLaMA-3's is
        // enforced — confirm whether that is intentional.
        buffer = std::vector<uint16_t>(num_sequences * sequence_length);
    } else if (text_file.type == TinyShakespeareType::kUINT32) {
        CHECK_LE(sequence_length, 8192); // LLaMA-3: max_seq_length = 8192
        buffer = std::vector<int32_t>(num_sequences * sequence_length);
    }
    // Bulk-read the token payload, then widen each token into the tensor.
    std::visit(
        [&](auto &vec) {
            ifs.read(reinterpret_cast<char *>(vec.data()), data_size_in_bytes);
            for (size_t i = 0; i < vec.size(); ++i) { dst[i] = static_cast<int64_t>(vec[i]); }
        },
        buffer);
    return text_file;
}
} // namespace
// Eagerly loads `filepath` into memory. Each dataset element is a pair of
// length-`sequence_length` token windows (input, target), where the target
// window starts one token after the input (see operator[]).
TinyShakespeareDataset::TinyShakespeareDataset(const std::string &filepath, size_t sequence_length)
    : text_file_(ReadTinyShakespeareFile(filepath, sequence_length)), sequence_length_(sequence_length),
      sequence_size_in_bytes_(sequence_length * sizeof(int64_t)), num_samples_(text_file_.dims[0] - 1) {
    // num_samples_ is dims[0] - 1 because the target window is shifted one
    // token ahead and so the final stored sequence has no complete target.
    // NOTE(review): dims[0] == 0 (file smaller than one sequence) would
    // underflow here — confirm inputs are always non-empty.
    CHECK_EQ(text_file_.dims[1], sequence_length_);
    CHECK_EQ(static_cast<int>(text_file_.tensor.Dtype()), static_cast<int>(DataType::kINT64));
}
// Returns the idx-th (input, target) pair as two views into the loaded token
// tensor. Both share the underlying storage; the target view starts exactly
// one int64 token after the input view.
std::pair<std::shared_ptr<infini_train::Tensor>, std::shared_ptr<infini_train::Tensor>>
TinyShakespeareDataset::operator[](size_t idx) const {
    CHECK_LT(idx, text_file_.dims[0] - 1);
    // Per-sample shape drops the leading (num_sequences) dimension.
    const std::vector<int64_t> sample_dims(text_file_.dims.begin() + 1, text_file_.dims.end());
    // x: (seq_len), y: (seq_len) -> stack -> (bs, seq_len) (bs, seq_len)
    const size_t x_offset = idx * sequence_size_in_bytes_;
    const size_t y_offset = x_offset + sizeof(int64_t);
    auto x = std::make_shared<infini_train::Tensor>(text_file_.tensor, x_offset, sample_dims);
    auto y = std::make_shared<infini_train::Tensor>(text_file_.tensor, y_offset, sample_dims);
    return {x, y};
}
// Number of usable (input, target) pairs — one fewer than the number of
// stored sequences (set in the constructor).
size_t TinyShakespeareDataset::Size() const { return num_samples_; }