Skip to content

Commit 35f17bd

Browse files
committed
fixed bottleneck!
1 parent 683af65 commit 35f17bd

File tree

3 files changed

+140
-63
lines changed

3 files changed

+140
-63
lines changed

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@
3333

3434
setup(
3535
name="libbbf",
36-
version="0.2.4", # Bumped version to ensure a clean release
36+
version="0.2.10",
3737
author="EF1500",
3838
author_email="rosemilovelockofficial@proton.me",
3939
description="Bound Book Format (BBF) tools and bindings",

src/bbf_reader.h

Lines changed: 109 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,19 @@
22
#include "libbbf.h"
33
#define XXH_INLINE_ALL
44
#include "xxhash.h"
5+
56
#include <string>
7+
#include <string_view> // C++17: Essential for zero-copy parsing
68
#include <vector>
79
#include <map>
810
#include <cstring>
911
#include <future>
1012
#include <thread>
11-
#include <mutex>
13+
#include <algorithm>
1214

1315
// Platform specific includes for MMAP
1416
#ifdef _WIN32
17+
#define WIN32_LEAN_AND_MEAN
1518
#include <windows.h>
1619
#else
1720
#include <sys/mman.h>
@@ -20,7 +23,6 @@
2023
#include <unistd.h>
2124
#endif
2225

23-
// Simple Memory Mapping Wrapper
2426
struct MemoryMappedFile {
2527
void* data = nullptr;
2628
size_t size = 0;
@@ -38,33 +40,53 @@ struct MemoryMappedFile {
3840
LARGE_INTEGER li;
3941
GetFileSizeEx(hFile, &li);
4042
size = (size_t)li.QuadPart;
43+
if (size == 0) { CloseHandle(hFile); return false; }
4144
hMap = CreateFileMapping(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
42-
if (!hMap) return false;
45+
if (!hMap) { CloseHandle(hFile); return false; }
4346
data = MapViewOfFile(hMap, FILE_MAP_READ, 0, 0, 0);
4447
#else
4548
fd = open(path.c_str(), O_RDONLY);
4649
if (fd < 0) return false;
4750
struct stat st;
48-
fstat(fd, &st);
51+
if (fstat(fd, &st) < 0) { close(fd); return false; }
4952
size = st.st_size;
53+
if (size == 0) { close(fd); return false; }
5054
data = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);
55+
if (data == MAP_FAILED) { data = nullptr; close(fd); return false; }
5156
#endif
5257
return data != nullptr;
5358
}
5459

55-
~MemoryMappedFile() {
60+
void unmap() {
61+
if (!data) return;
5662
#ifdef _WIN32
57-
if (data) UnmapViewOfFile(data);
63+
UnmapViewOfFile(data);
5864
if (hMap) CloseHandle(hMap);
5965
if (hFile != INVALID_HANDLE_VALUE) CloseHandle(hFile);
66+
hMap = NULL; hFile = INVALID_HANDLE_VALUE;
6067
#else
61-
if (data && data != MAP_FAILED) munmap(data, size);
68+
munmap(data, size);
6269
if (fd >= 0) close(fd);
70+
fd = -1;
6371
#endif
72+
data = nullptr;
73+
size = 0;
6474
}
75+
76+
~MemoryMappedFile() { unmap(); }
6577
};
6678

6779
class BBFReader {
80+
private:
81+
// Cached pointers to avoid recalculating offsets repeatedly
82+
const char* data_ptr = nullptr;
83+
const BBFSection* sections_ = nullptr;
84+
const BBFMetadata* meta_ = nullptr;
85+
const BBFPageEntry* pages_ = nullptr;
86+
const BBFAssetEntry* assets_ = nullptr;
87+
const char* stringPool_ = nullptr;
88+
size_t stringPoolSize_ = 0;
89+
6890
public:
6991
BBFFooter footer;
7092
BBFHeader header;
@@ -73,22 +95,39 @@ class BBFReader {
7395

7496
BBFReader(const std::string& path) {
7597
if (!mmap.map(path)) return;
98+
data_ptr = static_cast<const char*>(mmap.data);
99+
100+
// Basic Size Check
76101
if (mmap.size < sizeof(BBFHeader) + sizeof(BBFFooter)) return;
77102

78-
std::memcpy(&header, mmap.data, sizeof(BBFHeader));
103+
// Read Header
104+
std::memcpy(&header, data_ptr, sizeof(BBFHeader));
79105
if (std::memcmp(header.magic, "BBF1", 4) != 0) return;
80106

81-
std::memcpy(&footer, (uint8_t*)mmap.data + mmap.size - sizeof(BBFFooter), sizeof(BBFFooter));
107+
// Read Footer
108+
std::memcpy(&footer, data_ptr + mmap.size - sizeof(BBFFooter), sizeof(BBFFooter));
82109
if (std::memcmp(footer.magic, "BBF1", 4) != 0) return;
83110

111+
// Cache Table Pointers
112+
// Note: In production, you should add bounds checks here to ensure offsets are within mmap.size
113+
sections_ = reinterpret_cast<const BBFSection*>(data_ptr + footer.sectionTableOffset);
114+
meta_ = reinterpret_cast<const BBFMetadata*>(data_ptr + footer.metaTableOffset);
115+
pages_ = reinterpret_cast<const BBFPageEntry*>(data_ptr + footer.pageTableOffset);
116+
assets_ = reinterpret_cast<const BBFAssetEntry*>(data_ptr + footer.assetTableOffset);
117+
118+
stringPool_ = data_ptr + footer.stringPoolOffset;
119+
stringPoolSize_ = footer.assetTableOffset - footer.stringPoolOffset;
120+
84121
isValid = true;
85122
}
86123

87-
std::string getString(uint32_t offset) const {
88-
const char* poolStart = (const char*)mmap.data + footer.stringPoolOffset;
89-
size_t poolSize = footer.assetTableOffset - footer.stringPoolOffset;
90-
if (offset >= poolSize) return "";
91-
return std::string(poolStart + offset);
124+
// Optimized: Returns string_view (no allocation)
125+
// Helper to allow returning std::string for legacy binding support if needed,
126+
// but internal logic should prefer views.
127+
std::string_view getStringView(uint32_t offset) const {
128+
if (offset >= stringPoolSize_) return {};
129+
// Requires strings in file to be null-terminated.
130+
return std::string_view(stringPool_ + offset);
92131
}
93132

94133
struct PySection {
@@ -97,86 +136,105 @@ class BBFReader {
97136
uint32_t parent;
98137
};
99138

100-
std::vector<PySection> getSections() {
139+
std::vector<PySection> getSections() const {
101140
std::vector<PySection> result;
102141
if (!isValid) return result;
103-
const BBFSection* secs = reinterpret_cast<const BBFSection*>((const uint8_t*)mmap.data + footer.sectionTableOffset);
142+
143+
result.reserve(footer.sectionCount); // Optimization: Reserve memory
104144
for (uint32_t i = 0; i < footer.sectionCount; i++) {
105-
result.push_back({getString(secs[i].sectionTitleOffset), secs[i].sectionStartIndex, secs[i].parentSectionIndex});
145+
// Explicit conversion to std::string here is okay as we are handing off to Python
146+
result.push_back({
147+
std::string(getStringView(sections_[i].sectionTitleOffset)),
148+
sections_[i].sectionStartIndex,
149+
sections_[i].parentSectionIndex
150+
});
106151
}
107152
return result;
108153
}
109154

110-
std::vector<std::pair<std::string, std::string>> getMetadata() {
155+
std::vector<std::pair<std::string, std::string>> getMetadata() const {
111156
std::vector<std::pair<std::string, std::string>> result;
112157
if (!isValid) return result;
113-
const BBFMetadata* meta = reinterpret_cast<const BBFMetadata*>((const uint8_t*)mmap.data + footer.metaTableOffset);
158+
159+
result.reserve(footer.keyCount);
114160
for (uint32_t i = 0; i < footer.keyCount; i++) {
115-
result.push_back({getString(meta[i].keyOffset), getString(meta[i].valOffset)});
161+
result.emplace_back(
162+
getStringView(meta_[i].keyOffset),
163+
getStringView(meta_[i].valOffset)
164+
);
116165
}
117166
return result;
118167
}
119168

120-
// Helper to get raw bytes for Python
121-
std::string getPageBytes(uint32_t pageIndex) {
122-
if (!isValid || pageIndex >= footer.pageCount) return "";
169+
// Zero-copy accessor for PyBind
170+
// Returns {pointer, size}
171+
std::pair<const char*, size_t> getPageRaw(uint32_t pageIndex) const {
172+
if (!isValid || pageIndex >= footer.pageCount) return {nullptr, 0};
123173

124-
const BBFPageEntry* pages = reinterpret_cast<const BBFPageEntry*>((const uint8_t*)mmap.data + footer.pageTableOffset);
125-
const BBFAssetEntry* assets = reinterpret_cast<const BBFAssetEntry*>((const uint8_t*)mmap.data + footer.assetTableOffset);
126-
127-
const auto& asset = assets[pages[pageIndex].assetIndex];
128-
return std::string((const char*)mmap.data + asset.offset, asset.length);
174+
// Indirect addressing: Page -> Asset -> Offset/Length
175+
const auto& asset = assets_[pages_[pageIndex].assetIndex];
176+
return { data_ptr + asset.offset, asset.length };
129177
}
130178

131-
std::map<std::string, uint64_t> getPageInfo(uint32_t pageIndex) {
132-
std::map<std::string, uint64_t> info;
133-
if (!isValid || pageIndex >= footer.pageCount) return info;
179+
// Legacy support (copies data)
180+
std::string getPageBytes(uint32_t pageIndex) const {
181+
auto raw = getPageRaw(pageIndex);
182+
if (!raw.first) return "";
183+
return std::string(raw.first, raw.second);
184+
}
134185

135-
const BBFPageEntry* pages = reinterpret_cast<const BBFPageEntry*>((const uint8_t*)mmap.data + footer.pageTableOffset);
136-
const BBFAssetEntry* assets = reinterpret_cast<const BBFAssetEntry*>((const uint8_t*)mmap.data + footer.assetTableOffset);
137-
const auto& asset = assets[pages[pageIndex].assetIndex];
186+
std::map<std::string, uint64_t> getPageInfo(uint32_t pageIndex) const {
187+
if (!isValid || pageIndex >= footer.pageCount) return {};
138188

139-
info["length"] = asset.length;
140-
info["offset"] = asset.offset;
141-
info["hash"] = asset.xxh3Hash;
142-
info["type"] = asset.type;
143-
return info;
189+
const auto& asset = assets_[pages_[pageIndex].assetIndex];
190+
return {
191+
{"length", asset.length},
192+
{"offset", asset.offset},
193+
{"hash", asset.xxh3Hash},
194+
{"type", asset.type}
195+
};
144196
}
145197

146-
// Implements verifyAssetsParallel from bbfenc.cpp
147-
bool verify() {
198+
bool verify() const {
148199
if (!isValid) return false;
149200

150201
// 1. Directory Hash Check
151202
size_t metaStart = footer.stringPoolOffset;
152203
size_t metaSize = mmap.size - sizeof(BBFFooter) - metaStart;
153-
uint64_t calcIndexHash = XXH3_64bits((const uint8_t*)mmap.data + metaStart, metaSize);
154-
155-
if (calcIndexHash != footer.indexHash) return false;
204+
if (XXH3_64bits(data_ptr + metaStart, metaSize) != footer.indexHash) return false;
156205

157206
// 2. Asset Integrity Check
158-
const BBFAssetEntry* assets = reinterpret_cast<const BBFAssetEntry*>((const uint8_t*)mmap.data + footer.assetTableOffset);
159207
size_t count = footer.assetCount;
208+
const auto* local_assets = assets_; // Copy pointer for lambda capture
209+
const auto* local_data = data_ptr;
160210

161-
auto verifyRange = [&](size_t start, size_t end) -> bool {
211+
auto verifyRange = [local_assets, local_data](size_t start, size_t end) -> bool {
162212
for (size_t i = start; i < end; ++i) {
163-
const auto& a = assets[i];
164-
uint64_t h = XXH3_64bits((const uint8_t*)mmap.data + a.offset, a.length);
165-
if (h != a.xxh3Hash) return false;
213+
const auto& a = local_assets[i];
214+
if (XXH3_64bits((const uint8_t*)local_data + a.offset, a.length) != a.xxh3Hash) {
215+
return false;
216+
}
166217
}
167218
return true;
168219
};
169220

170-
// Determine thread count
221+
// Optimization: Don't spawn threads for small files
171222
size_t numThreads = std::thread::hardware_concurrency();
172223
if (numThreads == 0) numThreads = 1;
173-
224+
225+
// Heuristic: If assets < 128, threading overhead > hashing gain
226+
if (count < 128 || numThreads == 1) {
227+
return verifyRange(0, count);
228+
}
229+
174230
size_t chunkSize = count / numThreads;
175231
std::vector<std::future<bool>> futures;
232+
futures.reserve(numThreads);
176233

177234
for (size_t i = 0; i < numThreads; ++i) {
178235
size_t start = i * chunkSize;
179236
size_t end = (i == numThreads - 1) ? count : start + chunkSize;
237+
// Launch async
180238
futures.push_back(std::async(std::launch::async, verifyRange, start, end));
181239
}
182240

src/bindings.cpp

Lines changed: 30 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -26,31 +26,50 @@ PYBIND11_MODULE(libbbf, m) {
2626
py::class_<BBFReader>(m, "BBFReader")
2727
.def(py::init<const std::string &>())
2828
.def_readonly("is_valid", &BBFReader::isValid)
29+
.def_readonly("footer", &BBFReader::footer) // Optional: expose footer struct directly if bound
2930
.def("get_page_count", [](BBFReader& r) { return r.footer.pageCount; })
3031
.def("get_asset_count", [](BBFReader& r) { return r.footer.assetCount; })
3132

32-
// FIX: Manually convert the C++ struct to a Python dictionary
33+
.def("verify", &BBFReader::verify,
34+
py::call_guard<py::gil_scoped_release>(), // IMPORTANT: Release GIL during long hashing
35+
"Verify integrity of index and assets. Multithreaded.")
36+
3337
.def("get_sections", [](BBFReader& r) {
38+
// Optimizing the conversion loop
3439
py::list result;
35-
auto sections = r.getSections();
40+
const auto sections = r.getSections();
3641
for (const auto& s : sections) {
3742
py::dict d;
38-
d["title"] = s.title;
43+
d["title"] = s.title; // Copies into a new Python str (s is a const reference, so nothing is moved)
3944
d["startPage"] = s.startPage;
4045
d["parent"] = s.parent;
4146
result.append(d);
4247
}
4348
return result;
44-
}, "Returns a list of dictionaries [{'title': str, 'startPage': int, 'parent': int}]")
49+
}, "Returns sections as [{'title': str, 'startPage': int, 'parent': int}]")
4550

4651
.def("get_metadata", &BBFReader::getMetadata,
4752
"Returns a list of (Key, Value) tuples.")
53+
4854
.def("get_page_data", [](BBFReader& r, uint32_t idx) {
49-
std::string s = r.getPageBytes(idx);
50-
return py::bytes(s);
51-
}, "Returns the raw bytes of the page asset.")
52-
.def("get_page_info", &BBFReader::getPageInfo,
53-
"Returns dict with keys: length, offset, hash, type.")
54-
.def("verify", &BBFReader::verify, py::call_guard<py::gil_scoped_release>(),
55-
"Performs full XXH3 integrity check on directory and all assets.");
55+
auto raw = r.getPageRaw(idx);
56+
if (!raw.first) return py::bytes("");
57+
// 1-Copy: Copies from mmap -> Python Bytes Object
58+
return py::bytes(raw.first, raw.second);
59+
}, "Returns the raw bytes of the page asset (1-Copy).")
60+
61+
.def("get_page_view", [](BBFReader& r, uint32_t idx) {
62+
auto raw = r.getPageRaw(idx);
63+
if (!raw.first) return py::memoryview(py::bytes(""));
64+
65+
// 0-Copy: Direct view into mmap
66+
// Warning: a memoryview that outlives its BBFReader would point into unmapped memory and crash Python.
67+
// To fix this lifetime issue, the py::keep_alive<0, 1> policy below keeps the reader alive while the view exists.
68+
return py::memoryview::from_memory(
69+
const_cast<char*>(raw.first),
70+
raw.second,
71+
true // read-only
72+
);
73+
}, py::keep_alive<0, 1>(), // Keep BBFReader (1) alive while memoryview (0) exists
74+
"Returns a zero-copy memoryview of the asset. Fastest method.");
5675
}

0 commit comments

Comments
 (0)