22#include " libbbf.h"
33#define XXH_INLINE_ALL
44#include " xxhash.h"
5+
56#include < string>
7+ #include < string_view> // C++17: Essential for zero-copy parsing
68#include < vector>
79#include < map>
810#include < cstring>
911#include < future>
1012#include < thread>
11- #include < mutex >
13+ #include < algorithm >
1214
1315// Platform specific includes for MMAP
1416#ifdef _WIN32
17+ #define WIN32_LEAN_AND_MEAN
1518#include < windows.h>
1619#else
1720#include < sys/mman.h>
2023#include < unistd.h>
2124#endif
2225
23- // Simple Memory Mapping Wrapper
2426struct MemoryMappedFile {
2527 void * data = nullptr ;
2628 size_t size = 0 ;
@@ -38,33 +40,53 @@ struct MemoryMappedFile {
3840 LARGE_INTEGER li;
3941 GetFileSizeEx (hFile, &li);
4042 size = (size_t )li.QuadPart ;
43+ if (size == 0 ) { CloseHandle (hFile); return false ; }
4144 hMap = CreateFileMapping (hFile, NULL , PAGE_READONLY, 0 , 0 , NULL );
42- if (!hMap) return false ;
45+ if (!hMap) { CloseHandle (hFile); return false ; }
4346 data = MapViewOfFile (hMap, FILE_MAP_READ, 0 , 0 , 0 );
4447#else
4548 fd = open (path.c_str (), O_RDONLY);
4649 if (fd < 0 ) return false ;
4750 struct stat st;
48- fstat (fd, &st);
51+ if ( fstat (fd, &st) < 0 ) { close (fd); return false ; }
4952 size = st.st_size ;
53+ if (size == 0 ) { close (fd); return false ; }
5054 data = mmap (NULL , size, PROT_READ, MAP_PRIVATE, fd, 0 );
55+ if (data == MAP_FAILED) { data = nullptr ; close (fd); return false ; }
5156#endif
5257 return data != nullptr ;
5358 }
5459
55- ~MemoryMappedFile () {
    // Releases the view and all associated OS handles, then resets members
    // so the object is safe to destroy (or re-map) afterwards.
    // Idempotent: calling on an unmapped object is a no-op.
    // NOTE(review): the struct does not delete copy/move; copying it would
    // double-release the handles — consider deleting copy ops. TODO confirm.
    void unmap() {
        if (!data) return;
#ifdef _WIN32
        // Order matters: unmap the view before closing the mapping handle,
        // then close the underlying file handle.
        UnmapViewOfFile(data);
        if (hMap) CloseHandle(hMap);
        if (hFile != INVALID_HANDLE_VALUE) CloseHandle(hFile);
        hMap = NULL; hFile = INVALID_HANDLE_VALUE;
#else
        // POSIX: munmap needs the original length; close the fd afterwards.
        munmap(data, size);
        if (fd >= 0) close(fd);
        fd = -1;
#endif
        data = nullptr;
        size = 0;
    }

    // RAII: mapping is released automatically when the wrapper goes out of scope.
    ~MemoryMappedFile() { unmap(); }
6577};
6678
6779class BBFReader {
80+ private:
81+ // Cached pointers to avoid recalculating offsets repeatedly
82+ const char * data_ptr = nullptr ;
83+ const BBFSection* sections_ = nullptr ;
84+ const BBFMetadata* meta_ = nullptr ;
85+ const BBFPageEntry* pages_ = nullptr ;
86+ const BBFAssetEntry* assets_ = nullptr ;
87+ const char * stringPool_ = nullptr ;
88+ size_t stringPoolSize_ = 0 ;
89+
6890public:
6991 BBFFooter footer;
7092 BBFHeader header;
@@ -73,22 +95,39 @@ class BBFReader {
7395
7496 BBFReader (const std::string& path) {
7597 if (!mmap.map (path)) return ;
98+ data_ptr = static_cast <const char *>(mmap.data );
99+
100+ // Basic Size Check
76101 if (mmap.size < sizeof (BBFHeader) + sizeof (BBFFooter)) return ;
77102
78- std::memcpy (&header, mmap.data , sizeof (BBFHeader));
103+ // Read Header
104+ std::memcpy (&header, data_ptr, sizeof (BBFHeader));
79105 if (std::memcmp (header.magic , " BBF1" , 4 ) != 0 ) return ;
80106
81- std::memcpy (&footer, (uint8_t *)mmap.data + mmap.size - sizeof (BBFFooter), sizeof (BBFFooter));
107+ // Read Footer
108+ std::memcpy (&footer, data_ptr + mmap.size - sizeof (BBFFooter), sizeof (BBFFooter));
82109 if (std::memcmp (footer.magic , " BBF1" , 4 ) != 0 ) return ;
83110
111+ // Cache Table Pointers
112+ // Note: In production, you should add bounds checks here to ensure offsets are within mmap.size
113+ sections_ = reinterpret_cast <const BBFSection*>(data_ptr + footer.sectionTableOffset );
114+ meta_ = reinterpret_cast <const BBFMetadata*>(data_ptr + footer.metaTableOffset );
115+ pages_ = reinterpret_cast <const BBFPageEntry*>(data_ptr + footer.pageTableOffset );
116+ assets_ = reinterpret_cast <const BBFAssetEntry*>(data_ptr + footer.assetTableOffset );
117+
118+ stringPool_ = data_ptr + footer.stringPoolOffset ;
119+ stringPoolSize_ = footer.assetTableOffset - footer.stringPoolOffset ;
120+
84121 isValid = true ;
85122 }
86123
87- std::string getString (uint32_t offset) const {
88- const char * poolStart = (const char *)mmap.data + footer.stringPoolOffset ;
89- size_t poolSize = footer.assetTableOffset - footer.stringPoolOffset ;
90- if (offset >= poolSize) return " " ;
91- return std::string (poolStart + offset);
124+ // Optimized: Returns string_view (no allocation)
125+ // Helper to allow returning std::string for legacy binding support if needed,
126+ // but internal logic should prefer views.
127+ std::string_view getStringView (uint32_t offset) const {
128+ if (offset >= stringPoolSize_) return {};
129+ // Requires strings in file to be null-terminated.
130+ return std::string_view (stringPool_ + offset);
92131 }
93132
94133 struct PySection {
@@ -97,86 +136,105 @@ class BBFReader {
97136 uint32_t parent;
98137 };
99138
100- std::vector<PySection> getSections () {
139+ std::vector<PySection> getSections () const {
101140 std::vector<PySection> result;
102141 if (!isValid) return result;
103- const BBFSection* secs = reinterpret_cast <const BBFSection*>((const uint8_t *)mmap.data + footer.sectionTableOffset );
142+
143+ result.reserve (footer.sectionCount ); // Optimization: Reserve memory
104144 for (uint32_t i = 0 ; i < footer.sectionCount ; i++) {
105- result.push_back ({getString (secs[i].sectionTitleOffset ), secs[i].sectionStartIndex , secs[i].parentSectionIndex });
145+ // Explicit conversion to std::string here is okay as we are handing off to Python
146+ result.push_back ({
147+ std::string (getStringView (sections_[i].sectionTitleOffset )),
148+ sections_[i].sectionStartIndex ,
149+ sections_[i].parentSectionIndex
150+ });
106151 }
107152 return result;
108153 }
109154
110- std::vector<std::pair<std::string, std::string>> getMetadata () {
155+ std::vector<std::pair<std::string, std::string>> getMetadata () const {
111156 std::vector<std::pair<std::string, std::string>> result;
112157 if (!isValid) return result;
113- const BBFMetadata* meta = reinterpret_cast <const BBFMetadata*>((const uint8_t *)mmap.data + footer.metaTableOffset );
158+
159+ result.reserve (footer.keyCount );
114160 for (uint32_t i = 0 ; i < footer.keyCount ; i++) {
115- result.push_back ({getString (meta[i].keyOffset ), getString (meta[i].valOffset )});
161+ result.emplace_back (
162+ getStringView (meta_[i].keyOffset ),
163+ getStringView (meta_[i].valOffset )
164+ );
116165 }
117166 return result;
118167 }
119168
120- // Helper to get raw bytes for Python
121- std::string getPageBytes (uint32_t pageIndex) {
122- if (!isValid || pageIndex >= footer.pageCount ) return " " ;
169+ // Zero-copy accessor for PyBind
170+ // Returns {pointer, size}
171+ std::pair<const char *, size_t > getPageRaw (uint32_t pageIndex) const {
172+ if (!isValid || pageIndex >= footer.pageCount ) return {nullptr , 0 };
123173
124- const BBFPageEntry* pages = reinterpret_cast <const BBFPageEntry*>((const uint8_t *)mmap.data + footer.pageTableOffset );
125- const BBFAssetEntry* assets = reinterpret_cast <const BBFAssetEntry*>((const uint8_t *)mmap.data + footer.assetTableOffset );
126-
127- const auto & asset = assets[pages[pageIndex].assetIndex ];
128- return std::string ((const char *)mmap.data + asset.offset , asset.length );
174+ // Indirect addressing: Page -> Asset -> Offset/Length
175+ const auto & asset = assets_[pages_[pageIndex].assetIndex ];
176+ return { data_ptr + asset.offset , asset.length };
129177 }
130178
131- std::map<std::string, uint64_t > getPageInfo (uint32_t pageIndex) {
132- std::map<std::string, uint64_t > info;
133- if (!isValid || pageIndex >= footer.pageCount ) return info;
179+ // Legacy support (copies data)
180+ std::string getPageBytes (uint32_t pageIndex) const {
181+ auto raw = getPageRaw (pageIndex);
182+ if (!raw.first ) return " " ;
183+ return std::string (raw.first , raw.second );
184+ }
134185
135- const BBFPageEntry* pages = reinterpret_cast <const BBFPageEntry*>((const uint8_t *)mmap.data + footer.pageTableOffset );
136- const BBFAssetEntry* assets = reinterpret_cast <const BBFAssetEntry*>((const uint8_t *)mmap.data + footer.assetTableOffset );
137- const auto & asset = assets[pages[pageIndex].assetIndex ];
186+ std::map<std::string, uint64_t > getPageInfo (uint32_t pageIndex) const {
187+ if (!isValid || pageIndex >= footer.pageCount ) return {};
138188
139- info[" length" ] = asset.length ;
140- info[" offset" ] = asset.offset ;
141- info[" hash" ] = asset.xxh3Hash ;
142- info[" type" ] = asset.type ;
143- return info;
189+ const auto & asset = assets_[pages_[pageIndex].assetIndex ];
190+ return {
191+ {" length" , asset.length },
192+ {" offset" , asset.offset },
193+ {" hash" , asset.xxh3Hash },
194+ {" type" , asset.type }
195+ };
144196 }
145197
146- // Implements verifyAssetsParallel from bbfenc.cpp
147- bool verify () {
198+ bool verify () const {
148199 if (!isValid) return false ;
149200
150201 // 1. Directory Hash Check
151202 size_t metaStart = footer.stringPoolOffset ;
152203 size_t metaSize = mmap.size - sizeof (BBFFooter) - metaStart;
153- uint64_t calcIndexHash = XXH3_64bits ((const uint8_t *)mmap.data + metaStart, metaSize);
154-
155- if (calcIndexHash != footer.indexHash ) return false ;
204+ if (XXH3_64bits (data_ptr + metaStart, metaSize) != footer.indexHash ) return false ;
156205
157206 // 2. Asset Integrity Check
158- const BBFAssetEntry* assets = reinterpret_cast <const BBFAssetEntry*>((const uint8_t *)mmap.data + footer.assetTableOffset );
159207 size_t count = footer.assetCount ;
208+ const auto * local_assets = assets_; // Copy pointer for lambda capture
209+ const auto * local_data = data_ptr;
160210
161- auto verifyRange = [& ](size_t start, size_t end) -> bool {
211+ auto verifyRange = [local_assets, local_data ](size_t start, size_t end) -> bool {
162212 for (size_t i = start; i < end; ++i) {
163- const auto & a = assets[i];
164- uint64_t h = XXH3_64bits ((const uint8_t *)mmap.data + a.offset , a.length );
165- if (h != a.xxh3Hash ) return false ;
213+ const auto & a = local_assets[i];
214+ if (XXH3_64bits ((const uint8_t *)local_data + a.offset , a.length ) != a.xxh3Hash ) {
215+ return false ;
216+ }
166217 }
167218 return true ;
168219 };
169220
170- // Determine thread count
221+ // Optimization: Don't spawn threads for small files
171222 size_t numThreads = std::thread::hardware_concurrency ();
172223 if (numThreads == 0 ) numThreads = 1 ;
173-
224+
225+ // Heuristic: If assets < 128, threading overhead > hashing gain
226+ if (count < 128 || numThreads == 1 ) {
227+ return verifyRange (0 , count);
228+ }
229+
174230 size_t chunkSize = count / numThreads;
175231 std::vector<std::future<bool >> futures;
232+ futures.reserve (numThreads);
176233
177234 for (size_t i = 0 ; i < numThreads; ++i) {
178235 size_t start = i * chunkSize;
179236 size_t end = (i == numThreads - 1 ) ? count : start + chunkSize;
237+ // Launch async
180238 futures.push_back (std::async (std::launch::async, verifyRange, start, end));
181239 }
182240
0 commit comments