Skip to content

Commit e701a73

Browse files
committed
DBString: Dedup strings
1 parent 980f88d commit e701a73

2 files changed

Lines changed: 136 additions & 53 deletions

File tree

src/dbstring.cpp

Lines changed: 113 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1,63 +1,141 @@
11
#include "lcf/dbstring.h"
2+
#include <unordered_map>
3+
#include <memory>
4+
#include <iostream>
25

36
namespace lcf {
47

58
constexpr DBString::size_type DBString::npos;
9+
alignas(DBString::size_type) constexpr char DBString::_empty_str[sizeof(size_type)];
610

7-
static constexpr size_t AllocSize(size_t len) {
8-
return sizeof(DBString::size_type) + len + 1;
9-
}
11+
struct DBStringData {
12+
using size_type = DBString::size_type;
1013

11-
static char* Alloc(size_t len) {
12-
return reinterpret_cast<char*>(::operator new(AllocSize(len)));
13-
}
14+
size_type refcnt;
15+
size_type size;
1416

15-
static char* Dup(const char* other, size_t size) {
16-
if (size > 0) {
17-
auto* s = Alloc(size);
18-
std::memcpy(s, other, AllocSize(size));
19-
return s;
17+
const char* data() const {
18+
return reinterpret_cast<const char*>(this + 1);
2019
}
21-
return nullptr;
22-
}
23-
24-
static void Free(void* p) {
25-
::operator delete(p);
26-
}
27-
28-
DBString::DBString(const char* s, size_t len)
29-
{
30-
if (len > 0) {
31-
auto* ptr = Alloc(len);
32-
_storage = ptr;
3320

34-
*reinterpret_cast<size_type*>(ptr) = len;
35-
ptr += sizeof(size_type);
21+
char* data() {
22+
return reinterpret_cast<char*>(this + 1);
23+
}
3624

37-
std::memcpy(ptr, s, len);
38-
ptr += len;
25+
static size_type alloc_size(StringView str) {
26+
return sizeof(DBStringData) + str.size() + 1;
27+
}
3928

40-
*ptr = '\0';
29+
static DBStringData* from_data(char* s) {
30+
return reinterpret_cast<DBStringData*>(s) - 1;
4131
}
32+
};
33+
34+
// Deleter type used by DBStringDataPtr (std::unique_ptr<DBStringData, ...>).
// Only declared here; operator() is defined further down in this file, after
// DBStringAllocator, where it forwards to DBStringAllocator::Free.
struct DBStringDataDeleter {
35+
void operator()(DBStringData* p);
36+
};
37+
38+
using DBStringDataPtr = std::unique_ptr<DBStringData,DBStringDataDeleter>;
39+
40+
// Deduplicating store for DBString payloads.
//
// Each distinct non-empty string is held in one DBStringData allocation that
// carries a reference count. _map is keyed by a StringView over the
// allocation's own characters and owns the allocation through DBStringDataPtr.
// The only way to obtain an allocator is instance(); the constructor is private.
class DBStringAllocator {
41+
public:
42+
using size_type = DBString::size_type;
43+
44+
// Creates a new DBStringData holding a copy of `str`.
// Storage of DBStringData::alloc_size(str) bytes is obtained with
// ::operator new, the header is placement-constructed into it, refcnt is set
// to 1, size to str.size(), and the characters are copied followed by '\0'.
static DBStringDataPtr Alloc(StringView str) {
45+
auto* raw = ::operator new(DBStringData::alloc_size(str));
46+
auto data = DBStringDataPtr(new (raw) DBStringData());
47+
data->refcnt = 1;
48+
data->size = str.size();
49+
std::memcpy(data->data(), str.data(), data->size);
50+
data->data()[data->size] = '\0';
51+
52+
return data;
53+
}
54+
55+
// Counterpart of Alloc: runs the DBStringData destructor explicitly and then
// returns the raw storage with ::operator delete.
static void Free(DBStringData* data) {
56+
data->~DBStringData();
57+
::operator delete(data);
58+
}
59+
60+
// Returns a pointer to the character data for `str`, taking one reference.
// - Empty input: returns DBString::empty_str() without touching _map.
// - Already in _map: increments the existing entry's refcnt.
// - Otherwise: allocates a new entry (refcnt 1) and inserts it.
const char* Acquire(StringView str) {
61+
if (str.empty()) {
62+
return DBString::empty_str();
63+
}
64+
65+
auto iter = _map.find(str);
66+
if (iter != _map.end()) {
67+
iter->second->refcnt += 1;
68+
} else {
69+
auto ptr = Alloc(str);
// The map key must view the newly allocated characters, not the caller's
// buffer, which is why the key is rebuilt from ptr before inserting.
70+
auto sv = StringView(ptr->data(), ptr->size);
71+
// FIXME: Double hash lookup because the key changes..
72+
iter = _map.insert({ sv, std::move(ptr) }).first;
73+
}
74+
return iter->second->data();
75+
}
76+
77+
// Takes an additional reference on an existing character pointer and returns
// it unchanged. No map lookup is done: DBStringData::from_data steps back
// from `s` to the header in front of it and the refcnt there is incremented.
// The shared empty string is passed through without any refcount change.
// `s` is expected to be empty_str() or a pointer that has a DBStringData
// header in front of it (i.e. one produced by Acquire).
const char* Dup(const char* s) {
78+
if (s != DBString::empty_str()) {
79+
auto* data = DBStringData::from_data(const_cast<char*>(s));
80+
data->refcnt += 1;
81+
}
82+
return s;
83+
}
84+
85+
// Drops one reference to the entry whose contents equal `str`.
// The entry is found by a map lookup on the string contents; when its refcnt
// reaches 0 it is erased from _map, which destroys the owning DBStringDataPtr.
// A string that is not in the map is silently ignored.
void Release(StringView str) {
86+
if (str.empty()) {
87+
// This is needed, due to global DBStrings which are initialized to null.
88+
// They may be destroyed *after* DBStringAllocator is destroyed!
89+
// FIXME: To fix this, use a hash table with constexpr default constructor.
90+
return;
91+
}
92+
auto iter = _map.find(str);
93+
if (iter != _map.end()) {
94+
auto& data = iter->second;
95+
data->refcnt -= 1;
// NOTE(review): this check runs after the decrement. If size_type is an
// unsigned type (npos is defined as size_type(-1), which suggests it is),
// the condition is always true and an underflow from 0 goes undetected.
// Confirm, and consider asserting refcnt > 0 before decrementing.
96+
assert(data->refcnt >= 0);
97+
if (data->refcnt == 0) {
98+
_map.erase(iter);
99+
}
100+
}
101+
}
102+
103+
// Returns the single allocator, a function-local static created on first call.
static DBStringAllocator& instance() {
104+
static DBStringAllocator alloc;
105+
return alloc;
106+
}
107+
private:
108+
DBStringAllocator() = default;
109+
private:
// Key: view over the characters stored in the mapped DBStringData.
// Value: owning pointer to that DBStringData.
110+
std::unordered_map<StringView,DBStringDataPtr> _map;
111+
};
112+
113+
// Deleter call operator for DBStringDataPtr: hands the pointer to
// DBStringAllocator::Free. Defined here, after DBStringAllocator, because it
// needs the complete class.
void DBStringDataDeleter::operator()(DBStringData* p) {
114+
DBStringAllocator::Free(p);
115+
}
116+
117+
// Constructs a DBString from a string view by acquiring storage for it from
// the shared DBStringAllocator instance. Acquire returns the shared empty
// string for empty input, or a reference-counted entry otherwise.
DBString::DBString(StringView s)
118+
: _storage(DBStringAllocator::instance().Acquire(s))
119+
{
42120
}
43121

44122
// Copy constructor.
// Removed version: duplicated the other string's buffer with the file-local Dup.
// Added version: calls DBStringAllocator::Dup, which adds a reference to o's
// storage and returns the same pointer, so both objects share one buffer.
DBString::DBString(const DBString& o)
45-
: _storage(Dup(o._storage, o.size()))
46-
{ }
123+
: _storage(DBStringAllocator::instance().Dup(o._storage))
124+
{
125+
}
47126

48127
// Copy assignment. Self-assignment is a no-op. Otherwise the current storage
// is released via _reset() and a reference to o's storage is taken.
DBString& DBString::operator=(const DBString& o) {
49128
if (this != &o) {
129+
// What if the strings are the same? Could we skip the double lookup?
50130
_reset();
51-
_storage = Dup(o._storage, o.size());
131+
_storage = DBStringAllocator::instance().Dup(o._storage);
52132
}
53133
return *this;
54134
}
55135

56-
57-
58136
// Gives up this object's reference to its storage.
// Removed version: freed the buffer and set _storage to nullptr.
// Added version: asserts _storage is non-null and passes *this to
// DBStringAllocator::Release. It does not modify _storage afterwards; the
// copy-assignment operator in this file assigns a new value right after
// calling it.
// NOTE(review): Release takes a StringView, so this relies on a
// DBString -> StringView conversion that is not visible in this diff — confirm.
void DBString::_reset() noexcept {
59-
Free(_storage);
60-
_storage = nullptr;
137+
assert(_storage != nullptr);
138+
DBStringAllocator::instance().Release(*this);
61139
}
62140

63141
} // namespace lcf

src/lcf/dbstring.h

Lines changed: 23 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020

2121
#include "lcf/string_view.h"
2222

23-
2423
namespace lcf {
2524

2625
// A read-only string class optimized for database storage.
@@ -36,22 +35,27 @@ class DBString {
3635

3736
static constexpr size_type npos = size_type(-1);
3837

39-
constexpr DBString() = default;
40-
explicit DBString(const std::string& s) : DBString(s.c_str(), s.size()) {}
41-
explicit DBString(StringView s) : DBString(s.data(), s.size()) {}
38+
// FIXME: When the allocator constructor is constexpr, this can be constexpr as well
39+
DBString() : DBString(StringView()) {}
40+
explicit DBString(StringView s);
41+
explicit DBString(const std::string& s) : DBString(StringView(s)) {}
4242

4343
// Explicit construct for general const char*
44-
explicit DBString(const char* s) : DBString(s, std::strlen(s)) {}
44+
explicit DBString(const char* s) : DBString(StringView(s)) {}
4545
// Implicit constructor to capture string literals
4646
template <size_t N>
4747
DBString(const char(&literal)[N]) : DBString(StringView(literal)) {}
48-
DBString(const char* s, size_t len);
48+
DBString(const char* s, size_t len) : DBString(StringView(s, len)) {}
4949

50-
DBString(const DBString&);
50+
DBString(const DBString& o);
5151
DBString& operator=(const DBString&);
5252
DBString(DBString&&) noexcept;
5353
DBString& operator=(DBString&&) noexcept;
5454

55+
void swap(DBString& o) noexcept {
56+
std::swap(_storage, o._storage);
57+
}
58+
5559
~DBString() { _reset(); }
5660

5761
explicit operator std::string() const { return std::string(c_str(), size()); }
@@ -60,7 +64,7 @@ class DBString {
6064
char operator[](size_t i) const;
6165
char front() const { return (*this)[0]; }
6266
char back() const { return (*this)[size()-1]; }
63-
const char* data() const;
67+
const char* data() const { return _storage; }
6468
const char* c_str() const { return data(); }
6569

6670
iterator begin() const { return data(); }
@@ -71,10 +75,15 @@ class DBString {
7175

7276
bool empty() const { return size() == 0; }
7377
size_type size() const;
78+
79+
static constexpr const char* empty_str() {
80+
return _empty_str + sizeof(size_type);
81+
}
7482
private:
7583
void _reset() noexcept;
7684
private:
77-
char* _storage = nullptr;
85+
alignas(size_type) static constexpr char _empty_str[sizeof(size_type)] = {};
86+
const char* _storage = empty_str();
7887
};
7988

8089
// This should be used over the conversion operator, so we can track all dbstr -> str instances
@@ -116,28 +125,24 @@ template <> struct hash<lcf::DBString> {
116125
namespace lcf {
117126

118127
// Move constructor.
// Removed version: took o's pointer and left o holding nullptr.
// Added version: _storage starts from its default member initializer
// (empty_str()), then is swapped with o._storage, so this object takes o's
// storage and o is left holding the shared empty string. No reference counts
// change.
inline DBString::DBString(DBString&& o) noexcept
119-
: _storage(o._storage)
120128
{
121-
o._storage = nullptr;
129+
std::swap(_storage, o._storage);
122130
}
123131

124132
// Move assignment.
// NOTE(review): the added first statement `return operator=(o);` passes `o`
// by name. A named rvalue reference is an lvalue, so this resolves to the
// copy-assignment operator and returns immediately. Every statement after it
// in this function is therefore unreachable, and a "move" currently behaves
// as a copy (o keeps its reference). If that is a temporary measure, the
// remaining body is dead code; if not, the early return should be removed.
inline DBString& DBString::operator=(DBString&& o) noexcept {
133+
return operator=(o);
125134
// Unreachable while the early return above is present: release the current
// storage if non-empty, take o's pointer, and leave o as the empty string.
if (this != &o) {
126-
if (_storage) {
135+
if (!empty()) {
127136
_reset();
128137
}
129138
_storage = o._storage;
130-
o._storage = nullptr;
139+
o._storage = empty_str();
131140
}
132141
return *this;
133142
}
134143

135-
inline const char* DBString::data() const {
136-
return _storage ? _storage + sizeof(size_type) : nullptr;
137-
}
138-
139144
// Returns the stored length.
// Removed version: _storage pointed at the length prefix (or was null, giving 0).
// Added version: _storage points at the characters, and the length is the
// size_type located immediately before them. For heap entries that is
// DBStringData::size, the last member of the header placed in front of the
// characters. For the shared empty string, _storage is
// _empty_str + sizeof(size_type), so the read lands on the zero-initialised
// _empty_str array and yields 0.
// NOTE(review): _empty_str has exactly sizeof(size_type) bytes, so empty_str()
// points one past its end; there is no '\0' byte inside that array for
// c_str()/data() to point at — confirm whether the array needs one more byte.
inline DBString::size_type DBString::size() const {
140-
return _storage ? *reinterpret_cast<const size_type*>(_storage) : 0;
145+
return *(reinterpret_cast<const size_type*>(_storage) - 1);
141146
}
142147

143148
} // namespace lcf

0 commit comments

Comments
 (0)