MultiOnDiskHashTable.h
11.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
//===- MultiOnDiskHashTable.h - Merged set of hash tables -------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides a hash table data structure suitable for incremental and
// distributed storage across a set of files.
//
// Multiple hash tables from different files are implicitly merged to improve
// performance, and on reload the merged table will override those from other
// files.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_CLANG_LIB_SERIALIZATION_MULTIONDISKHASHTABLE_H
#define LLVM_CLANG_LIB_SERIALIZATION_MULTIONDISKHASHTABLE_H
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/PointerUnion.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/TinyPtrVector.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/EndianStream.h"
#include "llvm/Support/OnDiskHashTable.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cstdint>
#include <vector>
namespace clang {
namespace serialization {
/// A collection of on-disk hash tables, merged when relevant for performance.
template<typename Info> class MultiOnDiskHashTable {
public:
/// A handle to a file, used when overriding tables.
using file_type = typename Info::file_type;
/// A pointer to an on-disk representation of the hash table.
using storage_type = const unsigned char *;
using external_key_type = typename Info::external_key_type;
using internal_key_type = typename Info::internal_key_type;
using data_type = typename Info::data_type;
using data_type_builder = typename Info::data_type_builder;
using hash_value_type = unsigned;
private:
/// The generator is permitted to read our merged table.
template<typename ReaderInfo, typename WriterInfo>
friend class MultiOnDiskHashTableGenerator;
/// A hash table stored on disk.
struct OnDiskTable {
using HashTable = llvm::OnDiskIterableChainedHashTable<Info>;
file_type File;
HashTable Table;
OnDiskTable(file_type File, unsigned NumBuckets, unsigned NumEntries,
storage_type Buckets, storage_type Payload, storage_type Base,
const Info &InfoObj)
: File(File),
Table(NumBuckets, NumEntries, Buckets, Payload, Base, InfoObj) {}
};
struct MergedTable {
std::vector<file_type> Files;
llvm::DenseMap<internal_key_type, data_type> Data;
};
using Table = llvm::PointerUnion<OnDiskTable *, MergedTable *>;
using TableVector = llvm::TinyPtrVector<void *>;
/// The current set of on-disk and merged tables.
/// We manually store the opaque value of the Table because TinyPtrVector
/// can't cope with holding a PointerUnion directly.
/// There can be at most one MergedTable in this vector, and if present,
/// it is the first table.
TableVector Tables;
/// Files corresponding to overridden tables that we've not yet
/// discarded.
llvm::TinyPtrVector<file_type> PendingOverrides;
struct AsOnDiskTable {
using result_type = OnDiskTable *;
result_type operator()(void *P) const {
return Table::getFromOpaqueValue(P).template get<OnDiskTable *>();
}
};
using table_iterator =
llvm::mapped_iterator<TableVector::iterator, AsOnDiskTable>;
using table_range = llvm::iterator_range<table_iterator>;
/// The current set of on-disk tables.
table_range tables() {
auto Begin = Tables.begin(), End = Tables.end();
if (getMergedTable())
++Begin;
return llvm::make_range(llvm::map_iterator(Begin, AsOnDiskTable()),
llvm::map_iterator(End, AsOnDiskTable()));
}
MergedTable *getMergedTable() const {
// If we already have a merged table, it's the first one.
return Tables.empty() ? nullptr : Table::getFromOpaqueValue(*Tables.begin())
.template dyn_cast<MergedTable*>();
}
/// Delete all our current on-disk tables.
void clear() {
for (auto *T : tables())
delete T;
if (auto *M = getMergedTable())
delete M;
Tables.clear();
}
void removeOverriddenTables() {
llvm::DenseSet<file_type> Files;
Files.insert(PendingOverrides.begin(), PendingOverrides.end());
// Explicitly capture Files to work around an MSVC 2015 rejects-valid bug.
auto ShouldRemove = [&Files](void *T) -> bool {
auto *ODT = Table::getFromOpaqueValue(T).template get<OnDiskTable *>();
bool Remove = Files.count(ODT->File);
if (Remove)
delete ODT;
return Remove;
};
Tables.erase(std::remove_if(tables().begin().getCurrent(), Tables.end(),
ShouldRemove),
Tables.end());
PendingOverrides.clear();
}
void condense() {
MergedTable *Merged = getMergedTable();
if (!Merged)
Merged = new MergedTable;
// Read in all the tables and merge them together.
// FIXME: Be smarter about which tables we merge.
for (auto *ODT : tables()) {
auto &HT = ODT->Table;
Info &InfoObj = HT.getInfoObj();
for (auto I = HT.data_begin(), E = HT.data_end(); I != E; ++I) {
auto *LocalPtr = I.getItem();
// FIXME: Don't rely on the OnDiskHashTable format here.
auto L = InfoObj.ReadKeyDataLength(LocalPtr);
const internal_key_type &Key = InfoObj.ReadKey(LocalPtr, L.first);
data_type_builder ValueBuilder(Merged->Data[Key]);
InfoObj.ReadDataInto(Key, LocalPtr + L.first, L.second,
ValueBuilder);
}
Merged->Files.push_back(ODT->File);
delete ODT;
}
Tables.clear();
Tables.push_back(Table(Merged).getOpaqueValue());
}
public:
MultiOnDiskHashTable() = default;
MultiOnDiskHashTable(MultiOnDiskHashTable &&O)
: Tables(std::move(O.Tables)),
PendingOverrides(std::move(O.PendingOverrides)) {
O.Tables.clear();
}
MultiOnDiskHashTable &operator=(MultiOnDiskHashTable &&O) {
if (&O == this)
return *this;
clear();
Tables = std::move(O.Tables);
O.Tables.clear();
PendingOverrides = std::move(O.PendingOverrides);
return *this;
}
~MultiOnDiskHashTable() { clear(); }
/// Add the table \p Data loaded from file \p File.
void add(file_type File, storage_type Data, Info InfoObj = Info()) {
using namespace llvm::support;
storage_type Ptr = Data;
uint32_t BucketOffset = endian::readNext<uint32_t, little, unaligned>(Ptr);
// Read the list of overridden files.
uint32_t NumFiles = endian::readNext<uint32_t, little, unaligned>(Ptr);
// FIXME: Add a reserve() to TinyPtrVector so that we don't need to make
// an additional copy.
llvm::SmallVector<file_type, 16> OverriddenFiles;
OverriddenFiles.reserve(NumFiles);
for (/**/; NumFiles != 0; --NumFiles)
OverriddenFiles.push_back(InfoObj.ReadFileRef(Ptr));
PendingOverrides.insert(PendingOverrides.end(), OverriddenFiles.begin(),
OverriddenFiles.end());
// Read the OnDiskChainedHashTable header.
storage_type Buckets = Data + BucketOffset;
auto NumBucketsAndEntries =
OnDiskTable::HashTable::readNumBucketsAndEntries(Buckets);
// Register the table.
Table NewTable = new OnDiskTable(File, NumBucketsAndEntries.first,
NumBucketsAndEntries.second,
Buckets, Ptr, Data, std::move(InfoObj));
Tables.push_back(NewTable.getOpaqueValue());
}
/// Find and read the lookup results for \p EKey.
data_type find(const external_key_type &EKey) {
data_type Result;
if (!PendingOverrides.empty())
removeOverriddenTables();
if (Tables.size() > static_cast<unsigned>(Info::MaxTables))
condense();
internal_key_type Key = Info::GetInternalKey(EKey);
auto KeyHash = Info::ComputeHash(Key);
if (MergedTable *M = getMergedTable()) {
auto It = M->Data.find(Key);
if (It != M->Data.end())
Result = It->second;
}
data_type_builder ResultBuilder(Result);
for (auto *ODT : tables()) {
auto &HT = ODT->Table;
auto It = HT.find_hashed(Key, KeyHash);
if (It != HT.end())
HT.getInfoObj().ReadDataInto(Key, It.getDataPtr(), It.getDataLen(),
ResultBuilder);
}
return Result;
}
/// Read all the lookup results into a single value. This only makes
/// sense if merging values across keys is meaningful.
data_type findAll() {
data_type Result;
data_type_builder ResultBuilder(Result);
if (!PendingOverrides.empty())
removeOverriddenTables();
if (MergedTable *M = getMergedTable()) {
for (auto &KV : M->Data)
Info::MergeDataInto(KV.second, ResultBuilder);
}
for (auto *ODT : tables()) {
auto &HT = ODT->Table;
Info &InfoObj = HT.getInfoObj();
for (auto I = HT.data_begin(), E = HT.data_end(); I != E; ++I) {
auto *LocalPtr = I.getItem();
// FIXME: Don't rely on the OnDiskHashTable format here.
auto L = InfoObj.ReadKeyDataLength(LocalPtr);
const internal_key_type &Key = InfoObj.ReadKey(LocalPtr, L.first);
InfoObj.ReadDataInto(Key, LocalPtr + L.first, L.second, ResultBuilder);
}
}
return Result;
}
};
/// Writer for the on-disk hash table.
template<typename ReaderInfo, typename WriterInfo>
class MultiOnDiskHashTableGenerator {
using BaseTable = MultiOnDiskHashTable<ReaderInfo>;
using Generator = llvm::OnDiskChainedHashTableGenerator<WriterInfo>;
Generator Gen;
public:
MultiOnDiskHashTableGenerator() : Gen() {}
void insert(typename WriterInfo::key_type_ref Key,
typename WriterInfo::data_type_ref Data, WriterInfo &Info) {
Gen.insert(Key, Data, Info);
}
void emit(llvm::SmallVectorImpl<char> &Out, WriterInfo &Info,
const BaseTable *Base) {
using namespace llvm::support;
llvm::raw_svector_ostream OutStream(Out);
// Write our header information.
{
endian::Writer Writer(OutStream, little);
// Reserve four bytes for the bucket offset.
Writer.write<uint32_t>(0);
if (auto *Merged = Base ? Base->getMergedTable() : nullptr) {
// Write list of overridden files.
Writer.write<uint32_t>(Merged->Files.size());
for (const auto &F : Merged->Files)
Info.EmitFileRef(OutStream, F);
// Add all merged entries from Base to the generator.
for (auto &KV : Merged->Data) {
if (!Gen.contains(KV.first, Info))
Gen.insert(KV.first, Info.ImportData(KV.second), Info);
}
} else {
Writer.write<uint32_t>(0);
}
}
// Write the table itself.
uint32_t BucketOffset = Gen.Emit(OutStream, Info);
// Replace the first four bytes with the bucket offset.
endian::write32le(Out.data(), BucketOffset);
}
};
} // namespace serialization
} // namespace clang
#endif // LLVM_CLANG_LIB_SERIALIZATION_MULTIONDISKHASHTABLE_H