SyncDependenceAnalysis.cpp
15.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
//===--- SyncDependenceAnalysis.cpp - Compute Control Divergence Effects --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements an algorithm that returns for a divergent branch
// the set of basic blocks whose phi nodes become divergent due to divergent
// control. These are the blocks that are reachable by two disjoint paths from
// the branch or loop exits that have a reaching path that is disjoint from a
// path to the loop latch.
//
// The SyncDependenceAnalysis is used in the DivergenceAnalysis to model
// control-induced divergence in phi nodes.
//
// -- Summary --
// The SyncDependenceAnalysis lazily computes sync dependences [3].
// The analysis evaluates the disjoint path criterion [2] by a reduction
// to SSA construction. The SSA construction algorithm is implemented as
// a simple data-flow analysis [1].
//
// [1] "A Simple, Fast Dominance Algorithm", SPI '01, Cooper, Harvey and Kennedy
// [2] "Efficiently Computing Static Single Assignment Form
// and the Control Dependence Graph", TOPLAS '91,
// Cytron, Ferrante, Rosen, Wegman and Zadeck
// [3] "Improving Performance of OpenCL on CPUs", CC '12, Karrenberg and Hack
// [4] "Divergence Analysis", TOPLAS '13, Sampaio, Souza, Collange and Pereira
//
// -- Sync dependence --
// Sync dependence [4] characterizes the control flow aspect of the
// propagation of branch divergence. For example,
//
// %cond = icmp slt i32 %tid, 10
// br i1 %cond, label %then, label %else
// then:
// br label %merge
// else:
// br label %merge
// merge:
// %a = phi i32 [ 0, %then ], [ 1, %else ]
//
// Suppose %tid holds the thread ID. Although %a is not data dependent on %tid
// because %tid is not on its use-def chains, %a is sync dependent on %tid
// because the branch "br i1 %cond" depends on %tid and affects which value %a
// is assigned to.
//
// -- Reduction to SSA construction --
// There are two disjoint paths from A to X, if a certain variant of SSA
// construction places a phi node in X under the following set-up scheme [2].
//
// This variant of SSA construction ignores incoming undef values.
// That is paths from the entry without a definition do not result in
// phi nodes.
//
// entry
// / \
// A \
// / \ Y
// B C /
// \ / \ /
// D E
// \ /
// F
// Assume that A contains a divergent branch. We are interested
// in the set of all blocks where each block is reachable from A
// via two disjoint paths. This would be the set {D, F} in this
// case.
// To generally reduce this query to SSA construction we introduce
// a virtual variable x and assign to x different values in each
// successor block of A.
// entry
// / \
// A \
// / \ Y
// x = 0 x = 1 /
// \ / \ /
// D E
// \ /
// F
// Our flavor of SSA construction for x will construct the following
// entry
// / \
// A \
// / \ Y
// x0 = 0 x1 = 1 /
// \ / \ /
// x2=phi E
// \ /
// x3=phi
// The blocks D and F contain phi nodes and are thus each reachable
// by two disjoins paths from A.
//
// -- Remarks --
// In case of loop exits we need to check the disjoint path criterion for loops
// [2]. To this end, we check whether the definition of x differs between the
// loop exit and the loop header (_after_ SSA construction).
//
//===----------------------------------------------------------------------===//
#include "llvm/Analysis/SyncDependenceAnalysis.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include <functional>
#include <stack>
#include <unordered_set>
#define DEBUG_TYPE "sync-dependence"
// The SDA algorithm operates on a modified CFG - we modify the edges leaving
// loop headers as follows:
//
// * We remove all edges leaving all loop headers.
// * We add additional edges from the loop headers to their exit blocks.
//
// The modification is virtual, that is whenever we visit a loop header we
// pretend it had different successors.
namespace {
using namespace llvm;
// Custom Post-Order Traveral
//
// We cannot use the vanilla (R)PO computation of LLVM because:
// * We (virtually) modify the CFG.
// * We want a loop-compact block enumeration, that is the numbers assigned by
// the traveral to the blocks of a loop are an interval.
using POCB = std::function<void(const BasicBlock &)>;
using VisitedSet = std::set<const BasicBlock *>;
using BlockStack = std::vector<const BasicBlock *>;
// forward
static void computeLoopPO(const LoopInfo &LI, Loop &Loop, POCB CallBack,
VisitedSet &Finalized);
// for a nested region (top-level loop or nested loop)
static void computeStackPO(BlockStack &Stack, const LoopInfo &LI, Loop *Loop,
POCB CallBack, VisitedSet &Finalized) {
const auto *LoopHeader = Loop ? Loop->getHeader() : nullptr;
while (!Stack.empty()) {
const auto *NextBB = Stack.back();
auto *NestedLoop = LI.getLoopFor(NextBB);
bool IsNestedLoop = NestedLoop != Loop;
// Treat the loop as a node
if (IsNestedLoop) {
SmallVector<BasicBlock *, 3> NestedExits;
NestedLoop->getUniqueExitBlocks(NestedExits);
bool PushedNodes = false;
for (const auto *NestedExitBB : NestedExits) {
if (NestedExitBB == LoopHeader)
continue;
if (Loop && !Loop->contains(NestedExitBB))
continue;
if (Finalized.count(NestedExitBB))
continue;
PushedNodes = true;
Stack.push_back(NestedExitBB);
}
if (!PushedNodes) {
// All loop exits finalized -> finish this node
Stack.pop_back();
computeLoopPO(LI, *NestedLoop, CallBack, Finalized);
}
continue;
}
// DAG-style
bool PushedNodes = false;
for (const auto *SuccBB : successors(NextBB)) {
if (SuccBB == LoopHeader)
continue;
if (Loop && !Loop->contains(SuccBB))
continue;
if (Finalized.count(SuccBB))
continue;
PushedNodes = true;
Stack.push_back(SuccBB);
}
if (!PushedNodes) {
// Never push nodes twice
Stack.pop_back();
if (!Finalized.insert(NextBB).second)
continue;
CallBack(*NextBB);
}
}
}
static void computeTopLevelPO(Function &F, const LoopInfo &LI, POCB CallBack) {
VisitedSet Finalized;
BlockStack Stack;
Stack.reserve(24); // FIXME made-up number
Stack.push_back(&F.getEntryBlock());
computeStackPO(Stack, LI, nullptr, CallBack, Finalized);
}
static void computeLoopPO(const LoopInfo &LI, Loop &Loop, POCB CallBack,
VisitedSet &Finalized) {
/// Call CallBack on all loop blocks.
std::vector<const BasicBlock *> Stack;
const auto *LoopHeader = Loop.getHeader();
// Visit the header last
Finalized.insert(LoopHeader);
CallBack(*LoopHeader);
// Initialize with immediate successors
for (const auto *BB : successors(LoopHeader)) {
if (!Loop.contains(BB))
continue;
if (BB == LoopHeader)
continue;
Stack.push_back(BB);
}
// Compute PO inside region
computeStackPO(Stack, LI, &Loop, CallBack, Finalized);
}
} // namespace
namespace llvm {
ControlDivergenceDesc SyncDependenceAnalysis::EmptyDivergenceDesc;
SyncDependenceAnalysis::SyncDependenceAnalysis(const DominatorTree &DT,
const PostDominatorTree &PDT,
const LoopInfo &LI)
: DT(DT), PDT(PDT), LI(LI) {
computeTopLevelPO(*DT.getRoot()->getParent(), LI,
[&](const BasicBlock &BB) { LoopPO.appendBlock(BB); });
}
SyncDependenceAnalysis::~SyncDependenceAnalysis() {}
// divergence propagator for reducible CFGs
struct DivergencePropagator {
const ModifiedPO &LoopPOT;
const DominatorTree &DT;
const PostDominatorTree &PDT;
const LoopInfo &LI;
const BasicBlock &DivTermBlock;
// * if BlockLabels[IndexOf(B)] == C then C is the dominating definition at
// block B
// * if BlockLabels[IndexOf(B)] ~ undef then we haven't seen B yet
// * if BlockLabels[IndexOf(B)] == B then B is a join point of disjoint paths
// from X or B is an immediate successor of X (initial value).
using BlockLabelVec = std::vector<const BasicBlock *>;
BlockLabelVec BlockLabels;
// divergent join and loop exit descriptor.
std::unique_ptr<ControlDivergenceDesc> DivDesc;
DivergencePropagator(const ModifiedPO &LoopPOT, const DominatorTree &DT,
const PostDominatorTree &PDT, const LoopInfo &LI,
const BasicBlock &DivTermBlock)
: LoopPOT(LoopPOT), DT(DT), PDT(PDT), LI(LI), DivTermBlock(DivTermBlock),
BlockLabels(LoopPOT.size(), nullptr),
DivDesc(new ControlDivergenceDesc) {}
void printDefs(raw_ostream &Out) {
Out << "Propagator::BlockLabels {\n";
for (int BlockIdx = (int)BlockLabels.size() - 1; BlockIdx > 0; --BlockIdx) {
const auto *Label = BlockLabels[BlockIdx];
Out << LoopPOT.getBlockAt(BlockIdx)->getName().str() << "(" << BlockIdx
<< ") : ";
if (!Label) {
Out << "<null>\n";
} else {
Out << Label->getName() << "\n";
}
}
Out << "}\n";
}
// Push a definition (\p PushedLabel) to \p SuccBlock and return whether this
// causes a divergent join.
bool computeJoin(const BasicBlock &SuccBlock, const BasicBlock &PushedLabel) {
auto SuccIdx = LoopPOT.getIndexOf(SuccBlock);
// unset or same reaching label
const auto *OldLabel = BlockLabels[SuccIdx];
if (!OldLabel || (OldLabel == &PushedLabel)) {
BlockLabels[SuccIdx] = &PushedLabel;
return false;
}
// Update the definition
BlockLabels[SuccIdx] = &SuccBlock;
return true;
}
// visiting a virtual loop exit edge from the loop header --> temporal
// divergence on join
bool visitLoopExitEdge(const BasicBlock &ExitBlock,
const BasicBlock &DefBlock, bool FromParentLoop) {
// Pushing from a non-parent loop cannot cause temporal divergence.
if (!FromParentLoop)
return visitEdge(ExitBlock, DefBlock);
if (!computeJoin(ExitBlock, DefBlock))
return false;
// Identified a divergent loop exit
DivDesc->LoopDivBlocks.insert(&ExitBlock);
LLVM_DEBUG(dbgs() << "\tDivergent loop exit: " << ExitBlock.getName()
<< "\n");
return true;
}
// process \p SuccBlock with reaching definition \p DefBlock
bool visitEdge(const BasicBlock &SuccBlock, const BasicBlock &DefBlock) {
if (!computeJoin(SuccBlock, DefBlock))
return false;
// Divergent, disjoint paths join.
DivDesc->JoinDivBlocks.insert(&SuccBlock);
LLVM_DEBUG(dbgs() << "\tDivergent join: " << SuccBlock.getName());
return true;
}
std::unique_ptr<ControlDivergenceDesc> computeJoinPoints() {
assert(DivDesc);
LLVM_DEBUG(dbgs() << "SDA:computeJoinPoints: " << DivTermBlock.getName()
<< "\n");
const auto *DivBlockLoop = LI.getLoopFor(&DivTermBlock);
// Early stopping criterion
int FloorIdx = LoopPOT.size() - 1;
const BasicBlock *FloorLabel = nullptr;
// bootstrap with branch targets
int BlockIdx = 0;
for (const auto *SuccBlock : successors(&DivTermBlock)) {
auto SuccIdx = LoopPOT.getIndexOf(*SuccBlock);
BlockLabels[SuccIdx] = SuccBlock;
// Find the successor with the highest index to start with
BlockIdx = std::max<int>(BlockIdx, SuccIdx);
FloorIdx = std::min<int>(FloorIdx, SuccIdx);
// Identify immediate divergent loop exits
if (!DivBlockLoop)
continue;
const auto *BlockLoop = LI.getLoopFor(SuccBlock);
if (BlockLoop && DivBlockLoop->contains(BlockLoop))
continue;
DivDesc->LoopDivBlocks.insert(SuccBlock);
LLVM_DEBUG(dbgs() << "\tImmediate divergent loop exit: "
<< SuccBlock->getName() << "\n");
}
// propagate definitions at the immediate successors of the node in RPO
for (; BlockIdx >= FloorIdx; --BlockIdx) {
LLVM_DEBUG(dbgs() << "Before next visit:\n"; printDefs(dbgs()));
// Any label available here
const auto *Label = BlockLabels[BlockIdx];
if (!Label)
continue;
// Ok. Get the block
const auto *Block = LoopPOT.getBlockAt(BlockIdx);
LLVM_DEBUG(dbgs() << "SDA::joins. visiting " << Block->getName() << "\n");
auto *BlockLoop = LI.getLoopFor(Block);
bool IsLoopHeader = BlockLoop && BlockLoop->getHeader() == Block;
bool CausedJoin = false;
int LoweredFloorIdx = FloorIdx;
if (IsLoopHeader) {
// Disconnect from immediate successors and propagate directly to loop
// exits.
SmallVector<BasicBlock *, 4> BlockLoopExits;
BlockLoop->getExitBlocks(BlockLoopExits);
bool IsParentLoop = BlockLoop->contains(&DivTermBlock);
for (const auto *BlockLoopExit : BlockLoopExits) {
CausedJoin |= visitLoopExitEdge(*BlockLoopExit, *Label, IsParentLoop);
LoweredFloorIdx = std::min<int>(LoweredFloorIdx,
LoopPOT.getIndexOf(*BlockLoopExit));
}
} else {
// Acyclic successor case
for (const auto *SuccBlock : successors(Block)) {
CausedJoin |= visitEdge(*SuccBlock, *Label);
LoweredFloorIdx =
std::min<int>(LoweredFloorIdx, LoopPOT.getIndexOf(*SuccBlock));
}
}
// Floor update
if (CausedJoin) {
// 1. Different labels pushed to successors
FloorIdx = LoweredFloorIdx;
} else if (FloorLabel != Label) {
// 2. No join caused BUT we pushed a label that is different than the
// last pushed label
FloorIdx = LoweredFloorIdx;
FloorLabel = Label;
}
}
LLVM_DEBUG(dbgs() << "SDA::joins. After propagation:\n"; printDefs(dbgs()));
return std::move(DivDesc);
}
};
#ifndef NDEBUG
static void printBlockSet(ConstBlockSet &Blocks, raw_ostream &Out) {
Out << "[";
bool First = true;
for (const auto *BB : Blocks) {
if (!First)
Out << ", ";
First = false;
Out << BB->getName();
}
Out << "]";
}
#endif
const ControlDivergenceDesc &
SyncDependenceAnalysis::getJoinBlocks(const Instruction &Term) {
// trivial case
if (Term.getNumSuccessors() <= 1) {
return EmptyDivergenceDesc;
}
// already available in cache?
auto ItCached = CachedControlDivDescs.find(&Term);
if (ItCached != CachedControlDivDescs.end())
return *ItCached->second;
// compute all join points
// Special handling of divergent loop exits is not needed for LCSSA
const auto &TermBlock = *Term.getParent();
DivergencePropagator Propagator(LoopPO, DT, PDT, LI, TermBlock);
auto DivDesc = Propagator.computeJoinPoints();
LLVM_DEBUG(dbgs() << "Result (" << Term.getParent()->getName() << "):\n";
dbgs() << "JoinDivBlocks: ";
printBlockSet(DivDesc->JoinDivBlocks, dbgs());
dbgs() << "\nLoopDivBlocks: ";
printBlockSet(DivDesc->LoopDivBlocks, dbgs()); dbgs() << "\n";);
auto ItInserted = CachedControlDivDescs.emplace(&Term, std::move(DivDesc));
assert(ItInserted.second);
return *ItInserted.first->second;
}
} // namespace llvm