AffineDataCopyGeneration.cpp
//===- AffineDataCopyGeneration.cpp - Explicit memref copying pass ------*-===//
//
// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a pass to automatically promote accessed memref regions
// to buffers in a faster memory space that is explicitly managed, with the
// necessary data movement operations performed through either regular
// point-wise load/store's or DMAs. Such explicit copying (also referred to as
// array packing/unpacking in the literature), when done on arrays that exhibit
// reuse, results in near elimination of conflict misses and TLB misses,
// reduced use of hardware prefetch streams, and reduced false sharing. It is
// also necessary for hardware with explicitly managed levels in the memory
// hierarchy, where DMAs may have to be used. This optimization is often
// performed on already tiled code.
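//
// As a rough illustration of the rewrite this pass performs (the memref
// shapes, loop bounds, and memory-space numbering below are made up for
// illustration; the exact IR depends on the options used and on the memref
// region the analysis computes):
//
//   // Before: a loop reading a memref that lives in the default (slow)
//   // memory space 0.
//   affine.for %i = 0 to 256 {
//     %v = affine.load %A[%i] : memref<256xf32>
//     "use"(%v) : (f32) -> ()
//   }
//
//   // After (point-wise copy variant): a buffer is allocated in the fast
//   // memory space, a copy-in loop fills it, the original load reads from
//   // the fast buffer, and the buffer is freed once the loop finishes.
//   %fastA = alloc() : memref<256xf32, 1>
//   affine.for %ii = 0 to 256 {
//     %t = affine.load %A[%ii] : memref<256xf32>
//     affine.store %t, %fastA[%ii] : memref<256xf32, 1>
//   }
//   affine.for %i = 0 to 256 {
//     %v = affine.load %fastA[%i] : memref<256xf32, 1>
//     "use"(%v) : (f32) -> ()
//   }
//   dealloc %fastA : memref<256xf32, 1>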
//
//===----------------------------------------------------------------------===//
#include "mlir/Analysis/Utils.h"
#include "mlir/Dialect/AffineOps/AffineOps.h"
#include "mlir/IR/Builders.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/LoopUtils.h"
#include "mlir/Transforms/Passes.h"
#include "mlir/Transforms/Utils.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include <algorithm>

#define DEBUG_TYPE "affine-data-copy-generate"

using namespace mlir;

static llvm::cl::OptionCategory clOptionsCategory(DEBUG_TYPE " options");
static llvm::cl::opt<unsigned long long> clFastMemoryCapacity(
"affine-data-copy-generate-fast-mem-capacity",
llvm::cl::desc(
"Set fast memory space capacity in KiB (default: unlimited)"),
llvm::cl::cat(clOptionsCategory));
static llvm::cl::opt<bool>
clDma("affine-data-copy-generate-dma",
llvm::cl::desc("Generate DMA instead of point-wise copy"),
llvm::cl::cat(clOptionsCategory), llvm::cl::init(true));
static llvm::cl::opt<unsigned> clFastMemorySpace(
"affine-data-copy-generate-fast-mem-space", llvm::cl::init(1),
llvm::cl::desc(
"Fast memory space identifier for copy generation (default: 1)"),
llvm::cl::cat(clOptionsCategory));
static llvm::cl::opt<bool> clSkipNonUnitStrideLoop(
"affine-data-copy-generate-skip-non-unit-stride-loops", llvm::cl::Hidden,
llvm::cl::init(false),
llvm::cl::desc("Testing purposes: avoid non-unit stride loop choice depths "
"for copy placement"),
llvm::cl::cat(clOptionsCategory));
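
// A sample invocation exercising these flags via mlir-opt (the input file name
// is a placeholder; the flag spellings come from the cl::opt definitions above
// and the pass name from the registration at the bottom of this file):
//
//   mlir-opt -affine-data-copy-generate -affine-data-copy-generate-dma=false
//       -affine-data-copy-generate-fast-mem-space=2
//       -affine-data-copy-generate-fast-mem-capacity=64 input.mlir
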
namespace {
/// Replaces all loads and stores on memref's living in 'slowMemorySpace' by
/// introducing copy operations to transfer data into `fastMemorySpace` and
/// rewriting the original load's/store's to instead load/store from the
/// allocated fast memory buffers. Additional options specify the identifier
/// corresponding to the fast memory space and the amount of fast memory space
/// available. The pass traverses through the nesting structure, recursing to
/// inner levels if necessary to determine at what depth copies need to be
/// placed so that the allocated buffers fit within the memory capacity
/// provided.
// TODO(bondhugula): We currently can't generate copies correctly when stores
// are strided. Check for strided stores.
struct AffineDataCopyGeneration
: public FunctionPass<AffineDataCopyGeneration> {
explicit AffineDataCopyGeneration(
unsigned slowMemorySpace = 0,
unsigned fastMemorySpace = clFastMemorySpace, unsigned tagMemorySpace = 0,
int minDmaTransferSize = 1024,
uint64_t fastMemCapacityBytes =
(clFastMemoryCapacity.getNumOccurrences() > 0
? clFastMemoryCapacity * 1024 // cl-provided size is in KiB
: std::numeric_limits<uint64_t>::max()),
bool generateDma = clDma,
bool skipNonUnitStrideLoops = clSkipNonUnitStrideLoop)
: slowMemorySpace(slowMemorySpace), fastMemorySpace(fastMemorySpace),
tagMemorySpace(tagMemorySpace), minDmaTransferSize(minDmaTransferSize),
fastMemCapacityBytes(fastMemCapacityBytes), generateDma(generateDma),
skipNonUnitStrideLoops(skipNonUnitStrideLoops) {}
explicit AffineDataCopyGeneration(const AffineDataCopyGeneration &other)
: slowMemorySpace(other.slowMemorySpace),
fastMemorySpace(other.fastMemorySpace),
tagMemorySpace(other.tagMemorySpace),
minDmaTransferSize(other.minDmaTransferSize),
fastMemCapacityBytes(other.fastMemCapacityBytes),
generateDma(other.generateDma),
skipNonUnitStrideLoops(other.skipNonUnitStrideLoops) {}
void runOnFunction() override;
LogicalResult runOnBlock(Block *block, DenseSet<Operation *> &copyNests);
// Slow memory space associated with copies.
const unsigned slowMemorySpace;
// Fast memory space associated with copies.
unsigned fastMemorySpace;
// Memory space associated with DMA tags.
unsigned tagMemorySpace;
// Minimum DMA transfer size supported by the target in bytes.
const int minDmaTransferSize;
// Capacity of the faster memory space.
uint64_t fastMemCapacityBytes;
// If set, generate DMA operations instead of read/write.
bool generateDma;
// If set, ignore loops with steps other than 1.
bool skipNonUnitStrideLoops;
// Constant zero index to avoid too many duplicates.
Value zeroIndex = nullptr;
};
} // end anonymous namespace

/// Generates copies for memref's living in 'slowMemorySpace' into newly created
/// buffers in 'fastMemorySpace', and replaces memory operations to the former
/// by the latter. Only load op's are handled for now.
/// TODO(bondhugula): extend this to store op's.
std::unique_ptr<OpPassBase<FuncOp>> mlir::createAffineDataCopyGenerationPass(
unsigned slowMemorySpace, unsigned fastMemorySpace, unsigned tagMemorySpace,
int minDmaTransferSize, uint64_t fastMemCapacityBytes) {
return std::make_unique<AffineDataCopyGeneration>(
slowMemorySpace, fastMemorySpace, tagMemorySpace, minDmaTransferSize,
fastMemCapacityBytes);
}
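
// A minimal sketch of constructing this pass programmatically through the
// factory above (the argument values here are placeholders; adding the
// resulting pass to a pass manager is omitted):
//
//   std::unique_ptr<OpPassBase<FuncOp>> copyGen =
//       mlir::createAffineDataCopyGenerationPass(
//           /*slowMemorySpace=*/0, /*fastMemorySpace=*/1,
//           /*tagMemorySpace=*/0, /*minDmaTransferSize=*/1024,
//           /*fastMemCapacityBytes=*/256 * 1024);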
/// Generate copies for this block. The block is partitioned into separate
/// ranges: each range is either a sequence of one or more operations starting
/// and ending with an affine load or store op, or just an affine.for op (which
/// could have other affine.for op's nested within).
LogicalResult
AffineDataCopyGeneration::runOnBlock(Block *block,
DenseSet<Operation *> &copyNests) {
if (block->empty())
return success();
AffineCopyOptions copyOptions = {generateDma, slowMemorySpace,
fastMemorySpace, tagMemorySpace,
fastMemCapacityBytes};
// Every affine.for op in the block starts and ends a block range for copying;
// in addition, a contiguous sequence of operations starting with a
// load/store op but not including any copy nests themselves is also
// identified as a copy block range. Straight-line code (a contiguous chunk of
// operations excluding AffineForOp's) is always assumed to not exhaust
// memory. As a result, this approach is conservative in some cases at the
// moment; we do a check later and report an error with location info.
// TODO(bondhugula): An 'affine.if' operation is currently treated like any
// other operation; 'affine.if's could have 'affine.for's in them and should
// be treated separately.
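//
// A rough sketch of the partitioning described above (the op names are
// abbreviated and hypothetical; the exact ranges depend on which operations
// are themselves copy nests):
//
//   affine.load ...          |
//   "compute"(...) ...       |  range 1: straight-line chunk, copied as a unit
//   affine.store ...         |
//   affine.for ... { ... }      range 2: the loop forms its own range; if its
//                               footprint exceeds capacity, recurse into it
//   affine.load ...          |
//   ...                      |  range 3: trailing straight-line chunk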
// Get to the first load, store, or for op (that is not a copy nest itself).
auto curBegin =
std::find_if(block->begin(), block->end(), [&](Operation &op) {
return (isa<AffineLoadOp>(op) || isa<AffineStoreOp>(op) ||
isa<AffineForOp>(op)) &&
copyNests.count(&op) == 0;
});
// Create [begin, end) ranges.
auto it = curBegin;
while (it != block->end()) {
AffineForOp forOp;
// If we hit a non-copy 'affine.for' op, we split the range there.
if ((forOp = dyn_cast<AffineForOp>(&*it)) && copyNests.count(forOp) == 0) {
// Perform the copying up until this 'for' op first.
affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/it, copyOptions,
copyNests);
// Returns true if the footprint is known to exceed capacity.
auto exceedsCapacity = [&](AffineForOp forOp) {
Optional<int64_t> footprint =
getMemoryFootprintBytes(forOp,
/*memorySpace=*/0);
return (footprint.hasValue() &&
static_cast<uint64_t>(footprint.getValue()) >
fastMemCapacityBytes);
};
// If the memory footprint of the 'affine.for' loop is higher than fast
// memory capacity (when provided), we recurse to copy at an inner level
// until we find a depth at which footprint fits in fast mem capacity. If
// the footprint can't be calculated, we assume for now it fits. Recurse
// inside if footprint for 'forOp' exceeds capacity, or when
// skipNonUnitStrideLoops is set and the step size is not one.
bool recurseInner = skipNonUnitStrideLoops ? forOp.getStep() != 1
: exceedsCapacity(forOp);
if (recurseInner) {
// We'll recurse and do the copies at an inner level for 'forOp'.
// Recurse onto the body of this loop.
runOnBlock(forOp.getBody(), copyNests);
} else {
// We have enough capacity, i.e., copies will be computed for the
// portion of the block until 'it', and for 'it', which is 'forOp'. Note
// that for the latter, the copies are placed just before this loop (for
// incoming copies) and right after (for outgoing ones).
// Inner loop copies have their own scope; we thus don't update the
// consumed capacity. The footprint check above guarantees this inner
// loop's footprint fits.
affineDataCopyGenerate(/*begin=*/it, /*end=*/std::next(it), copyOptions,
copyNests);
}
// Get to the next load or store op after 'forOp'.
curBegin = std::find_if(std::next(it), block->end(), [&](Operation &op) {
return (isa<AffineLoadOp>(op) || isa<AffineStoreOp>(op) ||
isa<AffineForOp>(op)) &&
copyNests.count(&op) == 0;
});
it = curBegin;
} else {
assert(copyNests.count(&*it) == 0 &&
"all copy nests generated should have been skipped above");
// We simply include this op in the current range and continue for more.
++it;
}
}
// Generate the copy for the final block range.
if (curBegin != block->end()) {
// Can't be a terminator because it would have been skipped above.
assert(!curBegin->isKnownTerminator() && "can't be a terminator");
// Exclude the affine terminator - hence, the std::prev.
affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/std::prev(block->end()),
copyOptions, copyNests);
}
return success();
}

void AffineDataCopyGeneration::runOnFunction() {
FuncOp f = getFunction();
OpBuilder topBuilder(f.getBody());
zeroIndex = topBuilder.create<ConstantIndexOp>(f.getLoc(), 0);
// Nests that are copy-in's or copy-out's; the root AffineForOps of those
// nests are stored herein.
DenseSet<Operation *> copyNests;
// Clear recorded copy nests.
copyNests.clear();
for (auto &block : f)
runOnBlock(&block, copyNests);
// Promote any single iteration loops in the copy nests.
for (auto nest : copyNests) {
nest->walk([](AffineForOp forOp) { promoteIfSingleIteration(forOp); });
}
}

static PassRegistration<AffineDataCopyGeneration>
pass("affine-data-copy-generate",
"Generate explicit copying for memory operations");