ParallelLoopFusion.cpp
6.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
//===- ParallelLoopFusion.cpp - Code to perform loop fusion ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements loop fusion on parallel loops.
//
//===----------------------------------------------------------------------===//
#include "PassDetail.h"
#include "mlir/Dialect/SCF/Passes.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/SCF/Transforms.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/BlockAndValueMapping.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/OpDefinition.h"
using namespace mlir;
using namespace mlir::scf;
/// Verify there are no nested ParallelOps.
static bool hasNestedParallelOp(ParallelOp ploop) {
auto walkResult =
ploop.getBody()->walk([](ParallelOp) { return WalkResult::interrupt(); });
return walkResult.wasInterrupted();
}
/// Verify equal iteration spaces.
static bool equalIterationSpaces(ParallelOp firstPloop,
ParallelOp secondPloop) {
if (firstPloop.getNumLoops() != secondPloop.getNumLoops())
return false;
auto matchOperands = [&](const OperandRange &lhs,
const OperandRange &rhs) -> bool {
// TODO: Extend this to support aliases and equal constants.
return std::equal(lhs.begin(), lhs.end(), rhs.begin());
};
return matchOperands(firstPloop.lowerBound(), secondPloop.lowerBound()) &&
matchOperands(firstPloop.upperBound(), secondPloop.upperBound()) &&
matchOperands(firstPloop.step(), secondPloop.step());
}
/// Checks if the parallel loops have mixed access to the same buffers. Returns
/// `true` if the first parallel loop writes to the same indices that the second
/// loop reads.
static bool haveNoReadsAfterWriteExceptSameIndex(
ParallelOp firstPloop, ParallelOp secondPloop,
const BlockAndValueMapping &firstToSecondPloopIndices) {
DenseMap<Value, SmallVector<ValueRange, 1>> bufferStores;
firstPloop.getBody()->walk([&](StoreOp store) {
bufferStores[store.getMemRef()].push_back(store.indices());
});
auto walkResult = secondPloop.getBody()->walk([&](LoadOp load) {
// Stop if the memref is defined in secondPloop body. Careful alias analysis
// is needed.
auto *memrefDef = load.getMemRef().getDefiningOp();
if (memrefDef && memrefDef->getBlock() == load.getOperation()->getBlock())
return WalkResult::interrupt();
auto write = bufferStores.find(load.getMemRef());
if (write == bufferStores.end())
return WalkResult::advance();
// Allow only single write access per buffer.
if (write->second.size() != 1)
return WalkResult::interrupt();
// Check that the load indices of secondPloop coincide with store indices of
// firstPloop for the same memrefs.
auto storeIndices = write->second.front();
auto loadIndices = load.indices();
if (storeIndices.size() != loadIndices.size())
return WalkResult::interrupt();
for (int i = 0, e = storeIndices.size(); i < e; ++i) {
if (firstToSecondPloopIndices.lookupOrDefault(storeIndices[i]) !=
loadIndices[i])
return WalkResult::interrupt();
}
return WalkResult::advance();
});
return !walkResult.wasInterrupted();
}
/// Analyzes dependencies in the most primitive way by checking simple read and
/// write patterns.
static LogicalResult
verifyDependencies(ParallelOp firstPloop, ParallelOp secondPloop,
const BlockAndValueMapping &firstToSecondPloopIndices) {
if (!haveNoReadsAfterWriteExceptSameIndex(firstPloop, secondPloop,
firstToSecondPloopIndices))
return failure();
BlockAndValueMapping secondToFirstPloopIndices;
secondToFirstPloopIndices.map(secondPloop.getBody()->getArguments(),
firstPloop.getBody()->getArguments());
return success(haveNoReadsAfterWriteExceptSameIndex(
secondPloop, firstPloop, secondToFirstPloopIndices));
}
static bool
isFusionLegal(ParallelOp firstPloop, ParallelOp secondPloop,
const BlockAndValueMapping &firstToSecondPloopIndices) {
return !hasNestedParallelOp(firstPloop) &&
!hasNestedParallelOp(secondPloop) &&
equalIterationSpaces(firstPloop, secondPloop) &&
succeeded(verifyDependencies(firstPloop, secondPloop,
firstToSecondPloopIndices));
}
/// Prepends operations of firstPloop's body into secondPloop's body.
static void fuseIfLegal(ParallelOp firstPloop, ParallelOp secondPloop,
OpBuilder b) {
BlockAndValueMapping firstToSecondPloopIndices;
firstToSecondPloopIndices.map(firstPloop.getBody()->getArguments(),
secondPloop.getBody()->getArguments());
if (!isFusionLegal(firstPloop, secondPloop, firstToSecondPloopIndices))
return;
b.setInsertionPointToStart(secondPloop.getBody());
for (auto &op : firstPloop.getBody()->without_terminator())
b.clone(op, firstToSecondPloopIndices);
firstPloop.erase();
}
void mlir::scf::naivelyFuseParallelOps(Region ®ion) {
OpBuilder b(region);
// Consider every single block and attempt to fuse adjacent loops.
for (auto &block : region) {
SmallVector<SmallVector<ParallelOp, 8>, 1> ploopChains{{}};
// Not using `walk()` to traverse only top-level parallel loops and also
// make sure that there are no side-effecting ops between the parallel
// loops.
bool noSideEffects = true;
for (auto &op : block) {
if (auto ploop = dyn_cast<ParallelOp>(op)) {
if (noSideEffects) {
ploopChains.back().push_back(ploop);
} else {
ploopChains.push_back({ploop});
noSideEffects = true;
}
continue;
}
// TODO: Handle region side effects properly.
noSideEffects &=
MemoryEffectOpInterface::hasNoEffect(&op) && op.getNumRegions() == 0;
}
for (ArrayRef<ParallelOp> ploops : ploopChains) {
for (int i = 0, e = ploops.size(); i + 1 < e; ++i)
fuseIfLegal(ploops[i], ploops[i + 1], b);
}
}
}
namespace {
struct ParallelLoopFusion
: public SCFParallelLoopFusionBase<ParallelLoopFusion> {
void runOnOperation() override {
getOperation()->walk([&](Operation *child) {
for (Region ®ion : child->getRegions())
naivelyFuseParallelOps(region);
});
}
};
} // namespace
std::unique_ptr<Pass> mlir::createParallelLoopFusionPass() {
return std::make_unique<ParallelLoopFusion>();
}