X86Counter.cpp
8.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
//===-- X86Counter.cpp ------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "X86Counter.h"
// FIXME: Use appropriate wrappers for poll.h and mman.h
// to support Windows and remove this linux-only guard.
#ifdef __linux__
#include "llvm/Support/Endian.h"
#include "llvm/Support/Errc.h"
#ifdef HAVE_LIBPFM
#include "perfmon/perf_event.h"
#include "perfmon/pfmlib.h"
#include "perfmon/pfmlib_perf_event.h"
#endif // HAVE_LIBPFM
#include <atomic>
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <limits>
#include <memory>
#include <vector>
#include <poll.h>
#include <sys/mman.h>
#include <unistd.h>
#if defined(HAVE_LIBPFM) && defined(LIBPFM_HAS_FIELD_CYCLES)
namespace llvm {
namespace exegesis {
// Number of entries in the LBR.
static constexpr int kLbrEntries{16};
// Number of pages that make up the perf data buffer.
static constexpr size_t kBufferPages{8};
// Size in bytes of the perf data buffer. Computed at start-up rather than at
// compile time because the page size is only known at run time.
static const size_t kDataBufferSize{kBufferPages * getpagesize()};
// Waits for the LBR perf event behind FileDescriptor to become readable.
// Gives up after 10 seconds. Returns poll()'s result unchanged: a positive
// value when the descriptor is ready, 0 on timeout and -1 on error.
static int pollLbrPerfEvent(const int FileDescriptor) {
  constexpr nfds_t kNumFds = 1;
  constexpr int kTimeoutMs = 10000;

  // Zero-initialize so that revents starts out cleared.
  struct pollfd Watched = {};
  Watched.fd = FileDescriptor;
  Watched.events = POLLIN;
  return poll(&Watched, kNumFds, kTimeoutMs);
}
// Linearizes the cyclic perf data buffer into Buf.
//   MMappedBuffer: start of the mapping (metadata page followed by the data
//                  buffer).
//   Buf:           destination; up to kDataBufferSize bytes are written.
//   Tail:          read position in the data buffer (wrapped modulo
//                  kDataBufferSize).
//   DataSize:      number of valid bytes starting at Tail.
static void copyDataBuffer(void *MMappedBuffer, char *Buf, uint64_t Tail,
                           size_t DataSize) {
  // The first page holds perf_event_mmap_page; the data buffer begins on the
  // page after it.
  char *const RingBegin = static_cast<char *>(MMappedBuffer) + getpagesize();

  // Copy everything from the tail position up to the end of the ring.
  const uint64_t TailOffset = Tail % kDataBufferSize;
  const size_t BytesToRingEnd = kDataBufferSize - TailOffset;
  memcpy(Buf, RingBegin + TailOffset, BytesToRingEnd);

  // If the data wraps around, the remainder sits at the front of the ring.
  if (BytesToRingEnd < DataSize)
    memcpy(Buf + BytesToRingEnd, RingBegin, TailOffset);
}
// Parses the given data-buffer and fills CycleArray from the first
// PERF_RECORD_SAMPLE record that carries at least one branch entry.
//   DataBuf/DataSize: linearized copy of the perf data buffer and the number
//                     of valid bytes in it.
//   From/To:          address window used to filter branch entries; a null
//                     bound disables filtering on that side.
//   CycleArray:       receives the cycle count of every entry whose `from` is
//                     >= From and whose `to` is <= To.
// Non-sample records and sample records with no entries are skipped. Returns
// success once a sample record with entries has been consumed, and an
// io_error if no such record is found or the buffer is truncated/malformed.
static llvm::Error parseDataBuffer(const char *DataBuf, size_t DataSize,
                                   const void *From, const void *To,
                                   llvm::SmallVector<int64_t, 4> *CycleArray) {
  // The filter window does not depend on the record being parsed.
  const uint64_t BlockStart = From == nullptr
                                  ? std::numeric_limits<uint64_t>::min()
                                  : reinterpret_cast<uint64_t>(From);
  const uint64_t BlockEnd = To == nullptr
                                ? std::numeric_limits<uint64_t>::max()
                                : reinterpret_cast<uint64_t>(To);

  const char *DataPtr = DataBuf;
  const char *const DataEnd = DataBuf + DataSize;
  while (DataPtr < DataEnd) {
    // Never read a header that is not fully contained in the valid data.
    if (static_cast<size_t>(DataEnd - DataPtr) <
        sizeof(struct perf_event_header))
      break;
    struct perf_event_header Header;
    memcpy(&Header, DataPtr, sizeof(Header));
    if (Header.type != PERF_RECORD_SAMPLE) {
      // Ignores non-sample records. A record smaller than its own header
      // (in particular size 0) is malformed and would prevent the cursor
      // from making progress; one larger than the remaining data would move
      // the cursor out of the buffer.
      if (Header.size < sizeof(Header) ||
          Header.size > static_cast<size_t>(DataEnd - DataPtr))
        break;
      DataPtr += Header.size;
      continue;
    }
    DataPtr += sizeof(Header);

    // The sample payload starts with the number of branch entries.
    if (static_cast<size_t>(DataEnd - DataPtr) < sizeof(uint64_t))
      break;
    const uint64_t Count =
        llvm::support::endian::read64(DataPtr, support::native);
    DataPtr += sizeof(Count);
    if (Count == 0)
      // Empty sample: the next record starts right here.
      continue;

    // All entries must fit in the remaining data before any of them is read.
    if (Count > static_cast<size_t>(DataEnd - DataPtr) /
                    sizeof(struct perf_branch_entry))
      break;

    // Read the perf_branch_entry array.
    for (uint64_t I = 0; I < Count; ++I) {
      struct perf_branch_entry Entry;
      memcpy(&Entry, DataPtr, sizeof(Entry));
      DataPtr += sizeof(Entry);
      if (BlockStart <= Entry.from && BlockEnd >= Entry.to)
        CycleArray->push_back(Entry.cycles);
    }
    // Only the first sample record with entries is consumed.
    return llvm::Error::success();
  }
  return llvm::make_error<llvm::StringError>("Unable to parse databuffer.",
                                             llvm::errc::io_error);
}
// Builds a perf event description that samples the branch stack on every
// SamplingPeriod-th occurrence of BR_INST_RETIRED.NEAR_TAKEN. The event is
// created disabled and excludes kernel and hypervisor activity.
// NOTE(review): Attr is allocated with `new` here and not released in this
// block; presumably the PerfEvent base owns it -- confirm.
X86LbrPerfEvent::X86LbrPerfEvent(unsigned SamplingPeriod) {
  assert(SamplingPeriod > 0 && "SamplingPeriod must be positive");

  EventString = "BR_INST_RETIRED.NEAR_TAKEN";
  FullQualifiedEventString = EventString;

  Attr = new perf_event_attr();
  perf_event_attr &Config = *Attr;
  Config.size = sizeof(Config);

  // Which event to count.
  Config.type = PERF_TYPE_RAW;
  // FIXME This is SKL's encoding. Not sure if it'll change.
  Config.config = 0x20c4; // BR_INST_RETIRED.NEAR_TAKEN

  // What to record and how often.
  Config.sample_type = PERF_SAMPLE_BRANCH_STACK;
  // Don't need to specify "USER" because we've already excluded HV and Kernel.
  Config.branch_sample_type = PERF_SAMPLE_BRANCH_ANY;
  Config.sample_period = SamplingPeriod;
  Config.wakeup_events = 1; // We need this even when using ioctl REFRESH.
  Config.read_format = PERF_FORMAT_GROUP;

  // Start disabled and restrict sampling to user space.
  Config.disabled = 1;
  Config.exclude_kernel = 1;
  Config.exclude_hv = 1;
}
// Takes over NewEvent and maps the perf buffer of the resulting file
// descriptor into memory. A mapping failure is only reported on llvm::errs();
// MMappedBuffer is then left equal to MAP_FAILED.
X86LbrCounter::X86LbrCounter(pfm::PerfEvent &&NewEvent)
    : Counter(std::move(NewEvent)) {
  // First page is reserved for perf_event_mmap_page. Data buffer starts on
  // the next page, so we allocate one more page.
  const size_t MappedBytes = (kBufferPages + 1) * getpagesize();
  MMappedBuffer = mmap(nullptr, MappedBytes, PROT_READ | PROT_WRITE,
                       MAP_SHARED, FileDescriptor, 0);
  if (MAP_FAILED == MMappedBuffer)
    llvm::errs() << "Failed to mmap buffer.";
}
// Releases the perf buffer mapping created by the constructor, then closes the
// perf event file descriptor.
X86LbrCounter::~X86LbrCounter() {
  // The constructor maps (kBufferPages + 1) pages; unmap exactly that region
  // so it does not leak. Skip this when the mapping never succeeded.
  if (MMappedBuffer != MAP_FAILED && MMappedBuffer != nullptr)
    if (munmap(MMappedBuffer, (kBufferPages + 1) * getpagesize()) != 0)
      llvm::errs() << "Failed to munmap buffer.";
  close(FileDescriptor);
}
// Arms the perf event through PERF_EVENT_IOC_REFRESH with a budget of 1024.
// The ioctl result is not checked.
void X86LbrCounter::start() {
  constexpr int kMaxPollsPerFd = 1024;
  ioctl(FileDescriptor, PERF_EVENT_IOC_REFRESH, kMaxPollsPerFd);
}
// Probes whether the host reports LBR entries with cycle counts.
// Runs a short branchy loop under a temporary LBR counter and inspects the
// collected cycle values. Returns success if at least one value is non-zero,
// and a not_supported error otherwise (including when the read itself fails).
llvm::Error X86LbrCounter::checkLbrSupport() {
  // Do a sample read and check if the results contain non-zero values.
  X86LbrCounter LbrCounter(X86LbrPerfEvent(123));
  LbrCounter.start();

  // Prevent the compiler from unrolling the loop and get rid of all the
  // branches. We need at least 16 iterations.
  int Sum = 0;
  int V = 1;
  volatile int *P = &V;
  const auto TimeLimit =
      std::chrono::high_resolution_clock::now() + std::chrono::microseconds(5);
  for (int I = 0;
       I < kLbrEntries || std::chrono::high_resolution_clock::now() < TimeLimit;
       ++I) {
    Sum += *P;
  }
  (void)Sum; // Only the volatile reads matter; the total is never used.

  LbrCounter.stop();

  auto ResultOrError = LbrCounter.doReadCounter(nullptr, nullptr);
  if (!ResultOrError) {
    // An llvm::Expected in the error state must have its error handled before
    // it is destroyed. A failed read simply means "not supported" here.
    llvm::consumeError(ResultOrError.takeError());
  } else {
    // If there is at least one non-zero entry, then LBR is supported.
    for (const int64_t Value : *ResultOrError)
      if (Value != 0)
        return llvm::Error::success();
  }

  return llvm::make_error<llvm::StringError>(
      "LBR format with cycles is not suppported on the host.",
      llvm::errc::not_supported);
}
// Stops the event and returns the cycle counts of the LBR entries whose
// branch addresses lie within FunctionBytes. Returns an invalid_argument
// error when FunctionBytes is empty; the event is disabled in that case too.
llvm::Expected<llvm::SmallVector<int64_t, 4>>
X86LbrCounter::readOrError(StringRef FunctionBytes) const {
  // Disable the event before reading
  ioctl(FileDescriptor, PERF_EVENT_IOC_DISABLE, 0);

  if (FunctionBytes.empty())
    return llvm::make_error<llvm::StringError>("Empty function bytes",
                                               llvm::errc::invalid_argument);

  // The function's first byte and one-past-last byte delimit the address
  // range used to keep only the relevant LBR records.
  const char *const FirstByte = FunctionBytes.data();
  const char *const PastLastByte = FirstByte + FunctionBytes.size();
  return doReadCounter(FirstByte, PastLastByte);
}
// Waits for LBR data to become available, copies it out of the mapped perf
// buffer and returns the cycle counts of the entries that fall within
// [From, To] (a null bound disables filtering on that side).
// Errors:
//   io_error                - the perf buffer is not mapped, polling failed,
//                             or the data could not be parsed.
//   device_or_resource_busy - polling kept timing out.
//   invalid_argument        - the kernel reported more data than fits in the
//                             buffer.
llvm::Expected<llvm::SmallVector<int64_t, 4>>
X86LbrCounter::doReadCounter(const void *From, const void *To) const {
  // The max number of time-outs/retries before we give up.
  static constexpr int kMaxTimeouts = 160;

  // The constructor only logs an mmap failure; reading through the failed
  // mapping would dereference MAP_FAILED, so bail out instead.
  if (MMappedBuffer == MAP_FAILED)
    return llvm::make_error<llvm::StringError>(
        "LBR perf buffer is not mapped.", llvm::errc::io_error);

  // Poll until the event is readable, polling fails, or we run out of
  // retries.
  int NumTimeouts = 0;
  for (;;) {
    const int PollResult = pollLbrPerfEvent(FileDescriptor);
    if (PollResult > 0)
      break;
    if (PollResult == -1)
      return llvm::make_error<llvm::StringError>("Cannot poll LBR perf event.",
                                                 llvm::errc::io_error);
    if (NumTimeouts++ >= kMaxTimeouts)
      return llvm::make_error<llvm::StringError>(
          "LBR polling still timed out after max number of attempts.",
          llvm::errc::device_or_resource_busy);
  }

  // Snapshot the metadata page to learn where the valid data lies.
  struct perf_event_mmap_page Page;
  memcpy(&Page, MMappedBuffer, sizeof(struct perf_event_mmap_page));
  const uint64_t DataTail = Page.data_tail;
  const uint64_t DataHead = Page.data_head;
  // We're supposed to use a barrier after reading data_head.
  std::atomic_thread_fence(std::memory_order_acq_rel);

  const size_t DataSize = DataHead - DataTail;
  if (DataSize > kDataBufferSize)
    return llvm::make_error<llvm::StringError>(
        "DataSize larger than buffer size.", llvm::errc::invalid_argument);

  // Linearize the cyclic buffer, then parse it and fill CycleArray with the
  // sequence of cycle counts it contains.
  auto DataBuf = std::make_unique<char[]>(kDataBufferSize);
  copyDataBuffer(MMappedBuffer, DataBuf.get(), DataTail, DataSize);

  llvm::SmallVector<int64_t, 4> CycleArray;
  if (llvm::Error ParseError =
          parseDataBuffer(DataBuf.get(), DataSize, From, To, &CycleArray))
    return std::move(ParseError);
  return CycleArray;
}
} // namespace exegesis
} // namespace llvm
#endif // defined(HAVE_LIBPFM) && defined(LIBPFM_HAS_FIELD_CYCLES)
#endif // __linux__