simple_example.cu
3.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
//===--- simple_example.cu - Simple example of using Acxxel ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// This file is a simple example of using Acxxel.
///
//===----------------------------------------------------------------------===//
/// [Example simple saxpy]
#include "acxxel.h"
#include <array>
#include <cstdio>
#include <cstdlib>
// A standard CUDA kernel computing X = A * X + Y element-wise, in place.
//
// Each thread handles at most one element. The element index is derived from
// the x components of the block and thread coordinates only, so the kernel
// expects a one-dimensional launch that supplies at least N threads in total.
//
// A: scalar multiplier applied to every element of X.
// X: array of at least N floats; read and then overwritten with the result.
// Y: array of at least N floats; only read.
// N: number of elements to process.
__global__ void saxpyKernel(float A, float *X, float *Y, int N) {
  const int Index = threadIdx.x + (blockIdx.x * blockDim.x);
  // Threads whose index falls past the end of the arrays have nothing to do.
  if (Index >= N)
    return;
  X[Index] = A * X[Index] + Y[Index];
}
// A host library wrapping the CUDA kernel. All Acxxel calls are in here.
//
// Computes X = A * X + Y for two host arrays of N floats by copying them to
// the device, running saxpyKernel in a single block of N threads, and copying
// X back. If the stream reports an error after the copy back, the message is
// printed to stderr and the process exits with EXIT_FAILURE.
//
// NOTE(review): the results of getCUDAPlatform(), createStream() and mallocD()
// are unwrapped without an explicit error check -- confirm against the Acxxel
// API whether that is acceptable outside of example code.
template <size_t N>
void saxpy(float A, std::array<float, N> &X, const std::array<float, N> &Y) {
  // Look up the CUDA platform and create a stream on it.
  acxxel::Platform *CudaPlatform = acxxel::getCUDAPlatform().getValue();
  acxxel::Stream WorkStream = CudaPlatform->createStream().takeValue();

  // Device-side buffers holding N floats each.
  auto DevX = CudaPlatform->mallocD<float>(N).takeValue();
  auto DevY = CudaPlatform->mallocD<float>(N).takeValue();

  // Upload both operands to the device.
  WorkStream.syncCopyHToD(X, DevX).syncCopyHToD(Y, DevY);

  // Launch the kernel on the stream using triple-chevron notation.
  saxpyKernel<<<1, N, 0, WorkStream>>>(A, DevX, DevY, N);

  // Download the result into X and pick up the stream's status.
  acxxel::Status CopyBackStatus = WorkStream.syncCopyDToH(DevX, X).takeStatus();

  // Nothing more to do unless an error was reported.
  if (!CopyBackStatus.isError())
    return;

  std::fprintf(stderr, "Error performing acxxel saxpy: %s\n",
               CopyBackStatus.getMessage().c_str());
  std::exit(EXIT_FAILURE);
}
/// [Example simple saxpy]
/// [Example CUDA simple saxpy]
// Computes X = A * X + Y for two host arrays of N floats using the CUDA
// runtime API directly.
//
// A: scalar multiplier.
// X: host array; read as input and overwritten with the result.
// Y: host array; only read.
//
// Every CUDA runtime call and the kernel launch are checked. On the first
// failure the error string is printed to stderr and the process exits with
// EXIT_FAILURE; no explicit cleanup is attempted on that path because the
// process is terminating. An empty array (N == 0) is a no-op.
template <size_t N>
void cudaSaxpy(float A, std::array<float, N> &X, std::array<float, N> &Y) {
  // There is nothing to compute for empty arrays, and a launch with zero
  // threads would be rejected as an invalid configuration.
  if (N == 0)
    return;

  // This size is needed all over the place, so give it a name.
  constexpr size_t Size = N * sizeof(float);

  // Launch shape: small inputs use a single block of exactly N threads; larger
  // inputs are split into blocks of MaxThreadsPerBlock threads so that N is
  // not limited by the maximum block size. The kernel's bounds check handles
  // the partially filled last block.
  constexpr size_t MaxThreadsPerBlock = 256;
  constexpr unsigned ThreadsPerBlock =
      static_cast<unsigned>(N < MaxThreadsPerBlock ? N : MaxThreadsPerBlock);
  constexpr unsigned BlockCount = static_cast<unsigned>(
      (N + MaxThreadsPerBlock - 1) / MaxThreadsPerBlock);

  // Report a failed CUDA runtime call and terminate.
  auto CheckCuda = [](cudaError_t Error) {
    if (Error != cudaSuccess) {
      std::fprintf(stderr, "Error performing cudart saxpy: %s\n",
                   cudaGetErrorString(Error));
      std::exit(EXIT_FAILURE);
    }
  };

  // Allocate space for device arrays.
  float *DeviceX = nullptr;
  float *DeviceY = nullptr;
  CheckCuda(cudaMalloc(&DeviceX, Size));
  CheckCuda(cudaMalloc(&DeviceY, Size));

  // Copy X and Y out to the device.
  CheckCuda(cudaMemcpy(DeviceX, X.data(), Size, cudaMemcpyHostToDevice));
  CheckCuda(cudaMemcpy(DeviceY, Y.data(), Size, cudaMemcpyHostToDevice));

  // Launch the kernel using triple-chevron notation. A launch does not return
  // an error code itself, so query the launch error right away.
  saxpyKernel<<<BlockCount, ThreadsPerBlock>>>(A, DeviceX, DeviceY,
                                               static_cast<int>(N));
  CheckCuda(cudaGetLastError());

  // Copy the results back to the host. This blocking copy also reports errors
  // raised while the kernel was executing.
  CheckCuda(cudaMemcpy(X.data(), DeviceX, Size, cudaMemcpyDeviceToHost));

  // Free resources.
  CheckCuda(cudaFree(DeviceX));
  CheckCuda(cudaFree(DeviceY));
}
/// [Example CUDA simple saxpy]
// Runs SaxpyFunction on a fixed three-element problem and verifies the result.
//
// SaxpyFunction is called once as SaxpyFunction(A, X, Y) with A = 2,
// X = {0, 1, 2} and Y = {3, 4, 5}, and is expected to leave A * X + Y in X.
// On the first element that differs from the expected {3, 6, 9}, a message is
// printed to stderr and the process exits with EXIT_FAILURE.
template <typename F> void testSaxpy(F &&SaxpyFunction) {
  constexpr int Count = 3;

  float Scale = 2.f;
  std::array<float, Count> Result = {{0.f, 1.f, 2.f}};
  std::array<float, Count> Addend = {{3.f, 4.f, 5.f}};
  const std::array<float, Count> Expected = {{3.f, 6.f, 9.f}};

  // The function under test updates Result in place.
  SaxpyFunction(Scale, Result, Addend);

  int Index = 0;
  while (Index < Count) {
    const float Actual = Result[Index];
    const float Wanted = Expected[Index];
    if (Actual != Wanted) {
      std::fprintf(stderr, "Result mismatch at index %d, %f != %f\n", Index,
                   Actual, Wanted);
      std::exit(EXIT_FAILURE);
    }
    ++Index;
  }
}
// Runs the same saxpy check against both implementations: first the Acxxel
// wrapper, then the plain CUDA runtime version. Either check terminates the
// process with EXIT_FAILURE on a mismatch or error, so reaching the end means
// both passed.
int main() {
  // Acxxel-based implementation.
  testSaxpy(saxpy<3>);

  // CUDA runtime implementation.
  testSaxpy(cudaSaxpy<3>);

  return EXIT_SUCCESS;
}