GPUJIT.h
4.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
/******************************************************************************/
/* */
/* Part of the LLVM Project, under the Apache License v2.0 with LLVM */
/* Exceptions. */
/* See https://llvm.org/LICENSE.txt for license information. */
/* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */
/* */
/******************************************************************************/
/* */
/* This file defines GPUJIT. */
/* */
/******************************************************************************/
#ifndef GPUJIT_H_
#define GPUJIT_H_
#include "stddef.h"
/*
* The following demonstrates how we can use the GPURuntime library to
* execute a GPU kernel.
*
* char KernelString[] = "\n\
* .version 1.4\n\
* .target sm_10, map_f64_to_f32\n\
* .entry _Z8myKernelPi (\n\
* .param .u64 __cudaparm__Z8myKernelPi_data)\n\
* {\n\
* .reg .u16 %rh<4>;\n\
* .reg .u32 %r<5>;\n\
* .reg .u64 %rd<6>;\n\
* cvt.u32.u16 %r1, %tid.x;\n\
* mov.u16 %rh1, %ctaid.x;\n\
* mov.u16 %rh2, %ntid.x;\n\
* mul.wide.u16 %r2, %rh1, %rh2;\n\
* add.u32 %r3, %r1, %r2;\n\
* ld.param.u64 %rd1, [__cudaparm__Z8myKernelPi_data];\n\
* cvt.s64.s32 %rd2, %r3;\n\
* mul.wide.s32 %rd3, %r3, 4;\n\
* add.u64 %rd4, %rd1, %rd3;\n\
* st.global.s32 [%rd4+0], %r3;\n\
* exit;\n\
* }\n\
* ";
*
* const char *Entry = "_Z8myKernelPi";
*
* int main() {
* PollyGPUFunction *Kernel;
* PollyGPUContext *Context;
* PollyGPUDevicePtr *DevArray;
* int *HostData;
* int MemSize;
*
* int GridX = 8;
* int GridY = 8;
*
* int BlockX = 16;
* int BlockY = 16;
* int BlockZ = 1;
*
* MemSize = 256*64*sizeof(int);
* Context = polly_initContextCUDA();
* DevArray = polly_allocateMemoryForDevice(MemSize);
* Kernel = polly_getKernel(KernelString, Entry);
*
* void *Params[1];
* void *DevPtr = polly_getDevicePtr(DevArray);
* Params[0] = &DevPtr;
*
* polly_launchKernel(Kernel, GridX, GridY, BlockX, BlockY, BlockZ, Params);
*
* polly_copyFromDeviceToHost(DevArray, HostData, MemSize);
* polly_freeKernel(Kernel);
* polly_freeDeviceMemory(DevArray);
* polly_freeContext(Context);
* }
*
*/
/// Enumerates the GPU runtime kinds known to this header.
///
/// The numeric values are spelled out explicitly; they are the same values the
/// compiler assigns implicitly to an enum without initializers (0, 1, 2).
typedef enum PollyGPURuntimeT {
  /// No runtime. NOTE(review): presumably means "none selected" -- confirm
  /// against the users of this enum.
  RUNTIME_NONE = 0,
  /// CUDA runtime (compare polly_initContextCUDA).
  RUNTIME_CUDA = 1,
  /// OpenCL runtime (compare polly_initContextCL).
  RUNTIME_CL = 2
} PollyGPURuntime;
/* Opaque handle types. Only forward declarations are given here, so code that
 * includes this header can hold pointers to these types but cannot look
 * inside them. */

/* Handles used by the polly_* functions declared in this header. */
typedef struct PollyGPUContextT PollyGPUContext;
typedef struct PollyGPUFunctionT PollyGPUFunction;
typedef struct PollyGPUDevicePtrT PollyGPUDevicePtr;

/* CUDA-named counterparts. NOTE(review): presumably the backend-specific
 * state behind the handles above -- confirm against the implementation. */
typedef struct CUDAContextT CUDAContext;
typedef struct CUDAKernelT CUDAKernel;
typedef struct CUDADevicePtrT CUDADevicePtr;

/* OpenCL-named counterparts (same caveat as for the CUDA ones). */
typedef struct OpenCLContextT OpenCLContext;
typedef struct OpenCLKernelT OpenCLKernel;
typedef struct OpenCLDevicePtrT OpenCLDevicePtr;
/// Initialize a GPU context for the CUDA runtime (per the function name).
///
/// Declared with an explicit (void) parameter list: in C before C23 an empty
/// "()" declares a function with unspecified parameters, so the compiler would
/// not diagnose calls that pass stray arguments.
///
/// @return A context handle; polly_freeContext() takes such a handle.
///         NOTE(review): the failure behavior (NULL return vs. abort) is not
///         visible from this header.
PollyGPUContext *polly_initContextCUDA(void);

/// Initialize a GPU context for the OpenCL runtime (per the function name).
///
/// Takes no arguments; see polly_initContextCUDA() for why (void) is spelled
/// out.
///
/// @return A context handle; polly_freeContext() takes such a handle.
///         NOTE(review): the failure behavior is not visible from this header.
PollyGPUContext *polly_initContextCL(void);
/// Obtain a kernel handle from a kernel binary/source buffer.
///
/// @param BinaryBuffer Buffer holding the kernel. In the usage example at the
///                     top of this header it is a NUL-terminated PTX string.
/// @param KernelName   Name of the kernel to look up. In the usage example it
///                     is the name of the PTX .entry symbol.
/// @return A handle that the usage example passes to polly_launchKernel() and
///         later to polly_freeKernel().
///         NOTE(review): the failure behavior is not visible from this header.
PollyGPUFunction *polly_getKernel(const char *BinaryBuffer,
const char *KernelName);
/// Release a kernel handle.
///
/// @param Kernel Handle to release; the usage example at the top of this
///               header passes the value returned by polly_getKernel().
void polly_freeKernel(PollyGPUFunction *Kernel);
/// Copy data from host memory to a device allocation (per the function name).
///
/// The parameter order is (source, destination, size), as in
/// polly_copyFromDeviceToHost().
///
/// @param HostData Host-side buffer.
/// @param DevData  Device allocation handle.
/// @param MemSize  Amount of data to copy. Presumably in bytes -- the usage
///                 example computes its MemSize with sizeof(); confirm against
///                 the implementation.
void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData,
long MemSize);
/// Copy data from a device allocation to host memory (per the function name).
///
/// Note that the device handle comes first here: the parameter order is
/// (source, destination, size), as in polly_copyFromHostToDevice().
///
/// @param DevData  Device allocation handle.
/// @param HostData Host-side buffer.
/// @param MemSize  Amount of data to copy. Presumably in bytes -- the usage
///                 example computes its MemSize with sizeof(); confirm against
///                 the implementation.
void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData,
long MemSize);
/// Synchronize with the GPU device (per the function name).
///
/// Declared with an explicit (void) parameter list: in C before C23 an empty
/// "()" declares a function with unspecified parameters, so the compiler would
/// not diagnose calls that pass stray arguments.
///
/// NOTE(review): presumably blocks until previously issued device work has
/// completed -- confirm against the implementation.
void polly_synchronizeDevice(void);
/// Launch a kernel with the given grid and block dimensions.
///
/// The grid is described by two dimensions (X, Y) and each block by three
/// (X, Y, Z).
///
/// @param Kernel     Kernel handle; the usage example at the top of this
///                   header passes the value returned by polly_getKernel().
/// @param GridDimX   Grid dimension in X.
/// @param GridDimY   Grid dimension in Y.
/// @param BlockSizeX Block size in X.
/// @param BlockSizeY Block size in Y.
/// @param BlockSizeZ Block size in Z.
/// @param Parameters Array of pointers, one per kernel argument. In the usage
///                   example the single entry is the address of a variable
///                   holding the device pointer.
///
/// NOTE(review): whether the call returns before the kernel has finished is
/// not visible from this header.
void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX,
unsigned int GridDimY, unsigned int BlockSizeX,
unsigned int BlockSizeY, unsigned int BlockSizeZ,
void **Parameters);
/// Release a device memory allocation.
///
/// @param Allocation Device allocation handle to release. The usage example at
///                   the top of this header obtains it from
///                   polly_allocateMemoryForDevice(), which is not declared in
///                   this header.
void polly_freeDeviceMemory(PollyGPUDevicePtr *Allocation);
/// Release a GPU context.
///
/// @param Context Context handle to release; polly_initContextCUDA() and
///                polly_initContextCL() return handles of this type. The usage
///                example at the top of this header calls this last, after the
///                kernel and the device memory have been freed.
void polly_freeContext(PollyGPUContext *Context);
// Note that polly_{malloc/free}Managed are currently not used by Polly.
// We use them in COSMO by replacing all malloc with polly_mallocManaged and all
// frees with cudaFree, so we can get managed memory "automatically".
// Needless to say, this is a hack.
// Please make sure that this code is not present in Polly when 2018 rolls in.
// If this is still present, ping Siddharth Bhat <siddu.druid@gmail.com>
/// Allocate managed memory (see the note above on how and why this is used).
///
/// @param size Requested allocation size. Presumably in bytes, as for malloc,
///             which the note above says this function replaces -- confirm
///             against the implementation.
/// @return Pointer to the allocation.
///         NOTE(review): the failure behavior is not visible from this header.
void *polly_mallocManaged(size_t size);
/// Free managed memory (counterpart of polly_mallocManaged() by name).
///
/// @param mem Pointer to release. NOTE(review): presumably must come from
///            polly_mallocManaged(); confirm against the implementation.
void polly_freeManaged(void *mem);
#endif /* GPUJIT_H_ */