lds-output-queue.ll
3.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck %s
;
; This test checks that the lds input queue will is empty at the end of
; the ALU clause.
; CHECK-LABEL: {{^}}lds_input_queue:
; CHECK: LDS_READ_RET * OQAP
; CHECK-NOT: ALU clause
; CHECK: MOV * T{{[0-9]\.[XYZW]}}, OQAP
@local_mem = internal unnamed_addr addrspace(3) global [2 x i32] undef, align 4
define amdgpu_kernel void @lds_input_queue(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %index) {
entry:
%0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index
%1 = load i32, i32 addrspace(3)* %0
call void @llvm.r600.group.barrier()
; This will start a new clause for the vertex fetch
%2 = load i32, i32 addrspace(1)* %in
%3 = add i32 %1, %2
store i32 %3, i32 addrspace(1)* %out
ret void
}
declare void @llvm.r600.group.barrier() nounwind convergent
; The machine scheduler does not do proper alias analysis and assumes that
; loads from global values (Note that a global value is different that a
; value from global memory. A global value is a value that is declared
; outside of a function, it can reside in any address space) alias with
; all other loads.
;
; This is a problem for scheduling the reads from the local data share (lds).
; These reads are implemented using two instructions. The first copies the
; data from lds into the lds output queue, and the second moves the data from
; the input queue into main memory. These two instructions don't have to be
; scheduled one after the other, but they do need to be scheduled in the same
; clause. The aliasing problem mentioned above causes problems when there is a
; load from global memory which immediately follows a load from a global value that
; has been declared in the local memory space:
;
; %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index
; %1 = load i32, i32 addrspace(3)* %0
; %2 = load i32, i32 addrspace(1)* %in
;
; The instruction selection phase will generate ISA that looks like this:
; %oqap = LDS_READ_RET
; %0 = MOV %oqap
; %1 = VTX_READ_32
; %2 = ADD_INT %1, %0
;
; The bottom scheduler will schedule the two ALU instructions first:
;
; UNSCHEDULED:
; %oqap = LDS_READ_RET
; %1 = VTX_READ_32
;
; SCHEDULED:
;
; %0 = MOV %oqap
; %2 = ADD_INT %1, %2
;
; The lack of proper aliasing results in the local memory read (LDS_READ_RET)
; to consider the global memory read (VTX_READ_32) has a chain dependency, so
; the global memory read will always be scheduled first. This will give us a
; final program which looks like this:
;
; Alu clause:
; %oqap = LDS_READ_RET
; VTX clause:
; %1 = VTX_READ_32
; Alu clause:
; %0 = MOV %oqap
; %2 = ADD_INT %1, %2
;
; This is an illegal program because the oqap def and use know occur in
; different ALU clauses.
;
; This test checks this scenario and makes sure it doesn't result in an
; illegal program. For now, we have fixed this issue by merging the
; LDS_READ_RET and MOV together during instruction selection and then
; expanding them after scheduling. Once the scheduler has better alias
; analysis, we should be able to keep these instructions sparate before
; scheduling.
;
; CHECK-LABEL: {{^}}local_global_alias:
; CHECK: LDS_READ_RET
; CHECK-NOT: ALU clause
; CHECK: MOV * T{{[0-9]\.[XYZW]}}, OQAP
define amdgpu_kernel void @local_global_alias(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
entry:
%0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 0
%1 = load i32, i32 addrspace(3)* %0
%2 = load i32, i32 addrspace(1)* %in
%3 = add i32 %2, %1
store i32 %3, i32 addrspace(1)* %out
ret void
}