sse_partial_update.ll
5.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+sse2 -mcpu=nehalem | FileCheck %s
; rdar: 12558838
; PR14221
; There is a mismatch between the intrinsic and the actual instruction.
; The actual instruction has a partial update of dest, while the intrinsic
; passes through the upper FP values. Here, we make sure the source and
; destination of each scalar unary op are the same.
define void @rsqrtss(<4 x float> %a) nounwind uwtable ssp {
; CHECK-LABEL: rsqrtss:
; CHECK: ## %bb.0:
; CHECK-NEXT: rsqrtss %xmm0, %xmm0
; CHECK-NEXT: cvtss2sd %xmm0, %xmm2
; CHECK-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; CHECK-NEXT: cvtss2sd %xmm0, %xmm1
; CHECK-NEXT: movaps %xmm2, %xmm0
; CHECK-NEXT: jmp _callee ## TAILCALL
%t0 = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a) nounwind
%a.addr.0.extract = extractelement <4 x float> %t0, i32 0
%conv = fpext float %a.addr.0.extract to double
%a.addr.4.extract = extractelement <4 x float> %t0, i32 1
%conv3 = fpext float %a.addr.4.extract to double
tail call void @callee(double %conv, double %conv3) nounwind
ret void
}
declare void @callee(double, double)
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
define void @rcpss(<4 x float> %a) nounwind uwtable ssp {
; CHECK-LABEL: rcpss:
; CHECK: ## %bb.0:
; CHECK-NEXT: rcpss %xmm0, %xmm0
; CHECK-NEXT: cvtss2sd %xmm0, %xmm2
; CHECK-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; CHECK-NEXT: cvtss2sd %xmm0, %xmm1
; CHECK-NEXT: movaps %xmm2, %xmm0
; CHECK-NEXT: jmp _callee ## TAILCALL
%t0 = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a) nounwind
%a.addr.0.extract = extractelement <4 x float> %t0, i32 0
%conv = fpext float %a.addr.0.extract to double
%a.addr.4.extract = extractelement <4 x float> %t0, i32 1
%conv3 = fpext float %a.addr.4.extract to double
tail call void @callee(double %conv, double %conv3) nounwind
ret void
}
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
define void @sqrtss(<4 x float> %a) nounwind uwtable ssp {
; CHECK-LABEL: sqrtss:
; CHECK: ## %bb.0:
; CHECK-NEXT: sqrtss %xmm0, %xmm1
; CHECK-NEXT: cvtss2sd %xmm1, %xmm2
; CHECK-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; CHECK-NEXT: xorps %xmm1, %xmm1
; CHECK-NEXT: cvtss2sd %xmm0, %xmm1
; CHECK-NEXT: movaps %xmm2, %xmm0
; CHECK-NEXT: jmp _callee ## TAILCALL
%t0 = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a) nounwind
%a.addr.0.extract = extractelement <4 x float> %t0, i32 0
%conv = fpext float %a.addr.0.extract to double
%a.addr.4.extract = extractelement <4 x float> %t0, i32 1
%conv3 = fpext float %a.addr.4.extract to double
tail call void @callee(double %conv, double %conv3) nounwind
ret void
}
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
define void @sqrtsd(<2 x double> %a) nounwind uwtable ssp {
; CHECK-LABEL: sqrtsd:
; CHECK: ## %bb.0:
; CHECK-NEXT: sqrtsd %xmm0, %xmm1
; CHECK-NEXT: cvtsd2ss %xmm1, %xmm2
; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; CHECK-NEXT: xorps %xmm1, %xmm1
; CHECK-NEXT: cvtsd2ss %xmm0, %xmm1
; CHECK-NEXT: movaps %xmm2, %xmm0
; CHECK-NEXT: jmp _callee2 ## TAILCALL
%t0 = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a) nounwind
%a0 = extractelement <2 x double> %t0, i32 0
%conv = fptrunc double %a0 to float
%a1 = extractelement <2 x double> %t0, i32 1
%conv3 = fptrunc double %a1 to float
tail call void @callee2(float %conv, float %conv3) nounwind
ret void
}
declare void @callee2(float, float)
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
define <2 x double> @load_fold_cvtss2sd_int(<4 x float> *%a) {
; CHECK-LABEL: load_fold_cvtss2sd_int:
; CHECK: ## %bb.0:
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: cvtss2sd %xmm0, %xmm0
; CHECK-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; CHECK-NEXT: retq
%ld = load <4 x float>, <4 x float> *%a
%x = call <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double> <double 0x0, double 0x0>, <4 x float> %ld)
ret <2 x double> %x
}
define <2 x double> @load_fold_cvtss2sd_int_optsize(<4 x float> *%a) optsize {
; CHECK-LABEL: load_fold_cvtss2sd_int_optsize:
; CHECK: ## %bb.0:
; CHECK-NEXT: cvtss2sd (%rdi), %xmm0
; CHECK-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; CHECK-NEXT: retq
%ld = load <4 x float>, <4 x float> *%a
%x = call <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double> <double 0x0, double 0x0>, <4 x float> %ld)
ret <2 x double> %x
}
define <2 x double> @load_fold_cvtss2sd_int_minsize(<4 x float> *%a) minsize {
; CHECK-LABEL: load_fold_cvtss2sd_int_minsize:
; CHECK: ## %bb.0:
; CHECK-NEXT: cvtss2sd (%rdi), %xmm0
; CHECK-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; CHECK-NEXT: retq
%ld = load <4 x float>, <4 x float> *%a
%x = call <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double> <double 0x0, double 0x0>, <4 x float> %ld)
ret <2 x double> %x
}
declare <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double>, <4 x float>) nounwind readnone
define float @PR22206(<4 x float> %a) {
; CHECK-LABEL: PR22206:
; CHECK: ## %bb.0:
; CHECK-NEXT: sqrtss %xmm0, %xmm1
; CHECK-NEXT: addss %xmm1, %xmm0
; CHECK-NEXT: retq
%res = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a) nounwind
%new = extractelement <4 x float> %res, i32 0
%orig = extractelement <4 x float> %a, i32 0
%add = fadd float %new, %orig
ret float %add
}