hypotf.cpp
6.66 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
//===-- Implementation of hypotf function ---------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "src/__support/common.h"
#include "utils/FPUtil/BasicOperations.h"
#include "utils/FPUtil/FPBits.h"
namespace __llvm_libc {
using namespace fputil;
uint32_t findLeadingOne(uint32_t mant, int &shift_length) {
shift_length = 0;
constexpr int nsteps = 5;
constexpr uint32_t bounds[nsteps] = {1 << 16, 1 << 8, 1 << 4, 1 << 2, 1 << 1};
constexpr int shifts[nsteps] = {16, 8, 4, 2, 1};
for (int i = 0; i < nsteps; ++i) {
if (mant >= bounds[i]) {
shift_length += shifts[i];
mant >>= shifts[i];
}
}
return 1U << shift_length;
}
// Correctly rounded IEEE 754 HYPOT(x, y) with round to nearest, ties to even.
//
// Algorithm:
// - Let a = max(|x|, |y|), b = min(|x|, |y|), then we have that:
// a <= sqrt(a^2 + b^2) <= min(a + b, a*sqrt(2))
// 1. So if b < eps(a)/2, then HYPOT(x, y) = a.
//
// - Moreover, the exponent part of HYPOT(x, y) is either the same or 1 more
// than the exponent part of a.
//
// 2. For the remaining cases, we will use the digit-by-digit (shift-and-add)
// algorithm to compute SQRT(Z):
//
// - For Y = y0.y1...yn... = SQRT(Z),
// let Y(n) = y0.y1...yn be the first n fractional digits of Y.
//
// - The nth scaled residual R(n) is defined to be:
// R(n) = 2^n * (Z - Y(n)^2)
//
// - Since Y(n) = Y(n - 1) + yn * 2^(-n), the scaled residual
// satisfies the following recurrence formula:
// R(n) = 2*R(n - 1) - yn*(2*Y(n - 1) + 2^(-n)),
// with the initial conditions:
// Y(0) = y0, and R(0) = Z - y0.
//
// - So the nth fractional digit of Y = SQRT(Z) can be decided by:
// yn = 1 if 2*R(n - 1) >= 2*Y(n - 1) + 2^(-n),
// 0 otherwise.
//
// 3. Precision analysis:
//
// - Notice that in the decision function:
// 2*R(n - 1) >= 2*Y(n - 1) + 2^(-n),
// the right hand side only uses up to the 2^(-n)-bit, and both sides are
// non-negative, so R(n - 1) can be truncated at the 2^(-(n + 1))-bit, so
// that 2*R(n - 1) is corrected up to the 2^(-n)-bit.
//
// - Thus, in order to round SQRT(a^2 + b^2) correctly up to n-fractional
// bits, we need to perform the summation (a^2 + b^2) correctly up to (2n +
// 2)-fractional bits, and the remaining bits are sticky bits (i.e. we only
// care if they are 0 or > 0), and the comparisons, additions/subtractions
// can be done in n-fractional bits precision.
//
// - For single precision (float), we can use uint64_t to store the sum a^2 +
// b^2 exact up to (2n + 2)-fractional bits.
//
// - Then we can feed this sum into the digit-by-digit algorithm for SQRT(Z)
// described above.
//
//
// Special cases:
// - HYPOT(x, y) is +Inf if x or y is +Inf or -Inf; else
// - HYPOT(x, y) is NaN if x or y is NaN.
//
float LLVM_LIBC_ENTRYPOINT(hypotf)(float x, float y) {
FPBits<float> x_bits(x), y_bits(y);
if (x_bits.isInf() || y_bits.isInf()) {
return FPBits<float>::inf();
}
if (x_bits.isNaN()) {
return x;
}
if (y_bits.isNaN()) {
return y;
}
uint16_t a_exp, b_exp, out_exp;
uint32_t a_mant, b_mant;
uint64_t a_mant_sq, b_mant_sq;
bool sticky_bits;
if ((x_bits.exponent >= y_bits.exponent + MantissaWidth<float>::value + 2) ||
(y == 0)) {
return abs(x);
} else if ((y_bits.exponent >=
x_bits.exponent + MantissaWidth<float>::value + 2) ||
(x == 0)) {
y_bits.sign = 0;
return abs(y);
}
if (x >= y) {
a_exp = x_bits.exponent;
a_mant = x_bits.mantissa;
b_exp = y_bits.exponent;
b_mant = y_bits.mantissa;
} else {
a_exp = y_bits.exponent;
a_mant = y_bits.mantissa;
b_exp = x_bits.exponent;
b_mant = x_bits.mantissa;
}
out_exp = a_exp;
// Add an extra bit to simplify the final rounding bit computation.
constexpr uint32_t one = 1U << (MantissaWidth<float>::value + 1);
a_mant <<= 1;
b_mant <<= 1;
uint32_t leading_one;
int y_mant_width;
if (a_exp != 0) {
leading_one = one;
a_mant |= one;
y_mant_width = MantissaWidth<float>::value + 1;
} else {
leading_one = findLeadingOne(a_mant, y_mant_width);
}
if (b_exp != 0) {
b_mant |= one;
}
a_mant_sq = static_cast<uint64_t>(a_mant) * a_mant;
b_mant_sq = static_cast<uint64_t>(b_mant) * b_mant;
// At this point, a_exp >= b_exp > a_exp - 25, so in order to line up aSqMant
// and bSqMant, we need to shift bSqMant to the right by (a_exp - b_exp) bits.
// But before that, remember to store the losing bits to sticky.
// The shift length is for a^2 and b^2, so it's double of the exponent
// difference between a and b.
uint16_t shift_length = 2 * (a_exp - b_exp);
sticky_bits = ((b_mant_sq & ((1ULL << shift_length) - 1)) != 0);
b_mant_sq >>= shift_length;
uint64_t sum = a_mant_sq + b_mant_sq;
if (sum >= (1ULL << (2 * y_mant_width + 2))) {
// a^2 + b^2 >= 4* leading_one^2, so we will need an extra bit to the left.
if (leading_one == one) {
// For normal result, we discard the last 2 bits of the sum and increase
// the exponent.
sticky_bits = sticky_bits || ((sum & 0x3U) != 0);
sum >>= 2;
++out_exp;
if (out_exp >= FPBits<float>::maxExponent) {
return FPBits<float>::inf();
}
} else {
// For denormal result, we simply move the leading bit of the result to
// the left by 1.
leading_one <<= 1;
++y_mant_width;
}
}
uint32_t Y = leading_one;
uint32_t R = static_cast<uint32_t>(sum >> y_mant_width) - leading_one;
uint32_t tailBits = static_cast<uint32_t>(sum) & (leading_one - 1);
for (uint32_t current_bit = leading_one >> 1; current_bit;
current_bit >>= 1) {
R = (R << 1) + ((tailBits & current_bit) ? 1 : 0);
uint32_t tmp = (Y << 1) + current_bit; // 2*y(n - 1) + 2^(-n)
if (R >= tmp) {
R -= tmp;
Y += current_bit;
}
}
bool round_bit = Y & 1U;
bool lsb = Y & 2U;
if (Y >= one) {
Y -= one;
if (out_exp == 0) {
out_exp = 1;
}
}
Y >>= 1;
// Round to the nearest, tie to even.
if (round_bit && (lsb || sticky_bits || (R != 0))) {
++Y;
}
if (Y >= (one >> 1)) {
Y -= one >> 1;
++out_exp;
if (out_exp >= FPBits<float>::maxExponent) {
return FPBits<float>::inf();
}
}
Y |= static_cast<uint32_t>(out_exp) << MantissaWidth<float>::value;
return *reinterpret_cast<float *>(&Y);
}
} // namespace __llvm_libc