//===-- udivmodti4.c - Implement __udivmodti4 -----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements __udivmodti4 for the compiler_rt library.
//
//===----------------------------------------------------------------------===//
#include "int_lib.h"
#ifdef CRT_HAS_128BIT
// Returns the quotient of the 128-bit dividend u1:u0 divided by the 64-bit
// divisor v; the quotient must fit in 64 bits. The remainder is stored in *r.
// Taken and adjusted from libdivide's libdivide_128_div_64_to_64 division
// fallback. For a correctness proof see the reference for this algorithm
// in Knuth, Volume 2, Section 4.3.1, Algorithm D.
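// Sketch of the steps below (two-digit schoolbook division in base 2^32):
// normalize so the divisor's most significant bit is set, split the
// normalized operands into 32-bit digits, estimate and correct two quotient
// digits, then denormalize the remainder.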
UNUSED
static inline du_int udiv128by64to64default(du_int u1, du_int u0, du_int v,
                                            du_int *r) {
  const unsigned n_udword_bits = sizeof(du_int) * CHAR_BIT;
  const du_int b = (1ULL << (n_udword_bits / 2)); // Number base (32 bits)
  du_int un1, un0;                                // Norm. dividend LSD's
  du_int vn1, vn0;                                // Norm. divisor digits
  du_int q1, q0;                                  // Quotient digits
  du_int un64, un21, un10;                        // Dividend digit pairs
  du_int rhat;                                    // A remainder
  si_int s;                                       // Shift amount for normalization

  s = __builtin_clzll(v);
  if (s > 0) {
    // Normalize the divisor.
    v = v << s;
    un64 = (u1 << s) | (u0 >> (n_udword_bits - s));
    un10 = u0 << s; // Shift dividend left
  } else {
    // Avoid undefined behavior of (u0 >> 64).
    un64 = u1;
    un10 = u0;
  }
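
  // Normalization guarantees the divisor's top bit is set (so vn1 >= b / 2),
  // which is what bounds the error of the digit estimates below to at most 2.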
  // Break divisor up into two 32-bit digits.
  vn1 = v >> (n_udword_bits / 2);
  vn0 = v & 0xFFFFFFFF;

  // Break right half of dividend into two digits.
  un1 = un10 >> (n_udword_bits / 2);
  un0 = un10 & 0xFFFFFFFF;

  // Compute the first quotient digit, q1.
  q1 = un64 / vn1;
  rhat = un64 - q1 * vn1;

  // q1 has at most error 2. No more than 2 iterations.
  while (q1 >= b || q1 * vn0 > b * rhat + un1) {
    q1 = q1 - 1;
    rhat = rhat + vn1;
    if (rhat >= b)
      break;
  }
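
  // Multiply and subtract: fold q1's contribution out of the dividend to get
  // the partial remainder used for the second digit.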
  un21 = un64 * b + un1 - q1 * v;

  // Compute the second quotient digit.
  q0 = un21 / vn1;
  rhat = un21 - q0 * vn1;

  // q0 has at most error 2. No more than 2 iterations.
  while (q0 >= b || q0 * vn0 > b * rhat + un0) {
    q0 = q0 - 1;
    rhat = rhat + vn1;
    if (rhat >= b)
      break;
  }
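
  // Denormalize: the remainder was computed on operands shifted left by s, so
  // shift it back right by s. The quotient needs no adjustment.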
  *r = (un21 * b + un0 - q0 * v) >> s;
  return q1 * b + q0;
}

static inline du_int udiv128by64to64(du_int u1, du_int u0, du_int v,
                                     du_int *r) {
#if defined(__x86_64__)
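  // The divq instruction divides the 128-bit value in rdx:rax by its operand,
  // leaving the quotient in rax and the remainder in rdx. The caller must
  // ensure the quotient fits in 64 bits, otherwise divq faults (#DE); that is
  // the same precondition documented for the fallback above.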
  du_int result;
  __asm__("divq %[v]"
          : "=a"(result), "=d"(*r)
          : [ v ] "r"(v), "a"(u0), "d"(u1));
  return result;
#else
  return udiv128by64to64default(u1, u0, v, r);
#endif
}
// Effects: if rem != 0, *rem = a % b
// Returns: a / b
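// Strategy: if the divisor exceeds the dividend, the quotient is zero; if the
// divisor fits in 64 bits, the work reduces to one or two 64-bit divisions;
// otherwise fall back to bit-by-bit shift-and-subtract long division.
// In compiler-rt, __udivti3 and __umodti3 call this function, so unsigned
// __int128 division and modulo typically lower to it.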
COMPILER_RT_ABI tu_int __udivmodti4(tu_int a, tu_int b, tu_int *rem) {
  const unsigned n_utword_bits = sizeof(tu_int) * CHAR_BIT;
  utwords dividend;
  dividend.all = a;
  utwords divisor;
  divisor.all = b;
  utwords quotient;
  utwords remainder;
  if (divisor.all > dividend.all) {
    if (rem)
      *rem = dividend.all;
    return 0;
  }
  // When the divisor fits in 64 bits, we can use an optimized path.
  if (divisor.s.high == 0) {
    remainder.s.high = 0;
    if (dividend.s.high < divisor.s.low) {
      // The result fits in 64 bits.
      quotient.s.low = udiv128by64to64(dividend.s.high, dividend.s.low,
                                       divisor.s.low, &remainder.s.low);
      quotient.s.high = 0;
    } else {
      // First, divide the high part to leave its remainder in dividend.s.high.
      // After that, dividend.s.high < divisor.s.low.
      quotient.s.high = dividend.s.high / divisor.s.low;
      dividend.s.high = dividend.s.high % divisor.s.low;
      quotient.s.low = udiv128by64to64(dividend.s.high, dividend.s.low,
                                       divisor.s.low, &remainder.s.low);
    }
    if (rem)
      *rem = remainder.all;
    return quotient.all;
  }
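  // Both operands need more than 64 bits. Since divisor.all <= dividend.all
  // here, clzll(dividend.s.high) <= clzll(divisor.s.high), so the shift below
  // is non-negative.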
  // 0 <= shift <= 63.
  si_int shift =
      __builtin_clzll(divisor.s.high) - __builtin_clzll(dividend.s.high);
  divisor.all <<= shift;
  quotient.s.high = 0;
  quotient.s.low = 0;
  for (; shift >= 0; --shift) {
    quotient.s.low <<= 1;
    // Branch-free version of:
    // if (dividend.all >= divisor.all) {
    //   dividend.all -= divisor.all;
    //   carry = 1;
    // }
    const ti_int s =
        (ti_int)(divisor.all - dividend.all - 1) >> (n_utword_bits - 1);
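    // s is all ones when dividend.all >= divisor.all and zero otherwise, so it
    // masks both the quotient bit and the conditional subtraction below.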
    quotient.s.low |= s & 1;
    dividend.all -= divisor.all & s;
    divisor.all >>= 1;
  }
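  // The loop ran shift + 1 times, emitting one quotient bit per iteration;
  // whatever is left in dividend.all is the remainder.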
  if (rem)
    *rem = dividend.all;
  return quotient.all;
}
#endif // CRT_HAS_128BIT