//===----------------------Hexagon builtin routine ------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG
#define END(TAG) .size TAG,.-TAG

// Double Precision Multiply


#define A r1:0
#define AH r1
#define AL r0
#define B r3:2
#define BH r3
#define BL r2
#define C r5:4
#define CH r5
#define CL r4



#define BTMP r15:14
#define BTMPH r15
#define BTMPL r14

#define ATMP r13:12
#define ATMPH r13
#define ATMPL r12

#define CTMP r11:10
#define CTMPH r11
#define CTMPL r10

#define PP_LL r9:8
#define PP_LL_H r9
#define PP_LL_L r8

#define PP_ODD r7:6
#define PP_ODD_H r7
#define PP_ODD_L r6


#define PP_HH r17:16
#define PP_HH_H r17
#define PP_HH_L r16

#define EXPA r18
#define EXPB r19
#define EXPBA r19:18

#define TMP r28

#define P_TMP p0
#define PROD_NEG p3
#define EXACT p2
#define SWAP p1

#define MANTBITS 52
#define HI_MANTBITS 20
#define EXPBITS 11
#define BIAS 1023
#define STACKSPACE 32

#define ADJUST 4

#define FUDGE 7
#define FUDGE2 3

#ifndef SR_ROUND_OFF
#define SR_ROUND_OFF 22
#endif
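
	// The two-bit IEEE rounding-mode field of the Hexagon user status
	// register (USR) starts at bit SR_ROUND_OFF.  Illustrative C model of
	// the extraction used below (the only property this file relies on is
	// that a field value of 2 rounds toward negative infinity):
	//
	//   unsigned round_mode(unsigned usr) {
	//       return (usr >> SR_ROUND_OFF) & 0x3;  // extractu(...,#2,#SR_ROUND_OFF)
	//   }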

	// First, classify for normal values, and abort if abnormal
	//
	// Next, unpack mantissa into 0x1000_0000_0000_0000 + mant<<8
	//
	// Since we know that the 2 MSBs of the H registers are zero, the partial
	// products that involve the H registers can never carry out
	//
	// Try to buy X slots, at the expense of latency if needed
	//
	// We will have PP_HH with the upper bits of the product, PP_LL with the lower
	// PP_HH can have a maximum of 0x03FF_FFFF_FFFF_FFFF or thereabouts
	// PP_HH can have a minimum of 0x0100_0000_0000_0000
	//
	// 0x0100_0000_0000_0000 has EXP of EXPA+EXPB-BIAS
	//
	// We need to align CTMP.
	// If CTMP >> PP, convert PP to 64 bits with sticky, align CTMP, and follow the normal add path
	// If CTMP << PP, align CTMP and do a 128-bit add, then compute sticky
	// If CTMP ~= PP, align CTMP and do a 128-bit add; we may have massive cancellation
	//
	// Convert partial product and CTMP to 2's complement prior to addition
	//
	// After we add, we need to normalize into upper 64 bits, then compute sticky.
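	//
	// As an illustrative C sketch of the flow described above (mant(),
	// exp_field() and the 128-bit type are hypothetical helpers, not the
	// exact code below):
	//
	//   // unpack: implicit bit at bit 60, mantissa at bits 8..59
	//   uint64_t ma = (1ULL << 60) | (mant(a) << 8);
	//   uint64_t mb = (1ULL << 60) | (mant(b) << 8);
	//   uint128 pp = (uint128)ma * mb;                  // PP_HH:PP_LL
	//   int pe = exp_field(a) + exp_field(b) - BIAS + ADJUST;
	//   // 2's-complement the negative addend(s), align the smaller one
	//   // against the larger (shifted-out bits become sticky), do a
	//   // 128-bit add, renormalize into the upper 64 bits, then convert
	//   // and patch the exponent field.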

	.text
	.global __hexagon_fmadf4
	.type __hexagon_fmadf4,@function
	.global __hexagon_fmadf5
	.type __hexagon_fmadf5,@function
	Q6_ALIAS(fmadf5)
	.p2align 5
__hexagon_fmadf4:
__hexagon_fmadf5:
.Lfma_begin:
	{
		P_TMP = dfclass(A,#2)
		P_TMP = dfclass(B,#2)
		ATMP = #0
		BTMP = #0
	}
	{
		ATMP = insert(A,#MANTBITS,#EXPBITS-3)
		BTMP = insert(B,#MANTBITS,#EXPBITS-3)
		PP_ODD_H = ##0x10000000
		allocframe(#STACKSPACE)
	}
	{
		PP_LL = mpyu(ATMPL,BTMPL)
		if (!P_TMP) jump .Lfma_abnormal_ab
		ATMPH = or(ATMPH,PP_ODD_H)
		BTMPH = or(BTMPH,PP_ODD_H)
	}
	{
		P_TMP = dfclass(C,#2)
		if (!P_TMP.new) jump:nt .Lfma_abnormal_c
		CTMP = combine(PP_ODD_H,#0)
		PP_ODD = combine(#0,PP_LL_H)
	}
.Lfma_abnormal_c_restart:
	{
		PP_ODD += mpyu(BTMPL,ATMPH)
		CTMP = insert(C,#MANTBITS,#EXPBITS-3)
		memd(r29+#0) = PP_HH
		memd(r29+#8) = EXPBA
	}
	{
		PP_ODD += mpyu(ATMPL,BTMPH)
		EXPBA = neg(CTMP)
		P_TMP = cmp.gt(CH,#-1)
		TMP = xor(AH,BH)
	}
	{
		EXPA = extractu(AH,#EXPBITS,#HI_MANTBITS)
		EXPB = extractu(BH,#EXPBITS,#HI_MANTBITS)
		PP_HH = combine(#0,PP_ODD_H)
		if (!P_TMP) CTMP = EXPBA
	}
	{
		PP_HH += mpyu(ATMPH,BTMPH)
		PP_LL = combine(PP_ODD_L,PP_LL_L)
#undef PP_ODD
#undef PP_ODD_H
#undef PP_ODD_L
#undef ATMP
#undef ATMPL
#undef ATMPH
#undef BTMP
#undef BTMPL
#undef BTMPH
#define RIGHTLEFTSHIFT r13:12
#define RIGHTSHIFT r13
#define LEFTSHIFT r12

		EXPA = add(EXPA,EXPB)
#undef EXPB
#undef EXPBA
#define EXPC r19
#define EXPCA r19:18
		EXPC = extractu(CH,#EXPBITS,#HI_MANTBITS)
	}
	// PP_HH:PP_LL now has product
	// CTMP is negated
	// EXPA,B,C are extracted
	// We need to negate PP
	// Since we will be adding with carry later, if we need to negate,
	// just invert all bits now, which we can do conditionally and in parallel
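	// Illustrative C model of the schoolbook 64x64->128 multiply formed
	// above from four 32x32 mpyu partial products (hi()/lo() are
	// hypothetical 32-bit half accessors; the middle sum cannot overflow
	// 64 bits because the top bits of the high halves are clear):
	//
	//   uint64_t ll  = (uint64_t)lo(ma) * lo(mb);                // PP_LL
	//   uint64_t mid = hi(ll) + (uint64_t)lo(mb) * hi(ma)
	//                         + (uint64_t)lo(ma) * hi(mb);       // PP_ODD
	//   uint64_t hh  = hi(mid) + (uint64_t)hi(ma) * hi(mb);      // PP_HH
	//   uint64_t lo64 = ((uint64_t)lo(mid) << 32) | lo(ll);      // new PP_LL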
#define PP_HH_TMP r15:14
#define PP_LL_TMP r7:6
	{
		EXPA = add(EXPA,#-BIAS+(ADJUST))
		PROD_NEG = !cmp.gt(TMP,#-1)
		PP_LL_TMP = #0
		PP_HH_TMP = #0
	}
	{
		PP_LL_TMP = sub(PP_LL_TMP,PP_LL,PROD_NEG):carry
		P_TMP = !cmp.gt(TMP,#-1)
		SWAP = cmp.gt(EXPC,EXPA)	// If C >> PP
		if (SWAP.new) EXPCA = combine(EXPA,EXPC)
	}
	{
		PP_HH_TMP = sub(PP_HH_TMP,PP_HH,PROD_NEG):carry
		if (P_TMP) PP_LL = PP_LL_TMP
#undef PP_LL_TMP
#define CTMP2 r7:6
#define CTMP2H r7
#define CTMP2L r6
		CTMP2 = #0
		EXPC = sub(EXPA,EXPC)
	}
	{
		if (P_TMP) PP_HH = PP_HH_TMP
		P_TMP = cmp.gt(EXPC,#63)
		if (SWAP) PP_LL = CTMP2
		if (SWAP) CTMP2 = PP_LL
	}
#undef PP_HH_TMP
//#define ONE r15:14
//#define S_ONE r14
#define ZERO r15:14
#define S_ZERO r15
#undef PROD_NEG
#define P_CARRY p3
	{
		if (SWAP) PP_HH = CTMP	// Swap C and PP
		if (SWAP) CTMP = PP_HH
		if (P_TMP) EXPC = add(EXPC,#-64)
		TMP = #63
	}
	{
		// If diff > 63, pre-shift-right by 64...
		if (P_TMP) CTMP2 = CTMP
		TMP = asr(CTMPH,#31)
		RIGHTSHIFT = min(EXPC,TMP)
		LEFTSHIFT = #0
	}
#undef C
#undef CH
#undef CL
#define STICKIES r5:4
#define STICKIESH r5
#define STICKIESL r4
	{
		if (P_TMP) CTMP = combine(TMP,TMP)	// sign extension of pre-shift-right-64
		STICKIES = extract(CTMP2,RIGHTLEFTSHIFT)
		CTMP2 = lsr(CTMP2,RIGHTSHIFT)
		LEFTSHIFT = sub(#64,RIGHTSHIFT)
	}
	{
		ZERO = #0
		TMP = #-2
		CTMP2 |= lsl(CTMP,LEFTSHIFT)
		CTMP = asr(CTMP,RIGHTSHIFT)
	}
	{
		P_CARRY = cmp.gtu(STICKIES,ZERO)	// If we have sticky bits from C shift
		if (P_CARRY.new) CTMP2L = and(CTMP2L,TMP) // make sure adding 1 == OR
#undef ZERO
#define ONE r15:14
#define S_ONE r14
		ONE = #1
		STICKIES = #0
	}
	{
		PP_LL = add(CTMP2,PP_LL,P_CARRY):carry	// use the carry to add the sticky
	}
	{
		PP_HH = add(CTMP,PP_HH,P_CARRY):carry
		TMP = #62
	}
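	// The alignment-and-add above corresponds to this C model (illustrative;
	// assumes rs > 0, and that a shift by 64 yields 0 as it does in hardware):
	//
	//   if (diff > 63) { c_lo = c_hi; c_hi = sign_word; diff -= 64; }
	//   int rs = diff < 63 ? diff : 63;                 // min(EXPC,#63)
	//   uint64_t sticky = c_lo & ((1ULL << rs) - 1);    // extract(CTMP2,...)
	//   c_lo = (c_lo >> rs) | ((uint64_t)c_hi << (64 - rs));
	//   c_hi = (int64_t)c_hi >> rs;
	//   // fold sticky into the add: clear bit 0 so carry-in 1 acts as OR
	//   if (sticky) c_lo &= ~1ULL;
	//   sum128 = (c_hi:c_lo) + (pp_hi:pp_lo) + (sticky != 0);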
	// PP_HH:PP_LL now holds the sum
	// We may need to normalize left, up to 62 bits at a time (the
	// all-sign-bits case below pre-shifts by 62 and retries).
	//
	// Even with massive cancellation, the range we normalize by is limited
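	// Illustrative C model of the normalization (clb() standing in for
	// Hexagon's count-leading-sign-bits):
	//
	//   int ls = clb(sum_hi) - 2;                   // keep 2 bits of headroom
	//   if (ls == 62) {                             // hi word is all sign bits
	//       sum <<= 62; exp -= 62; ls = clb(sum_hi) - 2;
	//   }
	//   result  = (sum_hi << ls) | (sum_lo >> (64 - ls));
	//   sticky |= sum_lo << ls;                     // bits that fall off the end
	//   exp    -= ls;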
	{
		LEFTSHIFT = add(clb(PP_HH),#-2)
		if (!cmp.eq(LEFTSHIFT.new,TMP)) jump:t 1f	// all sign bits?
	}
	// We had all sign bits, shift left by 62.
	{
		CTMP = extractu(PP_LL,#62,#2)
		PP_LL = asl(PP_LL,#62)
		EXPA = add(EXPA,#-62)			// And adjust exponent of result
	}
	{
		PP_HH = insert(CTMP,#62,#0)		// merge in the top 62 bits of PP_LL
	}
	{
		LEFTSHIFT = add(clb(PP_HH),#-2)
	}
	.falign
1:
	{
		CTMP = asl(PP_HH,LEFTSHIFT)
		STICKIES |= asl(PP_LL,LEFTSHIFT)
		RIGHTSHIFT = sub(#64,LEFTSHIFT)
		EXPA = sub(EXPA,LEFTSHIFT)
	}
	{
		CTMP |= lsr(PP_LL,RIGHTSHIFT)
		EXACT = cmp.gtu(ONE,STICKIES)
		TMP = #BIAS+BIAS-2
	}
	{
		if (!EXACT) CTMPL = or(CTMPL,S_ONE)
		// If EXPA is overflow/underflow, jump to ovf_unf
		P_TMP = !cmp.gt(EXPA,TMP)
		P_TMP = cmp.gt(EXPA,#1)
		if (!P_TMP.new) jump:nt .Lfma_ovf_unf
	}
	{
		// XXX: FIXME: should PP_HH for check of zero be CTMP?
		P_TMP = cmp.gtu(ONE,CTMP)		// is result true zero?
		A = convert_d2df(CTMP)
		EXPA = add(EXPA,#-BIAS-60)
		PP_HH = memd(r29+#0)
	}
	{
		AH += asl(EXPA,#HI_MANTBITS)
		EXPCA = memd(r29+#8)
		if (!P_TMP) dealloc_return		// not zero, return
	}
.Ladd_yields_zero:
	// We had full cancellation.  Return +/- zero (-0 when round-down)
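	// Illustratively, in C: IEEE 754 gives an exact zero sum a sign chosen
	// by the rounding mode:
	//
	//   return round_mode() == 2 ? -0.0 : +0.0;    // 2 == toward -infinity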
	{
		TMP = USR
		A = #0
	}
	{
		TMP = extractu(TMP,#2,#SR_ROUND_OFF)
		PP_HH = memd(r29+#0)
		EXPCA = memd(r29+#8)
	}
	{
		p0 = cmp.eq(TMP,#2)
		if (p0.new) AH = ##0x80000000
		dealloc_return
	}

#undef RIGHTLEFTSHIFT
#undef RIGHTSHIFT
#undef LEFTSHIFT
#undef CTMP2
#undef CTMP2H
#undef CTMP2L

.Lfma_ovf_unf:
	{
		p0 = cmp.gtu(ONE,CTMP)
		if (p0.new) jump:nt .Ladd_yields_zero
	}
	{
		A = convert_d2df(CTMP)
		EXPA = add(EXPA,#-BIAS-60)
		TMP = EXPA
	}
#define NEW_EXPB r7
#define NEW_EXPA r6
	{
		AH += asl(EXPA,#HI_MANTBITS)
		NEW_EXPB = extractu(AH,#EXPBITS,#HI_MANTBITS)
	}
	{
		NEW_EXPA = add(EXPA,NEW_EXPB)
		PP_HH = memd(r29+#0)
		EXPCA = memd(r29+#8)
#undef PP_HH
#undef PP_HH_H
#undef PP_HH_L
#undef EXPCA
#undef EXPC
#undef EXPA
#undef PP_LL
#undef PP_LL_H
#undef PP_LL_L
#define EXPA r6
#define EXPB r7
#define EXPBA r7:6
#define ATMP r9:8
#define ATMPH r9
#define ATMPL r8
#undef NEW_EXPB
#undef NEW_EXPA
		ATMP = abs(CTMP)
	}
	{
		p0 = cmp.gt(EXPA,##BIAS+BIAS)
		if (p0.new) jump:nt .Lfma_ovf
	}
	{
		p0 = cmp.gt(EXPA,#0)
		if (p0.new) jump:nt .Lpossible_unf
	}
	{
		// TMP has original EXPA.
		// ATMP is corresponding value
		// Normalize ATMP and shift right to correct location
		EXPB = add(clb(ATMP),#-2)		// Amount to left shift to normalize
		EXPA = sub(#1+5,TMP)			// Amount to right shift to denormalize
		p3 = cmp.gt(CTMPH,#-1)
	}
	// Underflow
	// We know that the infinite-range exponent should be EXPA
	// CTMP is 2's complement, ATMP is abs(CTMP)
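	// In outline (illustrative): renormalize |sum| while tracking the true
	// exponent, shift right into denormal position with every lost bit ORed
	// into a sticky bit, OR the 0x0030 status mask into USR (underflow and
	// inexact, by the usage here) when anything was lost, then convert and
	// patch the exponent field.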
	{
		EXPA = add(EXPA,EXPB)		// how much to shift back right
		ATMP = asl(ATMP,EXPB)		// shift left
		AH = USR
		TMP = #63
	}
	{
		EXPB = min(EXPA,TMP)
		EXPA = #0
		AL = #0x0030
	}
	{
		B = extractu(ATMP,EXPBA)
		ATMP = asr(ATMP,EXPB)
	}
	{
		p0 = cmp.gtu(ONE,B)
		if (!p0.new) ATMPL = or(ATMPL,S_ONE)
		ATMPH = setbit(ATMPH,#HI_MANTBITS+FUDGE2)
	}
	{
		CTMP = neg(ATMP)
		p1 = bitsclr(ATMPL,#(1<<FUDGE2)-1)
		if (!p1.new) AH = or(AH,AL)
		B = #0
	}
	{
		if (p3) CTMP = ATMP
		USR = AH
		TMP = #-BIAS-(MANTBITS+FUDGE2)
	}
	{
		A = convert_d2df(CTMP)
	}
	{
		AH += asl(TMP,#HI_MANTBITS)
		dealloc_return
	}
.Lpossible_unf:
	{
		TMP = ##0x7fefffff
		ATMP = abs(CTMP)
	}
	{
		p0 = cmp.eq(AL,#0)
		p0 = bitsclr(AH,TMP)
		if (!p0.new) dealloc_return:t
		TMP = #0x7fff
	}
	{
		p0 = bitsset(ATMPH,TMP)
		BH = USR
		BL = #0x0030
	}
	{
		if (p0) BH = or(BH,BL)
	}
	{
		USR = BH
	}
	{
		p0 = dfcmp.eq(A,A)
		dealloc_return
	}
.Lfma_ovf:
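	// Illustratively, in C (USR mode encoding implied by the predicate
	// below: 1 toward-zero, 2 toward -inf, 3 toward +inf; note BL captures
	// the pre-xor mode because packets read pre-packet register values):
	//
	//   if (mode == 1 ||                      // toward zero, either sign
	//       (mode ^ signbit) == 2)            // -inf with +x, or +inf with -x
	//       return copysign(DBL_MAX, result); // and raise overflow|inexact
	//   return copysign(INFINITY, result);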
	{
		TMP = USR
		CTMP = combine(##0x7fefffff,#-1)
		A = CTMP
	}
	{
		ATMP = combine(##0x7ff00000,#0)
		BH = extractu(TMP,#2,#SR_ROUND_OFF)
		TMP = or(TMP,#0x28)
	}
	{
		USR = TMP
		BH ^= lsr(AH,#31)
		BL = BH
	}
	{
		p0 = !cmp.eq(BL,#1)
		p0 = !cmp.eq(BH,#2)
	}
	{
		p0 = dfcmp.eq(ATMP,ATMP)
		if (p0.new) CTMP = ATMP
	}
	{
		A = insert(CTMP,#63,#0)
		dealloc_return
	}
#undef CTMP
#undef CTMPH
#undef CTMPL
#define BTMP r11:10
#define BTMPH r11
#define BTMPL r10

#undef STICKIES
#undef STICKIESH
#undef STICKIESL
#define C r5:4
#define CH r5
#define CL r4

.Lfma_abnormal_ab:
	{
		ATMP = extractu(A,#63,#0)
		BTMP = extractu(B,#63,#0)
		deallocframe
	}
	{
		p3 = cmp.gtu(ATMP,BTMP)
		if (!p3.new) A = B		// sort values
		if (!p3.new) B = A
	}
	{
		p0 = dfclass(A,#0x0f)		// A NaN?
		if (!p0.new) jump:nt .Lnan
		if (!p3) ATMP = BTMP
		if (!p3) BTMP = ATMP
	}
	{
		p1 = dfclass(A,#0x08)		// A is infinity
		p1 = dfclass(B,#0x0e)		// B is nonzero
	}
	{
		p0 = dfclass(A,#0x08)		// A is inf
		p0 = dfclass(B,#0x01)		// B is zero
	}
	{
		if (p1) jump .Lab_inf
		p2 = dfclass(B,#0x01)
	}
	{
		if (p0) jump .Linvalid
		if (p2) jump .Lab_true_zero
		TMP = ##0x7c000000
	}
	// We are left with a normal or subnormal times a subnormal, with A > B.
	// If A and B are both very small, the product collapses to a single sticky
	// bit; replacing the lower 63 bits of both A and B with
	// 0x0010_0000_0000_0000 yields equivalent results.
	// If A and B might multiply to something bigger, decrease A's exponent,
	// increase B's exponent, and start over.
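	// Illustrative C model of this rescaling (low63()/set_low63()/
	// adjust_exp() are hypothetical helpers):
	//
	//   if (exp_field(a) < 0x40) {                  // bitsclr(AH,##0x7c000000)
	//       // both tiny: the product is pure sticky; a fixed stand-in with
	//       // the same rounding behavior works for both operands
	//       a = set_low63(a, 0x0010000000000000ULL);
	//       b = set_low63(b, 0x0010000000000000ULL);
	//   } else {
	//       int n = clb(low63(b)) - EXPBITS;        // normalize B's mantissa
	//       b = set_low63(b, low63(b) << n);        // B now reads as normal
	//       a = adjust_exp(a, -n);                  // keeps a*b unchanged
	//   }
	//   goto fma_begin;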
	{
		p0 = bitsclr(AH,TMP)
		if (p0.new) jump:nt .Lfma_ab_tiny
	}
	{
		TMP = add(clb(BTMP),#-EXPBITS)
	}
	{
		BTMP = asl(BTMP,TMP)
	}
	{
		B = insert(BTMP,#63,#0)
		AH -= asl(TMP,#HI_MANTBITS)
	}
	jump .Lfma_begin

.Lfma_ab_tiny:
	ATMP = combine(##0x00100000,#0)
	{
		A = insert(ATMP,#63,#0)
		B = insert(ATMP,#63,#0)
	}
	jump .Lfma_begin

.Lab_inf:
	{
		B = lsr(B,#63)
		p0 = dfclass(C,#0x10)
	}
	{
		A ^= asl(B,#63)
		if (p0) jump .Lnan
	}
	{
		p1 = dfclass(C,#0x08)
		if (p1.new) jump:nt .Lfma_inf_plus_inf
	}
	// A*B is +/- inf, C is finite.  Return A
	{
		jumpr r31
	}
	.falign
.Lfma_inf_plus_inf:
	{	// adding infinities of different signs is invalid
		p0 = dfcmp.eq(A,C)
		if (!p0.new) jump:nt .Linvalid
	}
	{
		jumpr r31
	}

.Lnan:
	{
		p0 = dfclass(B,#0x10)
		p1 = dfclass(C,#0x10)
		if (!p0.new) B = A
		if (!p1.new) C = A
	}
	{	// find sNaNs: convert_df2sf raises invalid if an input is signaling
		BH = convert_df2sf(B)
		BL = convert_df2sf(C)
	}
	{
		BH = convert_df2sf(A)
		A = #-1
		jumpr r31
	}

.Linvalid:
	{
		TMP = ##0x7f800001		// sp snan
	}
	{
		A = convert_sf2df(TMP)
		jumpr r31
	}

.Lab_true_zero:
	// B is zero, A is finite number
	{
		p0 = dfclass(C,#0x10)
		if (p0.new) jump:nt .Lnan
		if (p0.new) A = C
	}
	{
		p0 = dfcmp.eq(B,C)		// is C also zero?
		AH = lsr(AH,#31)		// get sign
	}
	{
		BH ^= asl(AH,#31)		// form correctly signed zero in B
		if (!p0) A = C			// If C is not zero, return C
		if (!p0) jumpr r31
	}
	// B has correctly signed zero, C is also zero
.Lzero_plus_zero:
	{
		p0 = cmp.eq(B,C)		// scalar equal: (+0)+(+0) or (-0)+(-0)
		if (p0.new) jumpr:t r31
		A = B
	}
	{
		TMP = USR
	}
	{
		TMP = extractu(TMP,#2,#SR_ROUND_OFF)
		A = #0
	}
	{
		p0 = cmp.eq(TMP,#2)
		if (p0.new) AH = ##0x80000000
		jumpr r31
	}
#undef BTMP
#undef BTMPH
#undef BTMPL
#define CTMP r11:10
	.falign
.Lfma_abnormal_c:
	// We know that AB is normal * normal
	// C is not normal: zero, subnormal, inf, or NaN.
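	// In outline (illustrative):
	//
	//   if (isnan(c)) goto nan;          // propagate, signaling if sNaN
	//   if (isinf(c)) return c;
	//   if (c == 0.0) return a * b;      // tail-call __hexagon_muldf3
	//   // else subnormal: seed CTMP with no implicit bit, give C an
	//   // exponent field of 1 (the effective exponent of denormals),
	//   // and restart the main path.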
	{
		p0 = dfclass(C,#0x10)		// is C NaN?
		if (p0.new) jump:nt .Lnan
		if (p0.new) A = C		// move NaN to A
		deallocframe
	}
	{
		p0 = dfclass(C,#0x08)		// is C inf?
		if (p0.new) A = C		// return C
		if (p0.new) jumpr:nt r31
	}
	// zero or subnormal
	// If we have a zero, and we know AB is normal*normal, we can just call normal multiply
	{
		p0 = dfclass(C,#0x01)		// is C zero?
		if (p0.new) jump:nt __hexagon_muldf3
		TMP = #1
	}
	// Left with: subnormal
	// Adjust C and jump back to restart
	{
		allocframe(#STACKSPACE)		// oops, deallocated above, re-allocate frame
		CTMP = #0
		CH = insert(TMP,#EXPBITS,#HI_MANTBITS)
		jump .Lfma_abnormal_c_restart
	}
END(__hexagon_fmadf4)