# Michael Riff # Assembler rounding software version 1.0 March 2024. # Actual library. The rounding routines return directly and integer type result # instead of a floating point value. # For each rounding, 2 routines are defined: 1 checnging the settings in register # FPSCR and using instruction fctiw, the other not touching the resgister and # using instruction fctiwz. # At the end a routine reading out t rouding slection from FPSCR, and a routine # changing FPSCR to round to nearrest and converting an array of values. dialect PowerPC file 'floor.s' # Only the sections can be exported (dos not work with labels) # export my_floor1[DS] export .my_floor1[PR] => 'floor1_s' # export my_floor2[DS] export .my_floor2[PR] => 'floor2_s' # export my_ceil1[DS] export .my_ceil1[PR] => 'ceil1_s' # export my_ceil2[DS] export .my_ceil2[PR] => 'ceil2_s' # export my_round1[DS] export .my_round1[PR] => 'round1_s' # export my_round2[DS] export .my_round2[PR] => 'round2_s' # export my_test[DS] export .my_test[PR] => 'test_s' # export rndsel[DS] export .rndsel[PR] => 'roundSetting' # export rndarray[DS] export .rndarray[PR] => 'roundArray' Flt_Cst: equ 1 # TOC and transition vectors only necessary if cross TOC calls are implemented (separate code fragments) toc start1: tc my_floor1[TC], my_floor1[DS] start2: tc my_floor2[TC], my_floor2[DS] IFDEF Flt_Cst Csts: tc my_const[TC], my_const[RW] ENDIF linkageArea: equ 24 calleesParams: set 0 calleesLocalVars: set 0 numFPRs: set 0 ########################## # FLOOR # Floor with change of FPSCR lower bits csect my_floor1[DS] dc.l .my_floor1[PR] dc.l TOC[tc0] csect .my_floor1[PR] my_floor1: # Get 64 bits aligned address and 8 bytes on the stack into r12 addi r12, r1, -15 rlwinm r12, r12, 0, 0, 28 # Save FPSCR at r12 address mffs fp13 stfd fp13, 0(r12) # If lower 2 bits already 11 round directly lwz r11, 4(r12) # stwx r11, 8(r12) # mr r10, r11 rlwinm r10, r11, 0, 30, 31 cmpi cr7,1,r10,3 bc 12,30, Round1 # beq Round1 # Set lower 2 bits of FPSCR (round towards -INF) ori r11,r11,3 stw r11, 4(r12) lfd fp12, 0(r12) mtfsf 255, fp12 fctiw fp11, fp1 stfd fp11, -8(r12) lwz r3, -4(r12) # Restore FPSCR # ldwx r10, 8(r12) # stwx r10, 4(r12) # lfd fp13, 0(r12) mtfsf 255, fp13 blr Round1: fctiw fp11, fp1 stfd fp11, -8(r12) lwz r3, -4(r12) blr # Floor without change of FPSCR lower bits csect my_floor2[DS] dc.l .my_floor2[PR] dc.l TOC[tc0] IFDEF Flt_Cst csect my_const[RW] ; [RO] does not work! Zero: dc.d "0.000000" Min: dc.d "-1.000000" Pls: dc.d "1.000000" ENDIF csect .my_floor2[PR] my_floor2: # Load 1/-1 and 0 into a Fpr IFDEF Flt_Cst lwz r10, my_const[TC](r2) # r10 points to constant 0.0 lfd fp12, Zero(r10) # fp12 = 0 ENDIF # Get 64 bits aligned address and 16 bytes on the stack into r11 addi r11, r1, -23 rlwinm r11, r11, 0, 0, 28 IFNDEF Flt_Cst # Set fp12 to 0 using integer registers and memory # addi r12, r0, 0 # r12 = 0 xor r12, r12, r12 # r12 = 0 stw r12, 0(r11) stw r12, 4(r11) lfd fp12, 0(r11) # = 0 ENDIF # Perform rounding fctiwz fp10, fp1 stfd fp10, 0(r11) # r11 points to rounded value # If rounded argument != argument and argument <0 subtract 1 # Check if argument is negative fcmpo cr6, fp1, fp12 bc 4,24,Ret1 # bge # Argument is negative. Check if the fractional part is nonzero addi r12,r11,8 # r12 points to original value stfd fp1, 0(r12) lwz r10, 0(r12) rlwinm r9,r10,12,21,31 addi r9, r9, -1023 # = exp # Handling of exponents <=-1 (-1,0 < float values < 0,0) # The result is always -1 cmpi cr7,0,r9,0 bc 4,28,PositExp1 xor r8, r8, r8 not r3, r8 blr PositExp1: cmpi cr5,0,r9,20 bc 4,20,LowPart1 # bge cr5 subfic r8,r9, 20 addi r7, r0,1 slw r7, r7, r8 addi r7, r7, -1 # mask for 20-exp lower bits and. r10,r10,r7 bc 4,2,Correct1 # bne: fractional part nonzero # Process all 32 bits of the lower part lwz r10, 4(r12) cmpi cr7,0,r10,0 bc 4,30,Correct1 # bne cr7 b Ret1 LowPart1: lwz r10, 4(r12) # Case were the complete 32 bits must be checked bc 12,21,Partly1 # bgt cr5 cmpi cr7, r10, 0 bc 12, 30, Ret1 # beq cr7 b Correct1 Partly1: subfic r8,r9, 52 # r8 < 32 addi r7, r0, 1 slw r7, r7, r8 addi r7, r7, -1 # mask for 52-exp lower bits and. r10,r10,r7 bc 12,2,Ret1 # beq: fractional part zero # Fractional part is nonzero, subtract 1 from rounded result Correct1: lwz r3, 4(r11) addi r3, r3, -1 blr Ret1: lwz r3, 4(r11) blr ########################## # CEIL # Ceil with change of FPSCR lower bits csect my_ceil1[DS] dc.l .my_ceil1[PR] dc.l TOC[tc0] csect .my_ceil1[PR] my_ceil1: # Get 64 bits aligned address and 8 bytes on the stack into r12 addi r12, r1, -15 rlwinm r12, r12, 0, 0, 28 # Save FPSCR at r12 address mffs fp13 stfd fp13, 0(r12) # If lower 2 bits already 11 round directly lwz r11, 4(r12) # stwx r11, 8(r12) # mr r10, r11 rlwinm r10, r11, 0, 30, 31 cmpi cr7,1,r10,2 bc 12,30, Round2 # beq Round2 # Set lower 2 bits of FPSCR to 10 (round towards +INF) addi r9,0,3 andc r11,r11,r9 ori r11,r11,2 stw r11, 4(r12) lfd fp12, 0(r12) mtfsf 255, fp12 fctiw fp11, fp1 stfd fp11, -8(r12) lwz r3, -4(r12) # Restore FPSCR # ldwx r10, 8(r12) # stwx r10, 4(r12) # lfd fp13, 0(r12) mtfsf 255, fp13 blr Round2: fctiw fp11, fp1 stfd fp11, -8(r12) lwz r3, -4(r12) blr # Ceil without change of FPSCR lower bits csect my_ceil2[DS] dc.l .my_ceil2[PR] dc.l TOC[tc0] csect .my_ceil2[PR] my_ceil2: # Load 1/-1 and 0 into a Fpr IFDEF Flt_Cst lwz r10, my_const[TC](r2) # r10 points to constant 0.0 lfd fp12, Zero(r10) # fp12 = 0 ENDIF # Get 64 bits aligned address and 16 bytes on the stack into r11 addi r11, r1, -23 rlwinm r11, r11, 0, 0, 28 IFNDEF Flt_Cst # Set fp12 to 0 using integer registers and memory # addi r12, r0, 0 # r12 = 0 xor r12, r12, r12 # r12 = 0 stw r12, 0(r11) stw r12, 4(r11) lfd fp12, 0(r11) # = 0 ENDIF # Perform rounding fctiwz fp10, fp1 stfd fp10, 0(r11) # r11 points to rounded value # If rounded argument != argument and argument >0 add 1 # Check if argument is positive fcmpo cr6, fp1, fp12 bc 4,25,Ret2 # ble # Argument is positive and not zero. Check if the fractional part is nonzero addi r12,r11,8 # r12 points to original value stfd fp1, 0(r12) lwz r10, 0(r12) rlwinm r9,r10,12,21,31 addi r9, r9, -1023 # = exp # Handling of exponents <=-1 (0,0 <= float values < 1,0) # The result is always +1 cmpi cr7,0,r9,0 bc 4,28,PositExp2 addi r3, r0,1 blr PositExp2: cmpi cr5,0,r9,20 bc 4,20,LowPart2 # bge cr5 subfic r8,r9, 20 addi r7, r0,1 slw r7, r7, r8 addi r7, r7, -1 # mask for 20-exp lower bits and. r10,r10,r7 bc 4,2,Correct2 # bne: fractional part nonzero # Process all 32 bits of the lower part lwz r10, 4(r12) cmpi cr7,0,r10,0 bc 4,30,Correct2 # bne cr7 b Ret2 LowPart2: lwz r10, 4(r12) # Case were the complete 32 bits must be checked bc 12,21,Partly2 # bgt cr5 cmpi cr7, r10, 0 bc 12, 30, Ret2 # beq cr7 b Correct2 Partly2: subfic r8,r9, 52 # r8 < 32 addi r7, r0, 1 slw r7, r7, r8 addi r7, r7, -1 # mask for 52-exp lower bits and. r10,r10,r7 bc 12,2,Ret2 # beq: fractional part zero # Fractional part is nonzero, subtract 1 from rounded result Correct2: lwz r3, 4(r11) addi r3, r3, 1 blr Ret2: lwz r3, 4(r11) blr ########################## # ROUND TO NEAREST # Round with change of FPSCR lower bits csect my_round1[DS] dc.l .my_round1[PR] dc.l TOC[tc0] csect .my_round1[PR] my_round1: # Get 64 bits aligned address and 8 bytes on the stack into r12 addi r12, r1, -15 rlwinm r12, r12, 0, 0, 28 # Save FPSCR at r12 address mffs fp13 stfd fp13, 0(r12) # If lower 2 bits already 11 round directly lwz r11, 4(r12) # stwx r11, 8(r12) # mr r10, r11 rlwinm r10, r11, 0, 30, 31 cmpi cr7,1,r10,0 bc 12,30, Round3 # beq Round2 # Set lower 2 bits of FPSCR to 00 (round to nearest) addi r9,0,3 andc r11,r11,r9 stw r11, 4(r12) lfd fp12, 0(r12) mtfsf 255, fp12 fctiw fp11, fp1 stfd fp11, -8(r12) lwz r3, -4(r12) # Restore FPSCR # ldwx r10, 8(r12) # stwx r10, 4(r12) # lfd fp13, 0(r12) mtfsf 255, fp13 blr Round3: fctiw fp11, fp1 stfd fp11, -8(r12) lwz r3, -4(r12) blr # Round without change of FPSCR lower bits # Note : to make things simpler, 0,5 is rounded towards higher mantissa (away from zero) # integer value(negative -> down and positive -> up) csect my_round2[DS] dc.l .my_round2[PR] dc.l TOC[tc0] csect .my_round2[PR] my_round: # Load 1/-1 and 0 into a Fpr IFDEF Flt_Cst lwz r10, my_const[TC](r2) # r10 points to constant 0.0 lfd fp12, Zero(r10) # fp12 = 0 ENDIF # Get 64 bits aligned address and 16 bytes on the stack into r11 addi r11, r1, -23 rlwinm r11, r11, 0, 0, 28 IFNDEF Flt_Cst # Set fp12 to 0 using integer registers and memory # addi r12, r0, 0 # r12 = 0 xor r12, r12, r12 # r12 = 0 stw r12, 0(r11) stw r12, 4(r11) lfd fp12, 0(r11) # = 0 ENDIF # Perform rounding fctiwz fp10, fp1 stfd fp10, 0(r11) # r11 points to rounded value # Check if the first bit of the fractional part is 1 addi r12,r11,8 # r12 points to original value stfd fp1, 0(r12) lwz r10, 0(r12) rlwinm r9,r10,12,21,31 addi r9, r9, -1023 # = exp # Handling of exponents <=-1 (1,0 < float values < 1,0) cmpi cr7,0,r9,0 bc 4,28,PositExp3 # If exp = -1 rounded val is +-1 else is 0 addi r8,r9, 2 # <= 1 srawi r7,r8,31 # if <0 = 0xFFFFFFF other =0 andc r6,r8,r7 # is now 0 or 1 (if exp <-1 resp =-1) # Add sign of argument srawi r9,r10,31 # -1 if arg <0 0 otherwise slwi r10,r9,1 addi r10,r10,1 mullw r3,r10,r6 blr PositExp3: cmpi cr5,0,r9,20 bc 4,20,LowPart3 # bge cr5 subfic r8,r9, 19 addi r7, r0,1 slw r7, r7, r8 # mask for bit 20-exp-1 (start with 0) and. r10,r10,r7 bc 4,2,Correct3 # bne: fractional first bit nonzero b Ret3 LowPart3: # Exponent is >=20, first bit of fractional part is in lower 32 bits lwz r10, 4(r12) subfic r8,r9, 51 # r8 < 32 addi r7, r0, 1 slw r7, r7, r8 # mask for bit 52-exp (start with 0) and. r10,r10,r7 bc 12,2,Ret3 Correct3: # Fractional part>0,5. If argument positive, add 1, if negative subtract 1 lwz r3, 4(r11) # Check if argument is positive (or 0) fcmpo cr6, fp1, fp12 bc 4,24,Correct4 # bge cr6 # Argument is negative addi r3, r3, -1 blr Correct4: addi r3, r3, 1 blr Ret3: lwz r3, 4(r11) blr ########################## # TEST ROUTINES # Routine returning the rounding selection of FPSCR csect rndsel[DS] dc.l .rndsel[PR] dc.l TOC[tc0] csect .rndsel[PR] rndsel: # Get 64 bits aligned address and 8 bytes on the stack into r12 addi r12, r1, -15 rlwinm r12, r12, 0, 0, 28 # Save FPSCR at r12 address mffs fp13 stfd fp13, 0(r12) # If lower 2 bits already 11 round directly lwz r11, 4(r12) # stwx r11, 8(r12) # mr r10, r11 rlwinm r3, r11, 0, 30, 31 blr # Routine performing the rounding of a set of floating values csect rndarray[DS] dc.l .rndarray[PR] dc.l TOC[tc0] csect .rndarray[PR] rndarray: # Get 64 bits aligned address and 8 bytes on the stack into r12 addi r12, r1, -15 rlwinm r12, r12, 0, 0, 28 # Save FPSCR at r12 address mffs fp13 stfd fp13, 0(r12) # Set lower 2 bits of FPSCR to 00 (round to nearest) lwz r11, 4(r12) addi r9,0,3 andc r11,r11,r9 stw r11, 4(r12) lfd fp12, 0(r12) mtfsf 255, fp12 # Convert array mtctr r3 xor r9,r9,r9 xor r10,r10,r10 loop: lfdx fp10,r9,r4 fctiw fp9,fp10 stfd fp9,-8(r12) lwz r8,-4(r12) stwx r8,r10,r5 addi r9,r9,8 addi r10,r10,4 bc 17,0,loop # Dec ctr, branch if /=0 # Restore FPSCR # ldwx r10, 8(r12) # stwx r10, 4(r12) # lfd fp13, 0(r12) mtfsf 255, fp13 blr End