#*****
#
# Michael Riff
# Multiprecision division, version 1.0, February 2022.
# 96/64 and 128/64 bit divisions implemented as in the CWG.
#
# March 2022: clearer naming of the routines and reordering across the files:
#   div.s:  complete algorithms with no value limitation: 64:64, 96:64,
#           128:64 and 128:16 bit divisions.
#   div2.s: optimised versions for a divisor whose highest bit is 0: 96:63 and
#           128:63 bit divisions. In those cases the upper 32 and 64 bits of the
#           shifted dividend are always 0, which saves a couple of instructions.
#           Added a 128:95 division, in which the upper 32 bits of the shifted
#           dividend are always 0.
#
# April 2024: simplified the computation of the number of bits to shift.
#
#****/

    dialect PowerPC

    file 'div2.s'

# Only the sections can be exported (does not work with labels)
#   export div96_63[DS]
    export .div96_63[PR] => 'div_96_63'
#   export div128_63[DS]
    export .div128_63[PR] => 'div_128_63'
    export .div128_95[PR] => 'div_128_95'

# TOC and transition vectors are only necessary if cross-TOC calls are
# implemented (separate code fragments)
    toc
div_96_63:  tc div96_63[TC], div96_63[DS]
div_128_63: tc div128_63[TC], div128_63[DS]
div_128_95: tc div128_95[TC], div128_95[DS]

linkageArea:      equ 24
calleesParams:    set 0
calleesLocalVars: set 0
numFPRs:          set 0

# For a 96 bit dividend and a 64 bit divisor whose highest bit is not set
#-------------------------------------------------------------------------
    csect div96_63[DS]
# This routine uses non volatile registers R31, R30 and R13, (R14) (pure algorithm), CTR
# and CR fields 2,3
numGPRs:     set 7
spaceToSave: set linkageArea + calleesParams + calleesLocalVars + 4*numGPRs + 8*numFPRs
# Stack must be 16 (OS X) or 8 byte aligned
# spaceToSaveAligned set ((linkageArea + calleesParams + calleesLocalVars + 4*numGPRs + 8*numFPRs+15) & (-16))
spaceToSaveAligned set ((linkageArea + calleesParams + calleesLocalVars + 4*numGPRs + 8*numFPRs+7) & (-8))

    csect .div96_63[PR]
# Code section for div96_63

# Save used non volatile registers
    stw    R30,-4(R1)
    stw    R31,-8(R1)
    mfctr  r31
    stw    r31,-12(SP)
    mfcr   r30
#   stw    r30,36(SP)
    stw    r30,4(SP)                   # Condition register may be saved in the linkage area
    stwu   SP,-spaceToSaveAligned(SP)  # Optional as leaf routine

# Store the addresses of the results
    mr     r31,r6         # Address of remainder
    mr     r30,r5         # Address of quotient
# R6 is now free. Load the divisor into R6 & R7
    lwz    r6,0(r4)
    lwz    r7,4(r4)
# R4 is now free. Load the dividend into R3, R4 & R5
    lwz    r5,8(r3)
    lwz    r4,4(r3)
    lwz    r3,0(r3)

# Algorithm
#----------
# (R3:R4:R5) = (R3:R4:R5) / (R6:R7)    (96b) = (96b / 64b)
#    quo          dvd         dvs
#
# Remainder is returned in R6:R7.
#
# Code comment notation:
#   msw  = most-significant (high-order) word, i.e. bits 0..31
#   msw2 = most-significant (middle-order) word, i.e. bits 32..63
#   lsw  = least-significant (low-order) word, i.e. bits 64..95, or 32..63 for the 64 bit divisor
#   LZ   = Leading Zeroes
#   SD   = Significant Digits
#
# R3:R4:R5 = dvd (input dividend); quo (output quotient)
# R6:R7    = dvs (input divisor);  rem (output remainder)
#
# R8:R9:R10 = tmp
# Note: after alignment of the most significant digits between dividend and divisor,
# register R8 will always be 0.

# The algorithm uses non volatile registers R13 to R15
# Save them
    stw    R13,-4(R1)
    stw    R14,-8(R1)
# Option: use a store-multiple (replace R11->R29 .. R15->R25)
#   stmw   R27,0(R1)
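#--------------------------------------------------------------------------
# Illustrative reference only (not part of the build): a minimal C sketch of
# what div_96_63 computes, assuming the pointer-based calling convention
# visible above (r3 = dividend, r4 = divisor, r5 = quotient, r6 = remainder,
# all big-endian word arrays). The name div96_63_ref and the prototypes are
# hypothetical. The sketch relies on the divisor being below 2^63, which is
# exactly the restriction this file exploits.
#
#   #include <stdint.h>
#
#   /* possible C prototypes for the exported entry points */
#   void div_96_63 (const uint32_t dvd[3], const uint32_t dvs[2],
#                   uint32_t quo[3], uint32_t rem[2]);
#   void div_128_63(const uint32_t dvd[4], const uint32_t dvs[2],
#                   uint32_t quo[4], uint32_t rem[2]);
#   void div_128_95(const uint32_t dvd[4], const uint32_t dvs[3],
#                   uint32_t quo[4], uint32_t rem[3]);
#
#   /* bit-at-a-time restoring division, 96-bit dividend / 63-bit divisor */
#   static void div96_63_ref(const uint32_t dvd[3], const uint32_t dvs[2],
#                            uint32_t quo[3], uint32_t rem[2])
#   {
#       uint64_t d = ((uint64_t)dvs[0] << 32) | dvs[1];   /* d < 2^63     */
#       uint64_t r = 0;                                   /* partial rem  */
#       quo[0] = quo[1] = quo[2] = 0;
#       for (int i = 0; i < 96; i++) {
#           int bit = (dvd[i / 32] >> (31 - i % 32)) & 1;
#           r = (r << 1) | bit;           /* bring down the next dvd bit  */
#           if (r >= d) {                 /* restoring step               */
#               r -= d;
#               quo[i / 32] |= 1u << (31 - i % 32);
#           }
#       }
#       rem[0] = (uint32_t)(r >> 32);
#       rem[1] = (uint32_t)r;
#   }
#--------------------------------------------------------------------------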
# Count the number of leading 0s in the divisor (R11)
    cmpwi  cr0,R6,0       # dvs.msw == 0?
    cntlzw R11,R6         # R11 = dvs.msw.LZ
    cntlzw R12,R7         # R12 = dvs.lsw.LZ
    bne    cr0,lab96_1    # if(dvs.msw != 0) dvs.LZ = dvs.msw.LZ
    addi   R11,R12,32     # dvs.LZ = dvs.lsw.LZ + 32
lab96_1:
    addi   r11,r11,32     # Extend the divisor to 3 words

# Count the number of leading 0s in the dividend (R0)
    cmpwi  cr0,R3,0       # dvd.msw == 0?
    cmpwi  cr2,R4,0       # dvd.msw2 == 0?
    cntlzw R0,R3          # R0 = dvd.msw.LZ
    bne    cr0,lab96_2    # if(dvd.msw != 0) dvd.LZ = dvd.msw.LZ
# Dividend has 32 <= LZ
    cntlzw R12,R4         # R12 = dvd.msw2.LZ
    addi   R0,R12,32      # dvd.LZ = dvd.msw2.LZ + 32
    bne    cr2,lab96_2    # if(dvd.msw2 != 0) dvd.LZ = 32 + dvd.msw2.LZ
# Dividend has 64 <= LZ
    cntlzw R12,R5         # R12 = dvd.lsw.LZ
    addi   R0,R12,64      # dvd.LZ = dvd.lsw.LZ + 64
lab96_2:

# Determine the shift amounts to minimise the number of iterations
#   cmpw   cr0,R0,R11     # compare dvd.LZ to dvs.LZ
#   subfic R12,R0,96      # R12 = dvd.SD
#   bgt    cr0,lab96_20   # if(dvs > dvd) quotient = 0
#   addi   R11,R11,1      # ++dvs.LZ (or --dvs.SD)
#   subfic R11,R11,96     # R11 = dvs.SD
#   add    R0,R0,R11      # (dvd.LZ + dvs.SD) = left shift of dvd for initial dvd
## Perform the comparisons for the coming algorithm here
#   cmpwi  cr4,R0,32      # compare R0 to 32
#   cmpwi  cr5,R0,64      # compare R0 to 64
#   subf   R11,R11,R12    # (dvd.SD - dvs.SD) = right shift of dvd for initial tmp
#   cmpwi  cr6,R11,32     # compare R11 to 32
#   cmpwi  cr7,R11,64     # compare R11 to 64
#   mtctr  R11            # number of iterations = dvd.SD - dvs.SD
# Optimisation
    cmpw   cr0,r0,r11     # compare dvd.LZ to dvs.LZ
    addi   r11,r11,1      # ++dvs.LZ (or --dvs.SD) as the shift is
                          # done at the beginning of the division loop
    subf   r11,r0,r11     # (dvs.LZ - dvd.LZ + 1) = right shift of dvd for initial tmp
    bgt    cr0,lab96_20   # if(dvs > dvd) quotient = 0
# Perform the comparisons for the coming algorithm here
    cmpwi  cr6,R11,32     # compare R11 to 32
    cmpwi  cr7,R11,64     # compare R11 to 64
    subfic r0,r11,96      # 96 - right shift = left shift of dvd for initial dvd
    mtctr  r11            # number of iterations = dvs.LZ - dvd.LZ + 1
    cmpwi  cr4,R0,32      # compare R0 to 32
    cmpwi  cr5,R0,64      # compare R0 to 64
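#--------------------------------------------------------------------------
# Illustrative sketch (C fragment, not part of the build) of the shift-amount
# computation above. It mirrors the "Optimisation" sequence: the divisor's
# leading-zero count is incremented once because the loop shifts before it
# subtracts, so the loop runs dvs.LZ - dvd.LZ + 1 times. The helper lz96 is
# hypothetical.
#
#   /* leading zeros of a 96-bit value held in three big-endian words */
#   static int lz96(const uint32_t w[3])
#   {
#       if (w[0]) return __builtin_clz(w[0]);
#       if (w[1]) return 32 + __builtin_clz(w[1]);
#       if (w[2]) return 64 + __builtin_clz(w[2]);
#       return 96;
#   }
#
#   int dvd_lz = lz96(dvd);                              /* 0..96 */
#   int dvs_lz = 32 + (dvs[0] ? __builtin_clz(dvs[0])
#                             : 32 + __builtin_clz(dvs[1]));
#   if (dvd_lz > dvs_lz)                                 /* dvs > dvd */
#       goto quotient_zero;
#   int rshift     = dvs_lz + 1 - dvd_lz;                /* builds initial tmp */
#   int lshift     = 96 - rshift;                        /* builds initial dvd */
#   int iterations = rshift;                             /* one quotient bit per pass */
#--------------------------------------------------------------------------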
# R8:R9:R10 = R3:R4:R5 >> R11
# From here we distinguish 3 cases:
#   Shift < 32:        every register is shifted right by R11 and the R11 low-order
#                      bits of the register above are transferred into it.
#   32 <= Shift < 64:  the lowest register is pushed out. The data are shifted by
#                      R11-32, skipping one register.
#   otherwise:         the 2 lower registers are pushed out. Only the R11-64 upper
#                      bits of the highest register go into the lowest one.
# The left shift of the 3 dvd registers depends on the right shift in the same way:
#   shift left >= 64 bits
#   shift left 32 excl. to 64 incl. bits
#   shift left <= 32 bits
    blt    cr6,lab96_4    # if(R11 < 32) jump to lab96_4
    blt    cr7,lab96_5    # if(R11 < 64) jump to lab96_5
    b      lab96_6        # R11 >= 64

lab96_4:                  # First case
    srw    R10,R5,R11     # R10 = dvd.lsw >> R11
    subfic R12,R11,32
    slw    R13,R4,R12     # R13 = dvd.msw1 << 32 - R11
    or     R10,R10,R13    # tmp.lsw = R13 | R10
    srw    R9,R4,R11      # R9 = dvd.msw1 >> R11
    slw    R13,R3,R12     # R13 = dvd.msw << 32 - R11
    or     R9,R9,R13      # tmp.msw1 = R13 | R9
# R8 will be 0
#   srw    R8,R3,R11      # tmp.msw = dvd.msw >> R11
    b      lab96_7

lab96_5:                  # Second case
    subfic R12,R11,64     # R12 = 64 - R11
    addi   R11,R11,-32    # R11 = R11 - 32
    srw    R10,R4,R11     # R10 = dvd.msw1 >> R11 - 32
    slw    R13,R3,R12     # R13 = dvd.msw << 64 - R11
    or     R10,R13,R10    # tmp.lsw = R13 | R10
    srw    R9,R3,R11      # tmp.msw1 = dvd.msw >> R11 - 32
# R8 will be 0
#   li     R8,0
    b      lab96_7

lab96_6:                  # Third case
    addi   R12,R11,-64
    srw    R10,R3,R12     # R10 = dvd.msw >> R11 - 64
# R8 will be 0
#   li     R8,0
    li     R9,0

lab96_7:
# R3:R4:R5 = R3:R4:R5 << R0
# Same 3 cases as before
    blt    cr4,lab96_8    # if(R0 < 32) jump to lab96_8
    blt    cr5,lab96_9    # if(R0 < 64) jump to lab96_9
    b      lab96_10       # R0 >= 64

lab96_8:                  # Shift R3:R4:R5 left by less than 32 bits
    slw    R3,R3,R0       # R3 = dvd.msw << R0
    subfic R11,R0,32
    srw    R12,R4,R11     # R12 = dvd.msw1 >> 32 - R0
    or     R3,R3,R12      # dvd.msw = R3 | R12
    slw    R4,R4,R0       # dvd.msw1 = dvd.msw1 << R0
    srw    R12,R5,R11     # R12 = dvd.lsw >> 32 - R0
    or     R4,R4,R12      # dvd.msw1 = R4 | R12
    slw    R5,R5,R0       # dvd.lsw = dvd.lsw << R0
    b      lab96_11

lab96_9:                  # Shift R3:R4:R5 left by 32 incl. to 64 excl. bits
    subfic R13,R0,64
    addic  R11,R0,-32     # Do not use addi: RA=R0 would be read as 0, giving R11 = -32
    slw    R3,R4,R11      # R3 = dvd.msw1 << R0 - 32
    srw    R12,R5,R13     # R12 = dvd.lsw >> 64 - R0
    or     R3,R3,R12      # R3 = R3 | R12
    slw    R4,R5,R11      # R4 = R5 << R0 - 32
    li     R5,0
    b      lab96_11

lab96_10:                 # Shift R3:R4:R5 left by 64 bits or more
    addic  R13,R0,-64     # Do not use addi: RA=R0 would be read as 0, giving R13 = -64
    slw    R3,R5,R13
    li     R5,0
    li     R4,0

lab96_11:
# Restoring division shift and subtract loop
    li     R11,-1         # R11 = -1
    addic  R7,R7,0        # clear the carry bit before the loop starts
lab96_12:
# tmp:dvd is considered one large register.
# Each portion is shifted left 1 bit by adding it to itself;
# adde adds the carry from the previous word and creates a new carry.
    adde   R5,R5,R5       # shift tmp.lsw3 left 1 bit
    adde   R4,R4,R4       # shift tmp.lsw2 left 1 bit
    adde   R3,R3,R3       # shift tmp.lsw1 left 1 bit
    adde   R10,R10,R10    # shift tmp.msw3 left 1 bit
    adde   R9,R9,R9       # shift tmp.msw2 left 1 bit
# R8 will be 0
#   adde   R8,R8,R8       # shift tmp.msw1 left 1 bit
    subfc  R0,R7,R10      # tmp.lsw - dvs.lsw
    subfe. R12,R6,R9      # tmp.msw - dvs.msw
    blt    cr0,lab96_13   # if(result < 0) leave the carry bit cleared
    mr     R10,R0         # move lsw
    mr     R9,R12         # move msw
    addic  R0,R11,1       # set the carry bit
lab96_13:
    bdnz   lab96_12

# Write quotient and remainder
    adde   R5,R5,R5       # quo.lsw  (lsb = CA)
    adde   R4,R4,R4       # quo.msw1 (lsb = CA)
    adde   R3,R3,R3       # quo.msw  (lsb from lsw)
    mr     R7,R10         # rem.lsw
    mr     R6,R9          # rem.msw
#   blr                   # return
    b      finish_96
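#--------------------------------------------------------------------------
# Illustrative sketch (C fragment, not part of the build) of one pass of the
# loop above, using hypothetical variables: dvd is the 96-bit shifted
# dividend, tmp and dvs are 64-bit (tmp stays below 2^64 because dvs < 2^63).
# The adde chain performs one wide left shift of tmp:dvd while the carry
# feeds the previous quotient bit into dvd's low end; subfc/subfe. do the
# trial subtraction and the branch keeps or discards it (restoring step).
#
#   for (int i = 0; i < iterations; i++) {
#       int carry    = quotient_bit;               /* CA from the previous pass  */
#       quotient_bit = 0;
#       tmp = (tmp << 1) | (uint64_t)(dvd >> 95);  /* adde chain: one wide shift */
#       dvd = (dvd << 1) | carry;
#       if (tmp >= dvs) {                          /* subfc/subfe. plus blt      */
#           tmp -= dvs;
#           quotient_bit = 1;                      /* addic R0,R11,1 sets CA     */
#       }
#   }
#   dvd = (dvd << 1) | quotient_bit;               /* final adde triple          */
#   /* dvd now holds the 96-bit quotient, tmp the remainder */
#--------------------------------------------------------------------------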
lab96_20:                 # Quotient is 0 (dvs > dvd)
    mr     R7,R5          # rem.lsw = dvd.lsw
    mr     R6,R4          # rem.msw = dvd.msw1
    li     R5,0           # quo.lsw  = 0
    li     R4,0           # quo.msw1 = 0
    li     R3,0           # quo.msw  = 0
#   blr                   # return

finish_96:
# Restore non volatile registers
    lwz    R13,-4(R1)
    lwz    R14,-8(R1)
# Option: use a load-multiple (replace R11->R31 .. R15->R27)
#   lmw    R27,0(R1)

# Write the results back into memory
    stw    r3,0(r30)
    stw    r4,4(r30)
    stw    r5,8(r30)
    stw    r6,0(r31)
    stw    r7,4(r31)

# Restore non volatile registers
#   lwz    SP,0(SP)
    lwz    r31,spaceToSaveAligned+4(SP)
    mtcrf  2,r31
    mtcrf  4,r31
    lwz    r30,spaceToSaveAligned-12(SP)
    mtctr  r30
    lwz    r30,spaceToSaveAligned-4(r1)
    lwz    r31,spaceToSaveAligned-8(r1)
    addi   SP,SP,spaceToSaveAligned    # Optional as leaf routine
    blr

# For a 128 bit dividend and a 64 bit divisor whose highest bit is not set
#------------------------------------------------------------------------
    csect div128_63[DS]
# This routine uses non volatile registers R31, R30 and R29, R28, R27 (pure algorithm)
# and CR fields 2,3
numGPRs:     set 6
spaceToSave: set linkageArea + calleesParams + calleesLocalVars + 4*numGPRs + 8*numFPRs
# Stack must be 16 (OS X) or 8 byte aligned
# spaceToSaveAligned set ((linkageArea + calleesParams + calleesLocalVars + 4*numGPRs + 8*numFPRs+15) & (-16))
spaceToSaveAligned set ((linkageArea + calleesParams + calleesLocalVars + 4*numGPRs + 8*numFPRs+7) & (-8))

    csect .div128_63[PR]
# Code section for div128_63

# Save used non volatile registers
    stw    R30,-4(R1)
    stw    R31,-8(R1)
    mfcr   r30
    stw    r30,4(SP)                   # Condition register may be saved in the linkage area
    stwu   SP,-spaceToSaveAligned(SP)  # Optional as leaf routine

# Store the addresses of the results
    mr     r31,r6         # Address of remainder
    mr     r30,r5         # Address of quotient
# Load the divisor into R7 & R8
    lwz    r7,0(r4)
    lwz    r8,4(r4)
# R6 & R4 are now free. Load the dividend into R3, R4, R5 & R6
    lwz    r6,12(r3)
    lwz    r5,8(r3)
    lwz    r4,4(r3)
    lwz    r3,0(r3)

# Algorithm
#----------
# (R3:R4:R5:R6) = (R3:R4:R5:R6) / (R7:R8)    (128b) = (128b / 64b)
#      quo             dvd          dvs
#
# Remainder is returned in R7:R8.
#
# Code comment notation:
#   msw  = most-significant (high-order) word, i.e. bits 0..31
#   msw2 = most-significant (middle-order) word, i.e. bits 32..63
#   lsw1 = least-significant (low-order) word, i.e. bits 64..95
#   lsw  = least-significant (low-order) word, i.e. bits 96..127, or 32..63 for the 64 bit divisor
#   LZ   = Leading Zeroes
#   SD   = Significant Digits
#
# R3:R4:R5:R6 = dvd (input dividend); quo (output quotient)
# R7:R8       = dvs (input divisor);  rem (output remainder)
#
# R9:R10:R11:R12 = tmp
# Note: after alignment of the most significant digits between dividend and divisor,
# registers R9 and R10 will always be 0.

# The algorithm uses non volatile registers R27 to R31
# Save them
    stw    R27,-4(R1)
    stw    R28,-8(R1)
    stw    R29,-12(R1)
#   stw    R30,-16(R1)    done before
#   stw    R31,-18(R1)    done before
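#--------------------------------------------------------------------------
# Illustrative sketch (C, not part of the build) of the word-granular
# variable shifts used by these routines: the labSR*/labSL* blocks below
# (and the equivalent three-word blocks in div96_63 above) pick one of four
# cases by comparing the shift count with 32, 64 and 96, then combine
# srw/slw pairs. The helper name shr128 is hypothetical; words are
# big-endian (w[0] = most significant word).
#
#   static void shr128(uint32_t w[4], int n)   /* 0 <= n < 128 */
#   {
#       int words = n / 32, bits = n % 32;
#       uint32_t t[4] = { 0, 0, 0, 0 };
#       for (int i = 3; i >= words; i--) {
#           t[i] = w[i - words] >> bits;       /* whole-word move          */
#           if (bits && i - words - 1 >= 0)    /* bits crossing a word     */
#               t[i] |= w[i - words - 1] << (32 - bits);
#       }
#       for (int i = 0; i < 4; i++) w[i] = t[i];
#   }
#--------------------------------------------------------------------------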
# Count the number of leading 0s in the divisor (R29)
    cmpwi  cr0,R7,0       # dvs.msw == 0?
    cntlzw R29,R7         # R29 = dvs.msw.LZ
    cntlzw R12,R8         # R12 = dvs.lsw.LZ
    bne    cr0,lab128_1   # if(dvs.msw != 0) dvs.LZ = dvs.msw.LZ
    addi   R29,R12,32     # dvs.LZ = dvs.lsw.LZ + 32
lab128_1:
    addi   r29,r29,64     # Extend the divisor to 4 words

# Count the number of leading 0s in the dividend (R0)
    cmpwi  cr0,R3,0       # dvd.msw == 0?
    cmpwi  cr2,R4,0       # dvd.msw2 == 0?
    cntlzw R0,R3          # R0 = dvd.msw.LZ
    bne    cr0,lab128_2   # if(dvd.msw != 0) dvd.LZ = dvd.msw.LZ
# Dividend has 32 <= LZ
    cmpwi  cr3,R5,0       # dvd.lsw1 == 0?
    cntlzw R12,R4         # R12 = dvd.msw2.LZ
    addi   R0,R12,32      # dvd.LZ = dvd.msw2.LZ + 32
    bne    cr2,lab128_2   # if(dvd.msw2 != 0) dvd.LZ = 32 + dvd.msw2.LZ
# Dividend has 64 <= LZ
    cntlzw R12,R5         # R12 = dvd.lsw1.LZ
    addi   R0,R12,64      # dvd.LZ = dvd.lsw1.LZ + 64
    bne    cr3,lab128_2   # if(dvd.lsw1 != 0) dvd.LZ = 64 + dvd.lsw1.LZ
# Dividend has LZ >= 96
    cntlzw R12,R6         # R12 = dvd.lsw.LZ
    addi   R0,R12,96      # dvd.LZ = dvd.lsw.LZ + 96
lab128_2:

# Determine the shift amounts to minimise the number of iterations
#   cmpw   cr0,R0,R29     # compare dvd.LZ to dvs.LZ
#   subfic R10,R0,128     # R10 = dvd.SD
#   bgt    cr0,lab128_14  # if(dvs > dvd) quotient = 0
#   addi   R29,R29,1      # ++dvs.LZ (or --dvs.SD)
#   subfic R29,R29,128    # R29 = dvs.SD
#   add    R0,R0,R29      # (dvd.LZ + dvs.SD) = left shift of dvd for initial dvd
#   subf   R29,R29,R10    # (dvd.SD - dvs.SD) = right shift of dvd for initial tmp
#   mtctr  R29            # number of iterations = dvd.SD - dvs.SD
# Optimisation
    cmpw   cr0,r0,r29     # compare dvd.LZ to dvs.LZ
    addi   r29,r29,1      # ++dvs.LZ (or --dvs.SD) as the shift is
                          # done at the beginning of the division loop
    subf   r29,r0,r29     # (dvs.LZ - dvd.LZ + 1) = right shift of dvd for initial tmp
    bgt    cr0,lab128_14  # if(dvs > dvd) quotient = 0
    subfic r0,r29,128     # 128 - right shift = left shift of dvd for initial dvd
    mtctr  r29            # number of iterations = dvs.LZ - dvd.LZ + 1

# R9:R10:R11:R12 = R3:R4:R5:R6 >> r29
# Shifting the 4 registers right by r29
#---------------------------------------
    cmpwi  cr7,r29,96     # compare r29 to 96
    cmpwi  cr6,r29,64     # compare r29 to 64
    cmpwi  cr5,r29,32     # compare r29 to 32
    bge    cr7,labSR3
    bge    cr6,labSR2
    bge    cr5,labSR1
# Shift < 32 bits
    srw    r12,r6,r29
    subfic r28,r29,32     # r28 = 32 - r29
    slw    r27,r5,r28
    or     r12,r27,r12
    srw    r11,r5,r29
    slw    r27,r4,r28
    or     r11,r27,r11
# R9 and R10 will be 0
#   srw    r10,r4,r29
#   slw    r27,r3,r28
#   or     r10,r27,r10
#   srw    r9,r3,r29
    b      lab128_7

labSR1:                   # 32 <= Shift < 64
    subfic r28,r29,64     # r28 = 64 - r29
    addi   r29,r29,-32    # r29 = r29 - 32
    srw    r12,r5,r29
    slw    r27,r4,r28
    or     r12,r27,r12
    srw    r11,r4,r29
    slw    r27,r3,r28
    or     r11,r27,r11
# R9 and R10 will be 0
#   srw    r10,r3,r29
#   li     r9,0
    b      lab128_7

labSR2:                   # 64 <= Shift < 96
    subfic r28,r29,96     # r28 = 96 - r29
    addi   r29,r29,-64    # r29 = r29 - 64
    srw    r12,r4,r29
    slw    r27,r3,r28
    or     r12,r27,r12
    srw    r11,r3,r29
# R9 and R10 will be 0
#   li     R10,0
#   li     R9,0
    b      lab128_7

labSR3:                   # 96 <= Shift
    addi   r29,r29,-96    # r29 = r29 - 96
    srw    r12,r3,r29
    li     r11,0
# R9 and R10 will be 0
#   li     r10,0
#   li     r9,0

lab128_7:
# R3:R4:R5:R6 = R3:R4:R5:R6 << R0
# Shifting the 4 registers left by R0
#---------------------------------------
    cmpwi  cr7,R0,96      # compare R0 to 96
    cmpwi  cr6,R0,64      # compare R0 to 64
    cmpwi  cr5,R0,32      # compare R0 to 32
    bge    cr7,labSL3
    bge    cr6,labSL2
    bge    cr5,labSL1
# Shift < 32 bits
    slw    r3,r3,r0
    subfic r28,r0,32      # r28 = 32 - r0
    srw    r27,r4,r28
    or     r3,r3,r27
    slw    r4,r4,r0
    srw    r27,r5,r28
    or     r4,r4,r27
    slw    r5,r5,r0
    srw    r27,r6,r28
    or     r5,r5,r27
    slw    r6,r6,r0
    b      lab128_11

labSL1:                   # 32 <= Shift < 64
    subfic r28,r0,64      # r28 = 64 - r0
    addic  r0,r0,-32      # r0 = r0 - 32; do not use addi: RA=R0 would be read as 0, giving r0 = -32
    slw    r3,r4,r0
    srw    r27,r5,r28
    or     r3,r3,r27
    slw    r4,r5,r0
    srw    r27,r6,r28
    or     r4,r4,r27
    slw    r5,r6,r0
    li     r6,0
    b      lab128_11

labSL2:                   # 64 <= Shift < 96
    subfic r28,r0,96      # r28 = 96 - r0
    addic  r0,r0,-64      # r0 = r0 - 64; do not use addi: RA=R0 would be read as 0, giving r0 = -64
    slw    r3,r5,R0
    srw    r27,r6,r28
    or     r3,r3,r27
    slw    r4,r6,R0
    li     r5,0
    li     r6,0
    b      lab128_11

labSL3:                   # 96 <= Shift
    addic  r0,r0,-96      # r0 = r0 - 96; do not use addi: RA=R0 would be read as 0, giving r0 = -96
    slw    r3,r6,r0
    li     R4,0
    li     R5,0
    li     R6,0

lab128_11:
# Restoring division shift and subtract loop
    li     R29,-1         # R29 = -1
    addic  R7,R7,0        # clear the carry bit before the loop starts
lab128_12:
# tmp:dvd is considered one large register.
# Each portion is shifted left 1 bit by adding it to itself;
# adde adds the carry from the previous word and creates a new carry.
    adde   R6,R6,R6       # shift dvd.lsw4 left 1 bit
    adde   R5,R5,R5       # shift dvd.lsw3 left 1 bit
    adde   R4,R4,R4       # shift dvd.lsw2 left 1 bit
    adde   R3,R3,R3       # shift dvd.lsw1 left 1 bit
    adde   R12,R12,R12    # shift tmp.msw4 left 1 bit
    adde   R11,R11,R11    # shift tmp.msw3 left 1 bit
# R9 and R10 will be 0
#   adde   R10,R10,R10    # shift tmp.msw2 left 1 bit
#   adde   R9,R9,R9       # shift tmp.msw1 left 1 bit
    subfc  R0,R8,R12      # tmp.lsw - dvs.lsw
    subfe. R28,R7,R11     # tmp.msw - dvs.msw
    blt    cr0,lab128_13  # if(result < 0) leave the carry bit cleared
    mr     R12,R0         # move lsw
    mr     R11,R28        # move msw
    addic  R0,R29,1       # set the carry bit
lab128_13:
    bdnz   lab128_12

# Write quotient and remainder
    adde   R6,R6,R6       # quo.lsw  (lsb = CA)
    adde   R5,R5,R5       # quo.lsw1 (lsb = CA)
    adde   R4,R4,R4       # quo.msw2 (lsb = CA)
    adde   R3,R3,R3       # quo.msw  (lsb from lsw)
#   mr     R8,R12         # rem.lsw
#   mr     R7,R11         # rem.msw
    b      finish_128
#   blr                   # return

lab128_14:                # Quotient is 0 (dvs > dvd)
    mr     R12,R6         # rem.lsw = dvd.lsw
    mr     R11,R5         # rem.msw = dvd.lsw1
    li     R6,0           # quo.lsw  = 0
    li     R5,0           # quo.lsw1 = 0
    li     R4,0           # quo.msw2 = 0
    li     R3,0           # quo.msw  = 0

finish_128:
# Write the results back into memory
    stw    r11,0(r31)
    stw    r12,4(r31)
    stw    r3,0(r30)
    stw    r4,4(r30)
    stw    r5,8(r30)
    stw    r6,12(r30)
# Restore non volatile registers
    lwz    R27,-4(R1)
    lwz    R28,-8(R1)
    lwz    R29,-12(R1)
#   lwz    R30,-16(R1)
#   lwz    R31,-18(R1)
# Restore non volatile registers
    lwz    r31,spaceToSaveAligned+4(SP)
    mtcrf  2,r31
    mtcrf  3,r31
    lwz    r30,spaceToSaveAligned-4(r1)
    lwz    r31,spaceToSaveAligned-8(r1)
    addi   SP,SP,spaceToSaveAligned    # Optional as leaf routine
    blr
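#--------------------------------------------------------------------------
# Illustrative usage/cross-check sketch (C, not part of the build), assuming
# the hypothetical prototypes given near the top of this file and a host
# compiler with unsigned __int128; div128_63_check is a hypothetical test
# helper, not part of this library.
#
#   static int div128_63_check(const uint32_t dvd[4], const uint32_t dvs[2])
#   {
#       uint32_t quo[4], rem[2];
#       unsigned __int128 n = 0, q = 0;
#       uint64_t d = ((uint64_t)dvs[0] << 32) | dvs[1];   /* d < 2^63 */
#
#       div_128_63(dvd, dvs, quo, rem);                   /* routine under test */
#
#       for (int i = 0; i < 4; i++) n = (n << 32) | dvd[i];
#       for (int i = 0; i < 4; i++) q = (q << 32) | quo[i];
#       uint64_t r = ((uint64_t)rem[0] << 32) | rem[1];
#       return q == n / d && r == n % d;                  /* 1 if results agree */
#   }
#--------------------------------------------------------------------------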
# For a 128 bit dividend and a 96 bit divisor whose highest bit is not set
#------------------------------------------------------------------------
    csect div128_95[DS]
# This routine uses non volatile registers R31, R30 and R29, R28, R27 (pure algorithm)
# and CR fields 2,3
numGPRs:     set 6
spaceToSave: set linkageArea + calleesParams + calleesLocalVars + 4*numGPRs + 8*numFPRs
# Stack must be 16 (OS X) or 8 byte aligned
# spaceToSaveAligned set ((linkageArea + calleesParams + calleesLocalVars + 4*numGPRs + 8*numFPRs+15) & (-16))
spaceToSaveAligned set ((linkageArea + calleesParams + calleesLocalVars + 4*numGPRs + 8*numFPRs+7) & (-8))

    csect .div128_95[PR]
# Code section for div128_95

# Save used non volatile registers
    stw    R30,-4(r1)
    stw    R31,-8(r1)
    mfcr   r30
    stw    r30,4(SP)      # Condition register may be saved in the linkage area

# Store the addresses of the results
    mr     r31,r6         # Address of remainder
    mr     r30,r5         # Address of quotient
# Load the divisor into R7, R8 & R9
    lwz    r7,0(r4)
    lwz    r8,4(r4)
    lwz    r9,8(r4)
# R6 & R4 are now free. Load the dividend into R3, R4, R5 & R6
    lwz    r6,12(r3)
    lwz    r5,8(r3)
    lwz    r4,4(r3)
    lwz    r3,0(r3)

# Algorithm
#----------
# (R3:R4:R5:R6) = (R3:R4:R5:R6) / (R7:R8:R9)    (128b) = (128b / 96b)
#      quo             dvd           dvs
#
# Remainder is returned in R10:R11:R12.
#
# Code comment notation:
#   msw  = most-significant (high-order) word, i.e. bits 0..31
#   msw2 = most-significant (middle-order) word, i.e. bits 32..63
#   lsw1 = least-significant (low-order) word, i.e. bits 64..95
#   lsw  = least-significant (low-order) word, i.e. bits 96..127, or 64..95 for the 96 bit divisor
#   LZ   = Leading Zeroes
#   SD   = Significant Digits
#
# R3:R4:R5:R6 = dvd (input dividend); quo (output quotient)
# R7:R8:R9    = dvs (input divisor);  rem (output remainder)
#
# R9:R10:R11:R12 = tmp
# Note: after alignment of the most significant digits between dividend and divisor,
# register R9 would always be 0; it is therefore reused to hold dvs.lsw, and tmp
# effectively occupies R10:R11:R12.

# The algorithm uses non volatile registers R27 to R31
# Save them
    stw    r27,-12(r1)
    stw    r28,-16(r1)
    stw    r29,-20(r1)

# Count the number of leading 0s in the divisor (R29)
    cmpwi  cr5,r7,0       # dvs.msw == 0?
    cmpwi  cr6,r8,0       # dvs.msw2 == 0?
    cntlzw r29,r7         # R29 = dvs.msw.LZ
    bne    cr5,lab128_2_2_1   # if(dvs.msw != 0) dvs.LZ = dvs.msw.LZ
    cntlzw r12,r8         # R12 = dvs.msw2.LZ
    addi   r29,r12,32     # dvs.LZ = dvs.msw2.LZ + 32
    bne    cr6,lab128_2_2_1   # if(dvs.msw2 != 0) dvs.LZ = 32 + dvs.msw2.LZ
    cntlzw r12,r9         # R12 = dvs.lsw.LZ
    addi   r29,r12,64     # dvs.LZ = dvs.lsw.LZ + 64
lab128_2_2_1:
    addi   r29,r29,32     # Extend the divisor to 4 words

# Count the number of leading 0s in the dividend (R0)
    cmpwi  cr0,r3,0       # dvd.msw == 0?
    cmpwi  cr2,r4,0       # dvd.msw2 == 0?
    cntlzw r0,r3          # R0 = dvd.msw.LZ
    bne    cr0,lab128_2_2 # if(dvd.msw != 0) dvd.LZ = dvd.msw.LZ
# Dividend has 32 <= LZ
    cmpwi  cr3,r5,0       # dvd.lsw1 == 0?
    cntlzw r12,r4         # R12 = dvd.msw2.LZ
    addi   r0,r12,32      # dvd.LZ = dvd.msw2.LZ + 32
    bne    cr2,lab128_2_2 # if(dvd.msw2 != 0) dvd.LZ = 32 + dvd.msw2.LZ
# Dividend has 64 <= LZ
    cntlzw r12,R5         # R12 = dvd.lsw1.LZ
    addi   r0,r12,64      # dvd.LZ = dvd.lsw1.LZ + 64
    bne    cr3,lab128_2_2 # if(dvd.lsw1 != 0) dvd.LZ = 64 + dvd.lsw1.LZ
# Dividend has LZ >= 96
    cntlzw r12,R6         # R12 = dvd.lsw.LZ
    addi   r0,r12,96      # dvd.LZ = dvd.lsw.LZ + 96
lab128_2_2:

# Determine the shift amounts to minimise the number of iterations
#   cmpw   cr0,r0,r29     # compare dvd.LZ to dvs.LZ
#   subfic r10,r0,128     # R10 = dvd.SD
#   bgt    cr0,lab128_2_14    # if(dvs > dvd) quotient = 0
#   addi   r29,r29,1      # ++dvs.LZ (or --dvs.SD)
#   subfic r29,r29,128    # R29 = dvs.SD
#   add    r0,r0,r29      # (dvd.LZ + dvs.SD) = left shift of dvd for initial dvd
#   subf   r29,r29,r10    # (dvd.SD - dvs.SD) = right shift of dvd for initial tmp
#   mtctr  r29            # number of iterations = dvd.SD - dvs.SD
# Optimisation
    cmpw   cr0,r0,r29     # compare dvd.LZ to dvs.LZ
    addi   r29,r29,1      # ++dvs.LZ (or --dvs.SD) as the shift is
                          # done at the beginning of the division loop
    bgt    cr0,lab128_2_14    # if(dvs > dvd) quotient = 0
    subf   r29,r0,r29     # (dvs.LZ - dvd.LZ + 1) = right shift of dvd for initial tmp
    subfic r0,r29,128     # 128 - right shift = left shift of dvd for initial dvd
    mtctr  r29            # number of iterations = dvs.LZ - dvd.LZ + 1

# R9:R10:R11:R12 = R3:R4:R5:R6 >> r29
# R9 will be 0 and is then available for other use!
# Shifting the 4 registers right by r29
#---------------------------------------
    cmpwi  cr7,r29,96     # compare r29 to 96
    cmpwi  cr6,r29,64     # compare r29 to 64
    cmpwi  cr5,r29,32     # compare r29 to 32
    bge    cr7,labSR3_2
    bge    cr6,labSR2_2
    bge    cr5,labSR1_2
# Shift < 32 bits
    srw    r12,r6,r29
    subfic r28,r29,32     # r28 = 32 - r29
    slw    r27,r5,r28
    or     r12,r27,r12
    srw    r11,r5,r29
    slw    r27,r4,r28
    or     r11,r27,r11
    srw    r10,r4,r29
    slw    r27,r3,r28
    or     r10,r27,r10
# R9 will be 0
#   srw    r9,r3,r29
    b      lab128_2_7

labSR1_2:                 # 32 <= Shift < 64
    subfic r28,r29,64     # r28 = 64 - r29
    addi   r29,r29,-32    # r29 = r29 - 32
    srw    r12,r5,r29
    slw    r27,r4,r28
    or     r12,r27,r12
    srw    r11,r4,r29
    slw    r27,r3,r28
    or     r11,r27,r11
    srw    r10,r3,r29
# R9 will be 0
#   li     r9,0
    b      lab128_2_7

labSR2_2:                 # 64 <= Shift < 96
    subfic r28,r29,96     # r28 = 96 - r29
    addi   r29,r29,-64    # r29 = r29 - 64
    srw    r12,r4,r29
    slw    r27,r3,r28
    or     r12,r27,r12
    srw    r11,r3,r29
# R9 and R10 will be 0
    li     R10,0
#   li     R9,0
    b      lab128_2_7

labSR3_2:                 # 96 <= Shift
    addi   r29,r29,-96    # r29 = r29 - 96
    srw    r12,r3,r29
    li     r11,0
# R9 and R10 will be 0
    li     r10,0
#   li     r9,0

lab128_2_7:
# R3:R4:R5:R6 = R3:R4:R5:R6 << R0
# Shifting the 4 registers left by R0
#---------------------------------------
    cmpwi  cr7,R0,96      # compare R0 to 96
    cmpwi  cr6,R0,64      # compare R0 to 64
    cmpwi  cr5,R0,32      # compare R0 to 32
    bge    cr7,labSL3_2
    bge    cr6,labSL2_2
    bge    cr5,labSL1_2
# Shift < 32 bits
    slw    r3,r3,r0
    subfic r28,r0,32      # r28 = 32 - r0
    srw    r27,r4,r28
    or     r3,r3,r27
    slw    r4,r4,r0
    srw    r27,r5,r28
    or     r4,r4,r27
    slw    r5,r5,r0
    srw    r27,r6,r28
    or     r5,r5,r27
    slw    r6,r6,r0
    b      lab128_2_11

labSL1_2:                 # 32 <= Shift < 64
    subfic r28,r0,64      # r28 = 64 - r0
    addic  r0,r0,-32      # r0 = r0 - 32; do not use addi: RA=R0 would be read as 0, giving r0 = -32
    slw    r3,r4,r0
    srw    r27,r5,r28
    or     r3,r3,r27
    slw    r4,r5,r0
    srw    r27,r6,r28
    or     r4,r4,r27
    slw    r5,r6,r0
    li     r6,0
    b      lab128_2_11

labSL2_2:                 # 64 <= Shift < 96
    subfic r28,r0,96      # r28 = 96 - r0
    addic  r0,r0,-64      # r0 = r0 - 64; do not use addi: RA=R0 would be read as 0, giving r0 = -64
    slw    r3,r5,R0
    srw    r27,r6,r28
    or     r3,r3,r27
    slw    r4,r6,R0
    li     r5,0
    li     r6,0
    b      lab128_2_11

labSL3_2:                 # 96 <= Shift
    addic  r0,r0,-96      # r0 = r0 - 96; do not use addi: RA=R0 would be read as 0, giving r0 = -96
    slw    r3,r6,r0
    li     R4,0
    li     R5,0
    li     R6,0
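#--------------------------------------------------------------------------
# Illustrative sketch (C fragment, not part of the build) of the three-word
# trial subtraction the loop below performs with subfc/subfe/subfe.: the
# borrow propagates upward word by word, and the final borrow/sign decides
# whether the subtraction is kept (tmp >= dvs) or discarded (restoring
# division). Variable names are hypothetical; word 0 is the most significant.
#
#   uint32_t d0, d1, d2;            /* difference words, msw..lsw order    */
#   int borrow;
#   uint64_t t;
#   t = (uint64_t)tmp[2] - dvs[2];               d2 = (uint32_t)t;
#   borrow = (int)((t >> 63) & 1);               /* subfc                  */
#   t = (uint64_t)tmp[1] - dvs[1] - borrow;      d1 = (uint32_t)t;
#   borrow = (int)((t >> 63) & 1);               /* subfe                  */
#   t = (uint64_t)tmp[0] - dvs[0] - borrow;      d0 = (uint32_t)t;
#   borrow = (int)((t >> 63) & 1);               /* subfe.                 */
#   if (!borrow) {                  /* tmp >= dvs: keep the result,        */
#       tmp[0] = d0; tmp[1] = d1; tmp[2] = d2;   /* quotient bit is 1      */
#   }                               /* else discard it: quotient bit is 0  */
#--------------------------------------------------------------------------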
lab128_2_11:
# Restoring division shift and subtract loop
    li     r29,-1         # R29 = -1
    addic  r7,r7,0        # clear the carry bit before the loop starts
lab128_2_12:
# tmp:dvd is considered one large register.
# Each portion is shifted left 1 bit by adding it to itself;
# adde adds the carry from the previous word and creates a new carry.
    adde   r6,r6,r6       # shift dvd.lsw4 left 1 bit
    adde   r5,r5,r5       # shift dvd.lsw3 left 1 bit
    adde   r4,r4,r4       # shift dvd.lsw2 left 1 bit
    adde   r3,r3,r3       # shift dvd.lsw1 left 1 bit
    adde   r12,r12,r12    # shift tmp.msw4 left 1 bit
    adde   r11,r11,r11    # shift tmp.msw3 left 1 bit
# R9 would be 0; it is used for the divisor instead!
    adde   r10,r10,r10    # shift tmp.msw2 left 1 bit
#   adde   r9,r9,r9       # shift tmp.msw1 left 1 bit
    subfc  r0,r9,r12      # tmp.lsw  - dvs.lsw
    subfe  r28,r8,r11     # tmp.msw2 - dvs.msw2
    subfe. r27,r7,r10     # tmp.msw  - dvs.msw
    blt    cr0,lab128_2_13    # if(result < 0) leave the carry bit cleared
    mr     r12,r0         # move lsw
    mr     r11,r28        # move msw2
    mr     r10,r27        # move msw
    addic  r0,r29,1       # set the carry bit
lab128_2_13:
    bdnz   lab128_2_12

# Write quotient and remainder
    adde   r6,r6,r6       # quo.lsw  (lsb = CA)
    adde   R5,R5,R5       # quo.lsw1 (lsb = CA)
    adde   R4,R4,R4       # quo.msw2 (lsb = CA)
    adde   R3,R3,R3       # quo.msw  (lsb from lsw)
#   mr     R8,R12         # rem.lsw
#   mr     R7,R11         # rem.msw
    b      finish_128_2
#   blr                   # return

lab128_2_14:              # Quotient is 0 (dvs > dvd)
    mr     R12,R6         # rem.lsw  = dvd.lsw
    mr     R11,R5         # rem.lsw1 = dvd.lsw1
    mr     R10,R4         # rem.msw  = dvd.msw2
    li     R6,0           # quo.lsw  = 0
    li     R5,0           # quo.lsw1 = 0
    li     R4,0           # quo.msw2 = 0
    li     R3,0           # quo.msw  = 0

finish_128_2:
# Write the results back into memory
    stw    r10,0(r31)
    stw    r11,4(r31)
    stw    r12,8(r31)
    stw    r3,0(r30)
    stw    r4,4(r30)
    stw    r5,8(r30)
    stw    r6,12(r30)
# Restore non volatile registers
    lwz    R27,-12(r1)
    lwz    R28,-16(r1)
    lwz    R29,-20(r1)
    lwz    r31,4(SP)
    mtcrf  2,r31
    mtcrf  3,r31
    lwz    r30,-4(r1)
    lwz    r31,-8(r1)
    blr

    end