#*****
#
# Michael Riff
# Multiprecision division version 1.0 February 2022.
# 64 bits division implemented in 2 versions:
# first version as in the PowerPC Compiler Writer's Guide
# second the divisor is shifted instead of the dividend.
# 96/64 and 128/64 bit divisions implemented as in the CWG
# with consideration of potential carry bit in shifted dividend
# past bit 63.
#
# Unnecessary saving of register R15 on stack suppressed in div96.
# Additionally R14 is only used in the extended case in div96ext
# March 22 clear renaming of routines and reordering in the files:
# div.s: complete algorithms with no value limitation 64:64, 96:64,
# 128:64 and 128:16 bits divisions.
# div2.s: optimised versions for highest bit of divisor=0: 96:63 and
# 128:63 bits divisions. In those cases the 32 and 64 upper bits in the
# shifted dividend are always 0, saving a couple of instructions.
# Added 128:95 division in which the 32 higher bits of shifted dividend
# are always 0.
#
# Addition of a 128:128 bits division routine in div.s.
#
# April 2024 corrected missing compare in div642 when shifting left.
# Simplification of the computation of the bit number to shift.
# Addition of a 128/96 bits division routine. This time the register
# shifting is done with reusable macros.
# Those macros are provided in 2 versions: one if the source and destination
# registers are the same and a second where they are different or may be the
# same by defining an assembler switch. If they are different, instruction
# reorderings enable small performance optimisations. 
# # ToDo: implement a 96:96 bits division routine # #****/ dialect PowerPC file 'div.s' # Only the sections can be exported (dos not work with labels) # export div64[DS] export .div64[PR] => 'div_64' # export div642[DS] export .div642[PR] => 'div_64_2' # export div96_64[DS] export .div96_64[PR] => 'div_96_64' # export div128_64[DS] export .div128_64[PR] => 'div_128_64' export .div128_128[PR] => 'div_128_128' export .div128_96[PR] => 'div_128_96' # TOC and transition vectors only necessary if cross TOC calls are implemented (separate code fragments) toc div_64: tc div64[TC],div64[DS] div_642: tc div642[TC],div642[DS] div_96: tc div96_64[TC], div96_64[DS] div_128: tc div128_64[TC], div128_64[DS] linkageArea: equ 24 calleesParams: set 0 calleesLocalVars: set 0 numFPRs: set 0 # For 64 bits dividend csect div64[DS]# Transition vector for div64 dc.l .div64[PR] # Address of div64 dc.l TOC[tc0] # TOC address of div64 # Non volatile registers used : r31, r30 CTR (+SP) Algorithm 0 numGPRs: set 4 spaceToSave: set linkageArea + CalleesParams + CalleesLocalVars + 4*numGPRs + 8*numFPRs # Stack must be 16 (OSX) or 8 byte aligned # spaceToSaveAligned set ((linkageArea + CalleesParams + CalleesLocalVars + 4*numGPRs + 8*numFPRs+15) & (-16)) spaceToSaveAligned set ((linkageArea + CalleesParams + CalleesLocalVars + 4*numGPRs + 8*numFPRs+7) & (-8)) csect .div64[PR] # Code section for div64 # Save used non volatile registers stw r30,-4(r1) stw r31,-8(r1) mfctr r30 # ctr volatile? stw r30,-12(SP) stwu SP, -spaceToSaveAligned(SP) # Optional as leaf routine # Store addresses of results mr r31,r6 # Address of remainder mr r30,r5 # Address of quotient # R5 is now free. Load divisor into R5 & R6 lwz r6,4(r4) lwz r5,0(r4) # R4 is now free. Load dividend into R3 & R4 lwz r4,4(r3) lwz r3,0(r3) # Algorithm # --------- # (R3:R4) = (R3:R4) / (R5:R6) (64b) = (64b / 64b) # quo dvd dvs # # Remainder is returned in R5:R6. # # Code comment notation: # msw = most-significant (high-order) word, i.e. 
bits 0..31 # lsw = least-significant (low-order) word, i.e. bits 32..63 # LZ = Leading Zeroes # SD = Significant Digits # # R3:R4 = dvd (input dividend); quo (output quotient) # R5:R6 = dvs (input divisor); rem (output remainder) # # R7:R8 = tmp # count the number of leading 0s in the dividend cmpwi cr0,R3,0 # dvd.msw == 0? cntlzw R0,R3 # R0 = dvd.msw.LZ cntlzw R9,R4 # R9 = dvd.lsw.LZ bne cr0,lab1 # if(dvd.msw != 0) dvd.LZ = dvd.msw.LZ addi R0,R9,32 # dvd.LZ = dvd.lsw.LZ + 32 lab1: # count the number of leading 0s in the divisor cmpwi cr0,R5,0 # dvd.msw == 0? cntlzw R9,R5 # R9 = dvs.msw.LZ cntlzw R10,R6 # R10 = dvs.lsw.LZ bne cr0,lab2 # if(dvs.msw != 0) dvs.LZ = dvs.msw.LZ addi R9,R10,32 # dvs.LZ = dvs.lsw.LZ + 32 lab2: # determine shift amounts to minimize the number of iterations # cmpw cr0,R0,R9 # compare dvd.LZ to dvs.LZ # subfic R10,R0,64 # R10 = dvd.SD # bgt cr0,lab9 # if(dvs > dvd) quotient = 0 # addi R9,R9,1 # ++dvs.LZ (or --dvs.SD) # subfic R9,R9,64 # R9 = dvs.SD # add R0,R0,R9 # (dvd.LZ + dvs.SD) = left shift of dvd for # # initialdvd # subf R9,R9,R10 # (dvd.SD - dvs.SD) = right shift of dvd for # # initialtmp # mtctr R9 # number of iterations = dvd.SD - dvs.SD # I don't know why in the cwg they made it so complicated. 
# This should also do cmpw cr0,r0,r9 # compare dvd.LZ to dvs.LZ addi r9,r9,1 # ++dvs.LZ (or --dvs.SD) as shift is # done at beginning of the division loop subf r9,r0,r9 # (dvs.LZ - dvd.LZ) = right shift of dvd for # initialtmp bgt cr0,lab9 # if(dvs > dvd) quotient = 0 subfic r0,r9,64 # 64-right shift = left shift of dvd for # initial dvd mtctr r9 # number of iterations = dvs.LZ - dvd.LZ + 1 # R7:R8 = R3:R4 >> R9 cmpwi cr0,R9,32 # compare R9 to 32 addi R7,R9,-32 blt cr0,lab3 # if(R9 < 32) jump to lab3 srw R8,R3,R7 # tmp.lsw = dvd.msw >> (R9 - 32) li R7,0 # tmp.msw = 0 b lab4 lab3: srw R8,R4,R9 # R8 = dvd.lsw >> R9 subfic R7,R9,32 slw R7,R3,R7 # R7 = dvd.msw << 32 - R9 or R8,R8,R7 # tmp.lsw = R8 | R7 srw R7,R3,R9 # tmp.msw = dvd.msw >> R9 lab4: # R3:R4 = R3:R4 << R0 cmpwi cr0,R0,32 # compare R0 to 32 addic R9,R0,-32 blt cr0,lab5 # if(R0 < 32) jump to lab5 slw R3,R4,R9 # dvd.msw = dvd.lsw << R9 li R4,0 # dvd.lsw = 0 b lab6 lab5: slw R3,R3,R0 # R3 = dvd.msw << R0 subfic R9,R0,32 srw R9,R4,R9 # R9 = dvd.lsw >> 32 - R0 or R3,R3,R9 # dvd.msw = R3 | R9 slw R4,R4,R0 # dvd.lsw = dvd.lsw << R0 lab6: # restoring division shift and subtract loop li R10,-1 # R10 = -1 addic R7,R7,0 # clear carry bit before loop starts lab7: # tmp:dvd is considered one large register # each portion is shifted left 1 bit by adding it to itself # adde sums the carry from the previous and creates a new carry adde R4,R4,R4 # shift dvd.lsw left 1 bit adde R3,R3,R3 # shift dvd.msw to left 1 bit adde R8,R8,R8 # shift tmp.lsw to left 1 bit adde R7,R7,R7 # shift tmp.msw to left 1 bit subfc R0,R6,R8 # tmp.lsw - dvs.lsw subfe. 
R9,R5,R7 # tmp.msw - dvs.msw blt cr0,lab8 # if(result< 0) clear carry bit mr R8,R0 # move lsw mr R7,R9 # move msw addic R0,R10,1 # set carry bit lab8: bdnz lab7 # write quotient and remainder adde R4,R4,R4 # quo.lsw (lsb = CA) adde R3,R3,R3 # quo.msw (lsb from lsw) mr R6,R8 # rem.lsw mr R5,R7 # rem.msw # blr # return b finish64_1 lab9: # Quotient is 0 (dvs > dvd) mr R6,R4 # rmd.lsw = dvd.lsw mr R5,R3 # rmd.msw = dvd.msw li R4,0 # dvd.lsw = 0 li R3,0 # dvd.msw = 0 # blr# return finish64_1: # Write results back into memory stw r3,0(r30) stw r4,4(r30) stw r5,0(r31) stw r6,4(r31) # Restore non volatile registers # lwz SP,0(SP) lwz r30,spaceToSaveAligned-12(SP) mtctr r30 lwz r30,spaceToSaveAligned-4(r1) lwz r31,spaceToSaveAligned-8(r1) addi SP,SP,spaceToSaveAligned # Optional as leaf routine blr # For 64 bits dividend, alternative csect div642[DS] csect .div642[PR] # Code section for div642 # Save used non volatile registers stw r30,-4(r1) stw r31,-8(r1) mfctr r30 # ctr volatile? stw r30,-12(SP) stwu SP, -spaceToSaveAligned(SP) # Optional as leaf routine # Store addresses of results mr r31,r6 # Address of remainder mr r30,r5 # Address of quotient # R5 is now free. Load divisor into R5 & R6 lwz r6,4(r4) lwz r5,0(r4) # R4 is now free. Load dividend into R3 & R4 lwz r4,4(r3) lwz r3,0(r3) # Algorithm #---------- # count the number of leading 0s in the dividend cmpwi cr0,R3,0 # dvd.msw == 0? cntlzw R0,R3 # R0 = dvd.msw.LZ cntlzw R9,R4 # R9 = dvd.lsw.LZ bne cr0,lab2_1 # if(dvd.msw != 0) dvd.LZ = dvd.msw.LZ addi R0,R9,32 # dvd.LZ = dvd.lsw.LZ + 32 lab2_1: # count the number of leading 0s in the divisor cmpwi cr0,R5,0 # dvd.msw == 0? 
cntlzw R9,R5 # R9 = dvs.msw.LZ cntlzw R10,R6 # R10 = dvs.lsw.LZ bne cr0,lab2_2 # if(dvs.msw != 0) dvs.LZ = dvs.msw.LZ addi R9,R10,32 # dvs.LZ = dvs.lsw.LZ + 32 lab2_2: # determine shift amounts to minimize the number of iterations # cmpw cr0,R0,R9 # compare dvd.LZ to dvs.LZ # subfic R10,R0,64 # R10 = dvd.SD # bgt cr0,lab2_9 # if(dvs > dvd) quotient = 0 # subfic R9,R9,64 # R9 = dvs.SD # subf R9,R9,R10 # (dvd.SD - dvs.SD) = left shift of dvsr for # addi R10,R9,1 # mtctr R10 # number of iterations = dvd.SD - dvs.SD + 1 # I don't know why in the cwg they made it so complicated. # This should also do cmpw cr0,r0,r9 # compare dvd.LZ to dvs.LZ addi r9,r9,1 # ++dvs.LZ (or --dvs.SD) as shift is # done at beginning of the division loop subf r9,r0,r9 # (dvs.LZ - dvd.LZ) = right shift of dvd for # initialtmp bgt cr0,lab2_9 # if(dvs > dvd) quotient = 0 subfic r0,r9,64 # 64-right shift = left shift of dvd for # initial dvd mtctr r9 # number of iterations = dvs.LZ - dvd.LZ +1 # R5:R6 = R5:R6 << R9 cmpwi cr0,R9,32 # compare R9 to 32 blt cr0,lab2_3 # if(R9 < 32) jump to lab3 addi R0,R9,-32 slw R5,R6,R0 # R5 = R6 << R9 - 32 li R6,0 b lab2_6 lab2_3: slw R5,R5,R9 # R5 << R9 subfic R0,R9,32 # R9 = 32 - R9 srw R10,R6,R0 # Save upper R9 bits of R6 slw R6,R6,R9 # R6 << R9 or R5,R5,R10 # Copy upper R9 bits from R6 into R5 lab2_6: # restoring division shift and subtract loop li R10,-1 # R10 = -1 addic R7,R7,0 # clear carry bit before loop starts mr R7,R3 mr R8,R4 li R3,0 li R4,0 lab2_7: # subtract divisor from remainung dividend subfc R0,R6,R8 # tmp.lsw - dvs.lsw subfe. 
R9,R5,R7 # tmp.msw - dvs.msw blt cr0,lab2_8 # if(result< 0) clear carry bit mr R8,R0 # move lsw mr R7,R9 # move msw addic R0,R10,1 # set carry bit lab2_8: # Push new quotient bit into R3:R4 adde R4,R4,R4 # quo.lsw (lsb = CA) adde R3,R3,R3 # quo.msw (lsb from lsw) # R5:R6 = R5:R6 >> 1 # rlwnm R9,R5,31,0,0 # Save low order bit of R5 into high bit of R9 # srwi R5,R5,1 # srwi R6,R6,1 # or R6,R6,R9 # Store low order bit of R5 into R6 srwi R6,R6,1 rlwimi R6,R5,31,0,0 # Transfer low order bit of R5 into high bit of R6 srwi R5,R5,1 bdnz lab2_7 # write remainder mr R6,R8 # rem.lsw mr R5,R7 # rem.msw #blr # return b finish64_2 lab2_9: # Quotient is 0 (dvs > dvd) mr R6,R4 # rmd.lsw = dvd.lsw mr R5,R3 # rmd.msw = dvd.msw li R4,0 # dvd.lsw = 0 li R3,0 # dvd.msw = 0 #blr# return finish64_2: # Write results back into memory stw r3,0(r30) stw r4,4(r30) stw r5,0(r31) stw r6,4(r31) # Restore non volatile registers # lwz SP,0(SP) lwz r30,spaceToSaveAligned-12(SP) mtctr r30 lwz r30,spaceToSaveAligned-4(r1) lwz r31,spaceToSaveAligned-8(r1) addi SP,SP,spaceToSaveAligned # Optional as leaf routine blr # For 96 bits dividend csect div96_64[DS] # This algorithm uses non volatile registers R31, R30 and R13, R14 (pure algorithm) CTR? 
# and CR fields 2,3 numGPRs: set 7 spaceToSave: set linkageArea + CalleesParams + CalleesLocalVars + 4*numGPRs + 8*numFPRs # Stack must be 16 (OSX) or 8 byte aligned # spaceToSaveAligned set ((linkageArea + CalleesParams + CalleesLocalVars + 4*numGPRs + 8*numFPRs+15) & (-16)) spaceToSaveAligned set ((linkageArea + CalleesParams + CalleesLocalVars + 4*numGPRs + 8*numFPRs+7) & (-8)) csect .div96_64[PR] # Code section for div96 # Save used non volatile registers stw R30,-4(r1) stw R31,-8(r1) mfctr r31 stw r31,-12(SP) mfcr r30 # stw r30,36(SP) stw r30,4(SP) # Condition register may be saved in linkage area stwu SP, -spaceToSaveAligned(SP) # Optional as leaf routine # Store addresses of results mr r31,r6 # Address of remainder mr r30,r5 # Address of quotient # R6 is now free. Load divisor into R6 & R7 lwz r6,0(r4) lwz r7,4(r4) # R4 is now free. Load dividend into R3, R4 R5 lwz r5,8(r3) lwz r4,4(r3) lwz r3,0(r3) # Algorithm #---------- # (R3:R4:R5) = (R3:R4:R5) / (R6:R7) (96b) = (96b / 64b) # quo dvd dvs # # Remainder is returned in R6:R7. # # Code comment notation: # msw = most-significant (high-order) word, i.e. bits 0..31 # msw2 = most-significant (middle-order) word, i.e. bits 32..63 # lsw = least-significant (low-order) word, i.e. bits 64..96 or 32..63 for the 64 bits divisor # LZ = Leading Zeroes # SD = Significant Digits # # R3:R4:R5 = dvd (input dividend); quo (output quotient) # R6:R7 = dvs (input divisor); rem (output remainder) # # R8:R9:R10 = tmp # Note: after alignment of the most significat digits betzeen dividend and divisor, register # R8 will always be 0. # Algorithm uses non volatile registers R13 to R15 # Save them stw R13,-4(r1) stw R14,-8(r1) stw R15,-12(r1) # Option use a multiple word store (remplacer R11->R29 .. R15->R25) # stmw R27,0(r1) # count the number of leading 0s in the divisor (R11) cmpwi cr0,R6,0 # dvd.msw == 0? 
cntlzw R11,R6 # R11 = dvs.msw.LZ cntlzw R12,R7 # R12 = dvs.lsw.LZ bne cr0,lab96_1 # if(dvs.msw != 0) dvs.LZ = dvs.msw.LZ addi R11,R12,32 # dvs.LZ = dvs.lsw.LZ + 32 lab96_1: addi r11,r11,32 # Extend divisor to 3 words # count the number of leading 0s in the dividend (R0) cmpwi cr0,R3,0 # dvd.msw == 0? cmpwi cr2,R4,0 # dvd.msw2 == 0? cntlzw R0,R3 # R0 = dvd.msw.LZ bne cr0,lab96_2 # if(dvd.msw != 0) dvd.LZ = dvd.msw.LZ # Dividend has 32 <= LZ cntlzw R12,R4 # R12 = dvd.lsw.LZ addi R0,R12,32 # dvd.LZ = dvd.lsw.LZ + 32 bne cr2,lab96_2 # if(dvd.msw2 != 0) dvd.LZ = 32+dvd.msw2.LZ # Dividend has 64 <= LZ cntlzw R12,R5 # R9 = dvd.lsw.LZ addi R0,R12,64 # dvd.LZ = dvd.lsw.LZ + 64 lab96_2: # determine shift amounts to minimize the number of iterations # cmpw cr0,R0,R11 # compare dvd.LZ to dvs.LZ # subfic R12,R0,96 # R12 = dvd.SD # bgt cr0,lab96_20 # if(dvs > dvd) quotient = 0 # # addi R11,R11,1 # ++dvs.LZ (or --dvs.SD) # subfic R11,R11,96 # R11 = dvs.SD # add R0,R0,R11 # (dvd.LZ + dvs.SD) = left shift of dvd for # # initialdvd ## Perform comparisons for comming algor here # cmpwi cr4,R0,32 # compare R0 to 32 # cmpwi cr5,R0,64 # compare R0 to 64 # subf R11,R11,R12 # (dvd.SD - dvs.SD) = right shift of dvd for # # initialtmp # cmpwi cr6,R11,32 # compare R11 to 32 # cmpwi cr7,R11,64 # compare R11 to 64 # mtctr R11 # number of iterations = dvd.SD - dvs.SD # Simpler cmpw cr0,r0,r11 # compare dvd.LZ to dvs.LZ addi r11,r11,1 # ++dvs.LZ (or --dvs.SD) as shift is # done at beginning of the division loop subf r11,r0,r11 # (dvs.LZ - dvd.LZ) = right shift of dvd for # initialtmp bgt cr0,lab96_20 # if(dvs > dvd) quotient = 0 # Perform comparisons for comming algor here cmpwi cr6,R11,32 # compare R11 to 32 cmpwi cr7,R11,64 # compare R11 to 64 subfic r0,r11,96 # 96-right shift = left shift of dvd for # initial dvd mtctr r11 # number of iterations = dvs.LZ - dvd.LZ +1 cmpwi cr4,R0,32 # compare R0 to 32 cmpwi cr5,R0,64 # compare R0 to 64 # R8:R9:R10 = R3:R4:R5 >> R11 # From here we distinguish 3 
cases: # Shift < 32: every register has to be shifted right by R11, and the R11 lower bits of the upper transferred. # 32 <= Shift < 64: the lowest register pushed out. The data to be shifted by R11-32 jumping one register. # otherwise, the 2 lower registers pushed out. Only R11-64 upper bits from the highest register to be put into the lowest. # The right shift of the 3 other registers is dependant on the left shift # Shift left >= 64 bits # Shift left 32 excl. to 64 incl. # Shift left <= 32 bits blt cr6,lab96_4 # if(R11 < 32) jump to lab4 blt cr7,lab96_5 # if(R11 < 64) jump to lab5 b lab96_6 # R11 >= 64 lab96_4: # First case srw R10,R5,R11 # R10 = dvd.lsw >> R11 subfic R12,R11,32 slw R13,R4,R12 # R13 = dvd.msw1 << 32 - R11 or R10,R10,R13 # tmp.lsw = R13 | R10 srw R9,R4,R11 # R9 = dvd.msw1 >> R11 slw R13,R3,R12 # R13 = dvd.msw << 32 - R11 or R9,R9,R13 # tmp.msw1 = R13 | R9 # R8 will be 0 srw R8,R3,R11 # tmp.msw = dvd.msw >> R11 b lab96_7 lab96_5: # Second case subfic R12,R11,64 # R12 = 64 - R11 addi R11,R11,-32 # R11 = R11 - 32 srw R10,R4,R11 # R10 = dvd.msw1 >> R11 - 32 slw R13,R3,R12 # R13 = dvd.msw << 64 - R11 or R10,R13,R10 # tmp.lsw = R14 | R10 srw R9,R3,R11 # tmp.msw1 = dvd.msv >> R11 - 32 # R8 will be 0 li R8,0 b lab96_7 lab96_6: # Third case addi R12,R11,-64 srw R10,R3,R12 # R10 = dvd.msw1 >> R11 - 64 # R8 will be 0 li R8,0 li R9,0 lab96_7: # R3:R4:R5 = R3:R4:R5 << R0 # same 3 cases as before blt cr4,lab96_8 # if(R0 < 32) jump to lab8 blt cr5,lab96_9 # if(R0 < 64) jump to lab9 b lab96_10 # R0 >= 64 lab96_8: # Shift left R3:R4:R5 by less than 32 bits! slw R3,R3,R0 # R3 = dvd.msw << R0 subfic R11,R0,32 srw R12,R4,R11 # R12 = dvd.msw1 >> 32 - R0 or R3,R3,R12 # dvd.msw = R3 | R12 slw R4,R4,R0 # dvd.msw1 = dvd.msw1 << R0 srw R12,R5,R11 # R12 = dvd.lsw >> 32 - R0 or R4,R4,R12 # dvd.msw1 = R4 | R12 slw R5,R5,R0 # dvd.lsw = dvd.lsw << R0 b lab96_11 lab96_9: # Shift left R3:R4:R5 by 32 incl. to 64 excl. bits! subfic R13,R0,64 addic R11,R0,-32 # Not use addi!!! 
=> R11=-32 slw R3,R4,R11 # R3 = dvd.lsw << R0 - 32 srw R12,R5,R13 # R12 = dvd.lsw >> 64 - R0 or R3,R3,R12 # R3 = R3 | R12 slw R4,R5,R11 # R4 = R5 << R0 - 32 li R5,0 b lab96_11 lab96_10: # Shift left R3:R4:R5 by more than 64 bits! addic R13,R0,-64 # Not use addi!!! 0=> R13=-64 slw R3,R5,R13 li R5,0 li R4,0 lab96_11: # restoring division shift and subtract loop li R11,-1 # R11 = -1 addic R7,R7,0 # clear carry bit before loop starts li R13,0 lab96_12: # tmp:dvd is considered one large register # each portion is shifted left 1 bit by adding it to itself # adde sums the carry from the previous and creates a new carry adde R5,R5,R5 # shift tmp.lsw3 left 1 bit adde R4,R4,R4 # shift tmp.lsw2 left 1 bit adde R3,R3,R3 # shift tmp.lsw1 to left 1 bit adde R10,R10,R10 # shift tmp.low to left 1 bit adde R9,R9,R9 # shift tmp.msw2 to left 1 bit # R8 will mostly be 0, but might get a carry into its lowest bit! adde R8,R8,R8 # shift tmp.high to left 1 bit subfc R0,R7,R10 # tmp.lsw - dvs.lsw # subfe. R12,R6,R9 # tmp.middle - dvs.msw subfe R12,R6,R9 # tmp.high - dvs.msw # In case a bit is carried into the higher word. Just take this possible # carry into account, by subtracting 0 from R8. subfe. R14,R13,R8 # tmp - 0 + carry blt cr0,lab96_13 # if(result< 0) clear carry bit mr R10,R0 # move lsw mr R9,R12 # move msw # R8 will then be 0 again. mr R8,R14 # or li R8,0 # Only lsb might be set addic R0,R11,1 # set carry bit lab96_13: bdnz lab96_12 # write quotient and remainder adde R5,R5,R5 # quo.lsw (lsb = CA) adde R4,R4,R4 # quo.msw1 (lsb = CA) adde R3,R3,R3 # quo.msw (lsb from lsw) mr R7,R10 # rem.lsw mr R6,R9 # rem.msw #blr # return b finish_96 lab96_20: # Quotient is 0 (dvs > dvd) mr R7,R5 # rmd.lsw = dvd.lsw mr R6,R4 # rmd.msw = dvd.msw1 li R5,0 # dvd.msw = 0 li R4,0 # dvd.lsw = 0 li R3,0 # dvd.msw = 0 #blr# return finish_96: # Restore non volatile registers lwz R13,-4(r1) lwz R14,-8(r1) lwz R15,-12(r1) # Option use a multiple word store (remplacer R11->R31 .. 
R15->R27 # lmw R27,0(r1) # Write results back into memory stw r3,0(r30) stw r4,4(r30) stw r5,8(r30) stw r6,0(r31) stw r7,4(r31) # Restore non volatile registers # lwz SP,0(SP) lwz r31,spaceToSaveAligned+4(SP) mtcrf 2,r31 mtcrf 4,r31 lwz r30,spaceToSaveAligned-12(SP) mtctr r30 lwz r30,spaceToSaveAligned-4(r1) lwz r31,spaceToSaveAligned-8(r1) addi SP,SP,spaceToSaveAligned # Optional as leaf routine blr # For 128 bits dividend csect div128_64[DS] # This algorithm uses non volatile registers R31, R30 and R29, R28, R27 (pure algorithm) # and CR fields 2,3 numGPRs: set 6 spaceToSave: set linkageArea + CalleesParams + CalleesLocalVars + 4*numGPRs + 8*numFPRs # Stack must be 16 (OSX) or 8 byte aligned # spaceToSaveAligned set ((linkageArea + CalleesParams + CalleesLocalVars + 4*numGPRs + 8*numFPRs+15) & (-16)) spaceToSaveAligned set ((linkageArea + CalleesParams + CalleesLocalVars + 4*numGPRs + 8*numFPRs+7) & (-8)) csect .div128_64[PR] # Code section for div128 # Save used non volatile registers stw R30,-4(r1) stw R31,-8(r1) mfcr r30 stw r30,4(SP) # Condition register may be saved in linkage area stwu SP, -spaceToSaveAligned(SP) # Optional as leaf routine # Store addresses of results mr r31,r6 # Address of remainder mr r30,r5 # Address of quotient # Load divisor into R7 & R8 lwz r7,0(r4) lwz r8,4(r4) # R6 & R4 are now free. Load dividend into R3, R4 R5 R6 lwz r6,12(r3) lwz r5,8(r3) lwz r4,4(r3) lwz r3,0(r3) # Algorithm #---------- # (R3:R4:R5:R6) = (R3:R4:R5:R6) / (R7:R8) (128b) = (128b / 64b) # quo dvd dvs # # Remainder is returned in R7:R8. # # Code comment notation: # msw = most-significant (high-order) word, i.e. bits 0..31 # msw2 = most-significant (middle-order) word, i.e. bits 32..63 # lsw1 = least-significant (low-order) word, i.e. bits 64..96 # lsw = least-significant (low-order) word, i.e. 
bits 97..128 or 32..63 for the 64 bits divisor # LZ = Leading Zeroes # SD = Significant Digits # # R3:R4:R5:R6 = dvd (input dividend); quo (output quotient) # R7:R8 = dvs (input divisor); rem (output remainder) # # R9:R10:R11:R12 = tmp # Note: after alignment of the most significat digits betzeen dividend and divisor, register # R9 and R10 will always be 0. # Algorithm uses non volatile registers R27 to R31 # Save them stw R27,-4(r1) stw R28,-8(r1) stw R29,-12(r1) # stw R30,-16(r1) done before # stw R31,-18(r1) done before # count the number of leading 0s in the divisor (R29) cmpwi cr0,R7,0 # dvd.msw == 0? cntlzw R29,R7 # R29 = dvs.msw.LZ cntlzw R12,R8 # R12 = dvs.lsw.LZ bne cr0,lab128_1 # if(dvs.msw != 0) dvs.LZ = dvs.msw.LZ addi R29,R12,32 # dvs.LZ = dvs.lsw.LZ + 32 lab128_1: addi r29,r29,64 # Extend divisor to 4 words # count the number of leading 0s in the dividend (R0) cmpwi cr0,R3,0 # dvd.msw == 0? cmpwi cr2,R4,0 # dvd.msw2 == 0? cntlzw R0,R3 # R0 = dvd.msw.LZ bne cr0,lab128_2 # if(dvd.msw != 0) dvd.LZ = dvd.msw.LZ # Dividend has 32 <= LZ cmpwi cr3,R5,0 # dvd.lsw1 == 0? 
cntlzw R12,R4                   # R12 = dvd.msw2.LZ
addi R0,R12,32                  # dvd.LZ = dvd.msw2.LZ + 32
bne cr2,lab128_2                # if(dvd.msw2 != 0) dvd.LZ = 32+dvd.msw2.LZ
# Dividend has 64 <= LZ
cntlzw R12,R5                   # R12 = dvd.lsw1.LZ
addi R0,R12,64                  # dvd.LZ = dvd.lsw1.LZ + 64
bne cr3,lab128_2                # if(dvd.lsw1 != 0) dvd.LZ = 64+dvd.lsw1.LZ
# Dividend has LZ >= 96
cntlzw R12,R6                   # R12 = dvd.lsw.LZ
addi R0,R12,96                  # dvd.LZ = dvd.lsw.LZ + 96
lab128_2:
# determine shift amounts to minimize the number of iterations
# (original CWG derivation kept for reference)
# cmpw cr0,R0,R29               # compare dvd.LZ to dvs.LZ
# subfic R10,R0,128             # R10 = dvd.SD
# bgt cr0,lab128_14             # if(dvs > dvd) quotient = 0
# addi R29,R29,1                # ++dvs.LZ (or --dvs.SD)
# subfic R29,R29,128            # R9 = dvs.SD
# add R0,R0,R29                 # (dvd.LZ + dvs.SD) = left shift of dvd for
#                               # initialdvd
# subf R29,R29,R10              # (dvd.SD - dvs.SD) = right shift of dvd for
#                               # initialtmp
# mtctr R29                     # number of iterations = dvd.SD - dvs.SD
# Simpler
cmpw cr0,r0,r29                 # compare dvd.LZ to dvs.LZ
addi r29,r29,1                  # ++dvs.LZ (or --dvs.SD) as shift is
                                # done at beginning of the division loop
subf r29,r0,r29                 # (dvs.LZ - dvd.LZ +1) = right shift of dvd for
                                # initialtmp
# BUGFIX: the quotient-zero exit of THIS routine is lab128_14.
# The previous target, lab128_2_14, belongs to div128_128 and stores/restores
# a different register set (the commented-out derivation above also says lab128_14).
bgt cr0,lab128_14               # if(dvs > dvd) quotient = 0
subfic r0,r29,128               # 128-right shift = left shift of dvd for
                                # initial dvd
mtctr r29                       # number of iterations = dvs.LZ - dvd.LZ +1
# R9:R10:R11:R12 = R3:R4:R5:R6 >> r29
# Shifting right 4 registers wxyz by r29
#---------------------------------------
cmpwi cr7,r29,96                # compare r29 to 96
cmpwi cr6,r29,64                # compare r29 to 64
cmpwi cr5,r29,32                # compare r29 to 32
bge cr7,labSR3
bge cr6,labSR2
bge cr5,labSR1
# Shift < 32 bits
srw r12,r6,r29
subfic r28,r29,32               # r28 = 32 - r29
slw r27,r5,r28
or r12,r27,r12
srw r11,r5,r29
slw r27,r4,r28
or r11,r27,r11
# R9 and R10 will be 0 (R9 is zeroed later at lab128_11)
# srw r10,r4,r29
# slw r27,r3,r28
# or r10,r27,r10
li R10,0
# srw r9,r3,r29
b lab128_7
labSR1:
# 32 <= Shift < 64
subfic r28,r29,64               # r28 = 64 - r29
addi r29,r29,-32                # r29 = r29 - 32
srw r12,r5,r29
slw r27,r4,r28
or r12,r27,r12
srw r11,r4,r29
slw r27,r3,r28
or r11,r27,r11
#
R9 and R10 will be 0 # srw r10,r3,r29 li R10,0 # li r9,0 b lab128_7 labSR2: # 64 <= Shift < 96 subfic r28,r29,96 # r28 = 96 - r29 addi r29,r29,-64 # r29 = r29 - 64 srw r12,r4,r29 slw r27,r3,r28 or r12,r27,r12 srw r11,r3,r29 # R9 and R10 will be 0 li R10,0 # li R9,0 b lab128_7 labSR3: # 96 <= Shift addi r29,r29,-96 # r29 = r29 - 96 srw r12,r3,r29 li r11,0 # R9 and R10 will be 0 li r10,0 # li r9,0 lab128_7: # R3:R4:R5:R6 = R3:R4:R5:R6 << R0 # Shifting left 4 registers wxyz by R0 #--------------------------------------- cmpwi cr7,R0,96 # compare R0 to 96 cmpwi cr6,R0,64 # compare R0 to 64 cmpwi cr5,R0,32 # compare R0 to 32 bge cr7,labSL3 bge cr6,labSL2 bge cr5,labSL1 # Shift < 32 bits slw r3,r3,r0 subfic r28,r0,32 # r28 = 32 - r0 srw r27,r4,r28 or r3,r3,r27 slw r4,r4,r0 srw r27,r5,r28 or r4,r4,r27 slw r5,r5,r0 srw r27,r6,r28 or r5,r5,r27 slw r6,r6,r0 b lab128_11 labSL1: # 32 <= Shift < 64 subfic r28,r0,64 # r28 = 64 - r0 addic r0,r0,-32 # r0 = r0 - 32 Not use addi!!! 0=> r0=-64 slw r3,r4,r0 srw r27,r5,r28 or r3,r3,r27 slw r4,r5,r0 srw r27,r6,r28 or r4,r4,r27 slw r5,r6,r0 li r6,0 b lab128_11 labSL2: # 64 <= Shift < 96 subfic r28,r0,96 # r28 = 96 - r0 addic r0,r0,-64 # r0 = r0 - 64 Not use addi!!! 0=> r0=-64 slw r3,r5,R0 srw r27,r6,r28 or r3,r3,r27 slw r4,r6,R0 li r5,0 li r6,0 b lab128_11 labSL3: # 96 <= Shift addic r0,r0,-96 # r0 = r0 - 96 Not use addi!!! 
0=> r0=-96 slw r3,r6,r0 li R4,0 li R5,0 li R6,0 lab128_11: # restoring division shift and subtract loop li R29,-1 # R29 = -1 addic R7,R7,0 # clear carry bit before loop starts li r9,0 lab128_12: # tmp:dvd is considered one large register # each portion is shifted left 1 bit by adding it to itself # adde sums the carry from the previous and creates a new carry adde R6,R6,R6 # shift dvd.lsw4 left 1 bit adde R5,R5,R5 # shift dvd.lsw3 left 1 bit adde R4,R4,R4 # shift dvd.lsw2 left 1 bit adde R3,R3,R3 # shift dvd.lsw1 to left 1 bit adde R12,R12,R12 # shift tmp.msw4 to left 1 bit adde R11,R11,R11 # shift tmp.msw3 to left 1 bit # R9 and R10 will be 0, but R10 might get a carry into its lowest bit! adde R10,R10,R10 # shift tmp.middle_u to left 1 bit # adde R9,R9,R9 # shift tmp.high to left 1 bit subfc R0,R8,R12 # tmp.lsw - dvs.lsw # subfe. R28,R7,R11 # tmp.msw - dvs.msw subfe R28,R7,R11 # tmp.msw - dvs.msw # In case a bit is carried into the higher word. Just take this possible # carry into account, by subtracting 0 from R10. subfe. 
R27,R9,R10 # tmp - 0 + carry blt cr0,lab128_13 # if(result< 0) clear carry bit mr R12,R0 # move lsw mr R11,R28 # move msw # R10 will thne be 0 agin mr R10,R27 # or li R10,0 # Only lsb might be set addic R0,R29,1 # set carry bit lab128_13: bdnz lab128_12 # write quotient and remainder adde R6,R6,R6 # quo.lsw (lsb = CA) adde R5,R5,R5 # quo.sw_l (lsb = CA) adde R4,R4,R4 # quo.sw_u (lsb = CA) adde R3,R3,R3 # quo.msw (lsb from lsw) # mr R8,R12 # rem.lsw # mr R7,R11 # rem.msw b finish_128 #blr # return lab128_14: # Quotient is 0 (dvs > dvd) mr R12,R6 # rmd.lsw mr R11,R5 # rmd.msw li R6,0 # dvd.msw = 0 li R5,0 # dvd.msw = 0 li R4,0 # dvd.lsw = 0 li R3,0 # dvd.msw = 0 finish_128: # Write results back into memory stw r11,0(r31) stw r12,4(r31) stw r3,0(r30) stw r4,4(r30) stw r5,8(r30) stw r6,12(r30) # Restore non vol registers lwz R27,-4(r1) lwz R28,-8(r1) lwz R29,-12(r1) # lwz R30,-16(r1) # lwz R31,-18(r1) # Restore non volatile registers lwz r31,spaceToSaveAligned+4(SP) mtcrf 2,r31 mtcrf 3,r31 lwz r30,spaceToSaveAligned-4(r1) lwz r31,spaceToSaveAligned-8(r1) addi SP,SP,spaceToSaveAligned # Optional as leaf routine blr # For 128 bits dividend and divisor csect div128_128[DS] # This algorithm uses non volatile registers R31, R30 and R29, R28, R27 (pure algorithm) # and CR fields 2,3 numGPRs: set 6 spaceToSave: set linkageArea + CalleesParams + CalleesLocalVars + 4*numGPRs + 8*numFPRs # Stack must be 16 (OSX) or 8 byte aligned # spaceToSaveAligned set ((linkageArea + CalleesParams + CalleesLocalVars + 4*numGPRs + 8*numFPRs+15) & (-16)) spaceToSaveAligned set ((linkageArea + CalleesParams + CalleesLocalVars + 4*numGPRs + 8*numFPRs+7) & (-8)) csect .div128_128[PR] # Code section for div128_128 # Save used non volatile registers stw R30,-4(r1) stw R31,-8(r1) # mfcr r30 # stw r30,4(SP) # Condition register may be saved in linkage area # Store addresses of results mr r31,r6 # Address of remainder mr r30,r5 # Address of quotient # Load divisor into R7, R8, R9 & R10 lwz r7,0(r4) lwz 
r8,4(r4)
lwz r9,8(r4)
lwz r10,12(r4)
# R6 & R4 are now free. Load dividend into R3, R4, R5, R6
lwz r6,12(r3)
lwz r5,8(r3)
lwz r4,4(r3)
lwz r3,0(r3)
# Algorithm
#----------
# (R3:R4:R5:R6) = (R3:R4:R5:R6) / (R7:R8:R9:R10) (128b) = (128b / 128b)
#  quo            dvd             dvs
#
# Remainder is computed in R11:R12:R13:R14 and written to memory at finish_128_2.
#
# Code comment notation:
# msw  = most-significant (high-order) word, i.e. bits 0..31
# msw2 = most-significant (middle-order) word, i.e. bits 32..63
# lsw1 = least-significant (low-order) word, i.e. bits 64..95
# lsw  = least-significant (low-order) word, i.e. bits 96..127
# LZ = Leading Zeroes
# SD = Significant Digits
#
# R3:R4:R5:R6  = dvd (input dividend) quo (output quotient)
# R7:R8:R9:R10 = dvs (input divisor)
#
# R11:R12:R13:R14 = tmp / rem
# Algorithm uses non volatile registers R13, R14, R26 to R29 (and R30, R31 saved earlier)
# Save them in the red zone below SP (this routine creates no stack frame)
stw r26,-32(r1)
stw r27,-12(r1)
stw r28,-16(r1)
stw r29,-20(r1)
stw r13,-24(r1)
stw r14,-28(r1)
# count the number of leading 0s in the divisor (result in R29)
cmpwi cr5,r7,0                  # dvs.msw == 0?
cmpwi cr6,r8,0                  # dvs.msw2 == 0?
cntlzw r29,r7                   # R29 = dvs.msw.LZ
bne cr5,lab128_2_1              # if(dvs.msw != 0) dvs.LZ = dvs.msw.LZ
# Divisor has 32 <= LZ
cmpwi cr7,r9,0                  # dvs.lsw1 == 0?
cntlzw r27,r8                   # R27 = dvs.msw2.LZ
addi r29,r27,32                 # dvs.LZ = dvs.msw2.LZ + 32
bne cr6,lab128_2_1              # if(dvs.msw2 != 0) dvs.LZ = 32+dvs.msw2.LZ
# Divisor has 64 <= LZ
cntlzw r28,r9                   # R28 = dvs.lsw1.LZ
addi r29,r28,64                 # dvs.LZ = dvs.lsw1.LZ + 64
bne cr7,lab128_2_1              # if(dvs.lsw1 != 0) dvs.LZ = 64+dvs.lsw1.LZ
# Divisor has LZ >= 96
cntlzw r28,r10                  # R28 = dvs.lsw.LZ
# BUGFIX: the count above lands in R28; the old code read R12, which is
# uninitialised at this point, giving a garbage dvs.LZ for small divisors.
addi r29,r28,96                 # dvs.LZ = dvs.lsw.LZ + 96
lab128_2_1:
# count the number of leading 0s in the dividend (result in R0)
cmpwi cr0,r3,0                  # dvd.msw == 0?
cmpwi cr5,r4,0                  # dvd.msw2 == 0?
cntlzw r0,r3                    # R0 = dvd.msw.LZ
bne cr0,lab128_2_2              # if(dvd.msw != 0) dvd.LZ = dvd.msw.LZ
# Dividend has 32 <= LZ
cmpwi cr6,r5,0                  # dvd.lsw1 == 0?
# --- Continuation of the 128/64-bit division routine (prologue is above this chunk).
# Finish counting the dividend leading zeroes into r0; cr5/cr6 were set
# from the dividend words by the code above.
    cntlzw r12,r4                   # R12 = dvd.msw2.LZ
    addi r0,r12,32                  # dvd.LZ = dvd.msw2.LZ + 32
    bne cr5,lab128_2_2              # if(dvd.msw2 != 0) dvd.LZ = 32+dvd.msw2.LZ
# Dividend has 64 <= LZ
    cntlzw r12,r5                   # R12 = dvd.lsw1.LZ
    addi r0,r12,64                  # dvd.LZ = dvd.lsw1.LZ + 64
    bne cr6,lab128_2_2              # if(dvd.lsw1 != 0) dvd.LZ = 64+dvd.lsw1.LZ
# Dividend has LZ >= 96
    cntlzw r12,r6                   # R12 = dvd.lsw.LZ
    addi r0,r12,96                  # dvd.LZ = dvd.lsw.LZ + 96
lab128_2_2:
# Determine shift amounts to minimize the number of iterations.
# Original CWG-style computation, kept for reference:
#    cmpw cr0,r0,r29                # compare dvd.LZ to dvs.LZ
#    subfic r28,r0,128              # R28 = dvd.SD
#    bgt cr0,lab128_2_14            # if(dvs > dvd) quotient = 0
#    addi r29,r29,1                 # ++dvs.LZ (or --dvs.SD)
#    subfic r29,r29,128             # R29 = dvs.SD
#    add r0,r0,r29                  # (dvd.LZ + dvs.SD) = left shift of dvd for initial dvd
#    subf r29,r29,r28               # (dvd.SD - dvs.SD) = right shift of dvd for initial tmp
#    mtctr r29                      # number of iterations = dvd.SD - dvs.SD
# Simpler:
    cmpw cr0,r0,r29                 # compare dvd.LZ to dvs.LZ
    addi r29,r29,1                  # ++dvs.LZ (or --dvs.SD) as shift is
                                    # done at beginning of the division loop
    subf r29,r0,r29                 # (dvs.LZ - dvd.LZ + 1) = right shift of dvd
                                    # for initial tmp
    bgt cr0,lab128_2_14             # if(dvs > dvd) quotient = 0
    subfic r0,r29,128               # 128 - right shift = left shift of dvd
                                    # for initial dvd
    mtctr r29                       # number of iterations = dvs.LZ - dvd.LZ + 1
# R11:R12:R13:R14 = R3:R4:R5:R6 >> r29 (initial tmp)
# Shifting right 4 registers wxyz by r29
#---------------------------------------
    cmpwi cr7,r29,96                # compare r29 to 96
    cmpwi cr6,r29,64                # compare r29 to 64
    cmpwi cr5,r29,32                # compare r29 to 32
    bge cr7,labSR3_2
    bge cr6,labSR2_2
    bge cr5,labSR1_2
# Shift < 32 bits
    srw r14,r6,r29
    subfic r28,r29,32               # r28 = 32 - r29
    slw r27,r5,r28
    or r14,r27,r14
    srw r13,r5,r29
    slw r27,r4,r28
    or r13,r27,r13
    srw r12,r4,r29
    slw r27,r3,r28
    or r12,r27,r12
    srw r11,r3,r29
    b lab128_2_7
labSR1_2:
# 32 <= Shift < 64
    subfic r28,r29,64               # r28 = 64 - r29
    addi r29,r29,-32                # r29 = r29 - 32
    srw r14,r5,r29
    slw r27,r4,r28
    or r14,r27,r14
    srw r13,r4,r29
    slw r27,r3,r28
    or r13,r27,r13
    srw r12,r3,r29
    li r11,0
    b lab128_2_7
labSR2_2:
# 64 <= Shift < 96
    subfic r28,r29,96               # r28 = 96 - r29
    addi r29,r29,-64                # r29 = r29 - 64
    srw r14,r4,r29
    slw r27,r3,r28
    or r14,r27,r14
    srw r13,r3,r29
    li R12,0
    li R11,0
    b lab128_2_7
labSR3_2:
# 96 <= Shift
    addi r29,r29,-96                # r29 = r29 - 96
    srw r14,r3,r29
    li r13,0
    li r12,0
    li r11,0
lab128_2_7:
# R3:R4:R5:R6 = R3:R4:R5:R6 << R0 (initial dvd)
# Shifting left 4 registers wxyz by R0
#---------------------------------------
    cmpwi cr7,r0,96                 # compare R0 to 96
    cmpwi cr6,r0,64                 # compare R0 to 64
    cmpwi cr5,r0,32                 # compare R0 to 32
    bge cr7,labSL3_2
    bge cr6,labSL2_2
    bge cr5,labSL1_2
# Shift < 32 bits
    slw r3,r3,r0
    subfic r28,r0,32                # r28 = 32 - r0
    srw r27,r4,r28
    or r3,r3,r27
    slw r4,r4,r0
    srw r27,r5,r28
    or r4,r4,r27
    slw r5,r5,r0
    srw r27,r6,r28
    or r5,r5,r27
    slw r6,r6,r0
    b lab128_2_11
labSL1_2:
# 32 <= Shift < 64
    subfic r28,r0,64                # r28 = 64 - r0
    addic r0,r0,-32                 # r0 = r0 - 32. Must NOT use addi here:
                                    # addi reads rA=r0 as the constant 0, giving r0 = -32
    slw r3,r4,r0
    srw r27,r5,r28
    or r3,r3,r27
    slw r4,r5,r0
    srw r27,r6,r28
    or r4,r4,r27
    slw r5,r6,r0
    li r6,0
    b lab128_2_11
labSL2_2:
# 64 <= Shift < 96
    subfic r28,r0,96                # r28 = 96 - r0
    addic r0,r0,-64                 # r0 = r0 - 64 (addic, not addi: rA=r0 reads as 0)
    slw r3,r5,R0
    srw r27,r6,r28
    or r3,r3,r27
    slw r4,r6,R0
    li r5,0
    li r6,0
    b lab128_2_11
labSL3_2:
# 96 <= Shift
    addic r0,r0,-96                 # r0 = r0 - 96 (addic, not addi: rA=r0 reads as 0)
    slw r3,r6,r0
    li r4,0
    li r5,0
    li r6,0
lab128_2_11:
# Restoring division: shift and subtract loop
    li R29,-1                       # R29 = -1 (source for setting the carry bit)
    addic R7,R7,0                   # clear carry bit before loop starts (R7+0 never carries)
lab128_2_12:
# tmp:dvd is considered one large register.
# Each portion is shifted left 1 bit by adding it to itself;
# adde sums the carry from the previous add and creates a new carry.
    adde R6,R6,R6                   # shift dvd.lsw4 left 1 bit
    adde R5,R5,R5                   # shift dvd.lsw3 left 1 bit
    adde R4,R4,R4                   # shift dvd.lsw2 left 1 bit
    adde R3,R3,R3                   # shift dvd.lsw1 left 1 bit
    adde R14,R14,R14                # shift tmp.lsw left 1 bit
    adde R13,R13,R13                # shift tmp left 1 bit
    adde R12,R12,R12                # shift tmp left 1 bit
    adde R11,R11,R11                # shift tmp.msw left 1 bit
# 128-bit subtract tmp - dvs (divisor held in R7:R8:R9:R10)
    subfc R0,R10,R14                # tmp.lsw - dvs.lsw
    subfe R28,R9,R13
    subfe R27,R8,R12
    subfe. R26,R7,R11               # tmp.msw - dvs.msw, sets CR0
    blt cr0,lab128_2_13             # if(result < 0) keep tmp; CA is already 0 (borrow)
    mr R14,R0                       # tmp = tmp - dvs
    mr R13,R28
    mr R12,R27
    mr R11,R26
    addic R0,R29,1                  # set carry bit (-1 + 1 carries): quotient bit = 1
lab128_2_13:
    bdnz lab128_2_12
# Write quotient: shift in the last quotient bit (CA)
    adde R6,R6,R6                   # quo.lsw (lsb = CA)
    adde R5,R5,R5                   # quo.sw_l (lsb = CA)
    adde R4,R4,R4                   # quo.sw_u (lsb = CA)
    adde R3,R3,R3                   # quo.msw (lsb from lsw)
    b finish_128_2
#    blr                            # return
lab128_2_14:
# Quotient is 0 (dvs > dvd): remainder = dividend, quotient = 0
    mr R14,R6                       # rmd.lsw
    mr R13,R5                       # rmd.m_l_w
    mr R12,R4                       # rmd.m_h_w
    mr R11,R3                       # rmd.msw
    li R6,0                         # quo = 0
    li R5,0
    li R4,0
    li R3,0
finish_128_2:
# Write results back into memory
    stw r11,0(r31)                  # remainder (address in R31)
    stw r12,4(r31)
    stw r13,8(r31)
    stw r14,12(r31)
    stw r3,0(r30)                   # quotient (address in R30)
    stw r4,4(r30)
    stw r5,8(r30)
    stw r6,12(r30)
# Restore non volatile registers.
# NOTE(review): the matching stw save sequence is above this chunk —
# verify these frame offsets agree with it.
    lwz r26,-32(r1)
    lwz r27,-12(r1)
    lwz r28,-16(r1)
    lwz r29,-20(r1)
    lwz r13,-24(r1)
    lwz r14,-28(r1)
# Restore non volatile registers
    lwz r30,-4(r1)
    lwz r31,-8(r1)
    blr
# Last added a 128 b/96 b division. Maybe useful in special cases.
# This time the shifts are implemented as macros that can be reused.
#
# 4 Register shifting macro implementation:
# Shift is in range [0;128[
# Shifting right 4 registers wxyz (&Rw = most significant word,
# &Rz = least significant word), in place.
# Source and target registers are the same.
# Uses CR5 to CR7 and modifies &Rtp1, &Rtp2, &Rnum !
#----------------------------------------------------
# Shifting right 4 registers wxyz with Rnum.
# R10, R11 used for temp data
MACRO ShiftRight &Rw,&Rx,&Ry,&Rz,&Rtp1,&Rtp2,&Rnum
    cmpwi cr7,&Rnum,96              # compare &Rnum to 96
    cmpwi cr6,&Rnum,64              # compare &Rnum to 64
    cmpwi cr5,&Rnum,32              # compare &Rnum to 32
    bge cr7,Mlab_SR3
    bge cr6,Mlab_SR2
    bge cr5,Mlab_SR1
.;  Shift < 32 bits: work from the low word upwards so no source is
.;  overwritten before it has been consumed (in-place shift).
    srw &Rz,&Rz,&Rnum
    subfic &Rtp1,&Rnum,32           # &Rtp1 = 32 - &Rnum
    slw &Rtp2,&Ry,&Rtp1
    or &Rz,&Rtp2,&Rz
    srw &Ry,&Ry,&Rnum
    slw &Rtp2,&Rx,&Rtp1
    or &Ry,&Rtp2,&Ry
    srw &Rx,&Rx,&Rnum
    slw &Rtp2,&Rw,&Rtp1
    or &Rx,&Rtp2,&Rx
    srw &Rw,&Rw,&Rnum
    b Mlab_SR_end
Mlab_SR1:
.;  32 <= Shift < 64
    subfic &Rtp1,&Rnum,64           # &Rtp1 = 64 - &Rnum
    addic &Rnum,&Rnum,-32           # &Rnum = &Rnum - 32
    srw &Rz,&Ry,&Rnum
    slw &Rtp2,&Rx,&Rtp1
    or &Rz,&Rtp2,&Rz
    srw &Ry,&Rx,&Rnum
    slw &Rtp2,&Rw,&Rtp1
    or &Ry,&Rtp2,&Ry
    srw &Rx,&Rw,&Rnum
    li &Rw,0
    b Mlab_SR_end
Mlab_SR2:
.;  64 <= Shift < 96
    subfic &Rtp1,&Rnum,96           # &Rtp1 = 96 - &Rnum
    addic &Rnum,&Rnum,-64           # &Rnum = &Rnum - 64
    srw &Rz,&Rx,&Rnum
    slw &Rtp2,&Rw,&Rtp1
    or &Rz,&Rtp2,&Rz
    srw &Ry,&Rw,&Rnum
    li &Rx,0
    li &Rw,0
    b Mlab_SR_end
Mlab_SR3:
.;  96 <= Shift
    addic &Rnum,&Rnum,-96           # &Rnum = &Rnum - 96
    li &Ry,0
    li &Rx,0
    srw &Rz,&Rw,&Rnum
    li &Rw,0
Mlab_SR_end:                        # BUGFIX: was "MlabSR_end:" (missing underscore) —
                                    # the three "b Mlab_SR_end" branches had no target
    ENDM
# Shifting left 4 registers wxyz
# Source and target registers are the same
# Uses CR5 to CR7 and modifies &Rtp1, &Rtp2, &Rnum !
#----------------------------------------------------
# Shifting left 4 registers wxyz by &Rnum, in place
# (&Rw = most significant word, &Rz = least significant word).
# Shift count is in range [0;128[.
# Uses CR5 to CR7 and modifies &Rtp1, &Rtp2 and &Rnum.
MACRO ShiftLeft &Rw,&Rx,&Ry,&Rz,&Rtp1,&Rtp2,&Rnum
    cmpwi cr7,&Rnum,96              # compare &Rnum to 96
    cmpwi cr6,&Rnum,64              # compare &Rnum to 64
    cmpwi cr5,&Rnum,32              # compare &Rnum to 32
    bge cr7,Mlab_SL3
    bge cr6,Mlab_SL2
    bge cr5,Mlab_SL1
.;  Shift < 32 bits: work from the high word downwards so no source is
.;  overwritten before it has been consumed (in-place shift).
    slw &Rw,&Rw,&Rnum
    subfic &Rtp1,&Rnum,32           # &Rtp1 = 32 - &Rnum
    srw &Rtp2,&Rx,&Rtp1
    or &Rw,&Rw,&Rtp2
    slw &Rx,&Rx,&Rnum
    srw &Rtp2,&Ry,&Rtp1
    or &Rx,&Rx,&Rtp2
    slw &Ry,&Ry,&Rnum
    srw &Rtp2,&Rz,&Rtp1
    or &Ry,&Ry,&Rtp2
    slw &Rz,&Rz,&Rnum
    b Mlab_SL_end
Mlab_SL1:
.;  32 <= Shift < 64
    subfic &Rtp1,&Rnum,64           # &Rtp1 = 64 - &Rnum
    addic &Rnum,&Rnum,-32           # &Rnum = &Rnum - 32
    slw &Rw,&Rx,&Rnum
    srw &Rtp2,&Ry,&Rtp1
    or &Rw,&Rw,&Rtp2
    slw &Rx,&Ry,&Rnum
    srw &Rtp2,&Rz,&Rtp1
    or &Rx,&Rx,&Rtp2
    slw &Ry,&Rz,&Rnum
    li &Rz,0
    b Mlab_SL_end
Mlab_SL2:
.;  64 <= Shift < 96
    subfic &Rtp1,&Rnum,96           # &Rtp1 = 96 - &Rnum
    addic &Rnum,&Rnum,-64           # &Rnum = &Rnum - 64
    slw &Rw,&Ry,&Rnum
    srw &Rtp2,&Rz,&Rtp1
    or &Rw,&Rw,&Rtp2
    slw &Rx,&Rz,&Rnum
    li &Ry,0
    li &Rz,0
    b Mlab_SL_end
Mlab_SL3:
.;  96 <= Shift
    addic &Rnum,&Rnum,-96           # &Rnum = &Rnum - 96
    li &Rx,0
    li &Ry,0
    slw &Rw,&Rz,&Rnum
    li &Rz,0
Mlab_SL_end:
    ENDM
# Shifting right 4 registers abcd
# Source and target registers are different
#-----------------------------------------
# Arguments ra,rb,rc,rd,ri,rj,rk,rl,rtp,rs,rsh. Additionally to ri-rl modifies rtp, rs and rsh!
# we may have ra,rb,rc,rd = ri,rj,rk,rl if switch ORDER_OPTIMISATION is not defined!
# Defining this switch allows reordering instructions
# 4 registers have to be shifted right by &rsh bits (&ra = most
# significant source word, &rd = least significant; result in &ri-&rl).
# We have 4 cases: right shift <32, >=32 & <64, >=64 & <96 and >=96
MACRO SHIFT_RIGHT &ra,&rb,&rc,&rd,&ri,&rj,&rk,&rl,&rtp,&rs,&rsh
    cmpwi cr1,&rsh,32
    cmpwi cr5,&rsh,64
    cmpwi cr6,&rsh,96
    blt cr1, lab_SR2_3
    blt cr5, lab_SR2_2
    blt cr6, lab_SR2_1
    IFDEF ORDER_OPTIMISATION ; Here ra,rb,rc,rd = ri,rj,rk,rl is not allowed
.;  RiRjRkRl = RaRbRcRd >> Rsh (Shift >= 96)
.;  Reordered variant: the clears are interleaved before the shift to
.;  break dependencies, so source and target registers must differ.
    addic &rs,&rsh,-96
    xor &ri,&ri,&ri
    xor &rj,&rj,&rj
    xor &rk,&rk,&rk
    srw &rl,&ra,&rs
    b lab_SR2_end
lab_SR2_1:
.;  RiRjRkRl = RaRbRcRd >> Rsh (Shift >= 64)
    addic &rs,&rsh,-64
    xor &ri,&ri,&ri
    subfic &rsh,&rsh,96
    xor &rj,&rj,&rj
    srw &rl,&rb,&rs
    slw &rtp,&ra,&rsh
    or &rl,&rtp,&rl
    srw &rk,&ra,&rs
    b lab_SR2_end
lab_SR2_2:
.;  RiRjRkRl = RaRbRcRd >> Rsh (Shift >= 32)
    addic &rs,&rsh,-32
    xor &ri,&ri,&ri
    subfic &rsh,&rsh,64
    srw &rl,&rc,&rs
    slw &rtp,&rb,&rsh
    or &rl,&rtp,&rl
    srw &rk,&rb,&rs
    slw &rtp,&ra,&rsh
    or &rk,&rtp,&rk
    srw &rj,&ra,&rs
    b lab_SR2_end
    ELSE
.;  RiRjRkRl = RaRbRcRd >> Rsh (Shift >= 96)
.;  Safe variant: each source word is fully consumed before the register
.;  that may alias it is written, so ri-rl may equal ra-rd.
    addic &rs,&rsh,-96
    xor &rj,&rj,&rj
    xor &rk,&rk,&rk
    srw &rl,&ra,&rs
    xor &ri,&ri,&ri
    b lab_SR2_end
lab_SR2_1:
.;  RiRjRkRl = RaRbRcRd >> Rsh (Shift >= 64)
    addic &rs,&rsh,-64
    subfic &rsh,&rsh,96
    srw &rl,&rb,&rs
    xor &rj,&rj,&rj
    slw &rtp,&ra,&rsh
    or &rl,&rtp,&rl
    srw &rk,&ra,&rs
    xor &ri,&ri,&ri
    b lab_SR2_end
lab_SR2_2:
.;  RiRjRkRl = RaRbRcRd >> Rsh (Shift >= 32)
    addic &rs,&rsh,-32
    subfic &rsh,&rsh,64
    srw &rl,&rc,&rs
    slw &rtp,&rb,&rsh
    or &rl,&rtp,&rl
    srw &rk,&rb,&rs
    slw &rtp,&ra,&rsh
    or &rk,&rtp,&rk
    srw &rj,&ra,&rs
    xor &ri,&ri,&ri
    b lab_SR2_end
    ENDIF
lab_SR2_3:
.;  RiRjRkRl = RaRbRcRd >> Rsh (Shift < 32)
    subfic &rs,&rsh,32
    srw &rl,&rd,&rsh
    slw &rtp,&rc,&rs
    or &rl,&rtp,&rl
    srw &rk,&rc,&rsh
    slw &rtp,&rb,&rs
    or &rk,&rtp,&rk
    srw &rj,&rb,&rsh
    slw &rtp,&ra,&rs
    or &rj,&rtp,&rj
    srw &ri,&ra,&rsh
lab_SR2_end:
    ENDM
# Arguments ra,rb,rc,rd,rw,rx,ry,rz,rtp,rs,rsh. Additionally to rw-rz modifies rtp, rs and rsh!
# we may have ra,rb,rc,rd = rw,rx,ry,rz if switch ORDER_OPTIMISATION is not defined!
# Defining this switch allows reordering instructions
# 4 registers have to be shifted left by &rsh bits (&ra = most
# significant source word, &rd = least significant; result in &rw-&rz).
# We have 4 cases: left shift <32, >=32 & <64, >=64 & <96 and >=96
MACRO SHIFT_LEFT &ra,&rb,&rc,&rd,&rw,&rx,&ry,&rz,&rtp,&rs,&rsh
    cmpwi cr6,&rsh,96
    cmpwi cr5,&rsh,64
    cmpwi cr1,&rsh,32
    bge cr6, lab_SL2_3
    bge cr5, lab_SL2_2
    bge cr1, lab_SL2_1
.;  RwRxRyRz = RaRbRcRd << Rsh (Shift < 32)
    subfic &rs,&rsh,32
    slw &rw,&ra,&rsh
    srw &rtp,&rb,&rs
    or &rw,&rtp,&rw
    slw &rx,&rb,&rsh
    srw &rtp,&rc,&rs
    or &rx,&rtp,&rx
    slw &ry,&rc,&rsh
    srw &rtp,&rd,&rs
    or &ry,&rtp,&ry
    slw &rz,&rd,&rsh
    b lab_SL2_end
    IFDEF ORDER_OPTIMISATION ; Here ra,rb,rc,rd = rw,rx,ry,rz is not allowed
.;  NOTE(review): both SHIFT_LEFT variants below are currently identical;
.;  no reordering optimisation has been implemented here yet.
lab_SL2_1:
.;  RwRxRyRz = RaRbRcRd << Rsh (32 <= Shift < 64)
    addic &rsh,&rsh,-32
    subfic &rs,&rsh,32
    slw &rw,&rb,&rsh
    srw &rtp,&rc,&rs
    or &rw,&rtp,&rw
    slw &rx,&rc,&rsh
    srw &rtp,&rd,&rs
    or &rx,&rtp,&rx
    slw &ry,&rd,&rsh
    xor &rz,&rz,&rz
    b lab_SL2_end
lab_SL2_2:
.;  RwRxRyRz = RaRbRcRd << Rsh (64 <= Shift < 96)
    addic &rsh,&rsh,-64
    subfic &rs,&rsh,32
    slw &rw,&rc,&rsh
    srw &rtp,&rd,&rs
    or &rw,&rtp,&rw
    slw &rx,&rd,&rsh
    xor &ry,&ry,&ry
    xor &rz,&rz,&rz
    b lab_SL2_end
    ELSE
lab_SL2_1:
.;  RwRxRyRz = RaRbRcRd << Rsh (32 <= Shift < 64)
    addic &rsh,&rsh,-32
    subfic &rs,&rsh,32
    slw &rw,&rb,&rsh
    srw &rtp,&rc,&rs
    or &rw,&rtp,&rw
    slw &rx,&rc,&rsh
    srw &rtp,&rd,&rs
    or &rx,&rtp,&rx
    slw &ry,&rd,&rsh
    xor &rz,&rz,&rz
    b lab_SL2_end
lab_SL2_2:
.;  RwRxRyRz = RaRbRcRd << Rsh (64 <= Shift < 96)
    addic &rsh,&rsh,-64
    subfic &rs,&rsh,32
    slw &rw,&rc,&rsh
    srw &rtp,&rd,&rs
    or &rw,&rtp,&rw
    slw &rx,&rd,&rsh
    xor &ry,&ry,&ry
    xor &rz,&rz,&rz
    b lab_SL2_end
    ENDIF
lab_SL2_3:
.;  RwRxRyRz = RaRbRcRd << Rsh (Shift >= 96)
    addic &rs,&rsh,-96
    xor &rx,&rx,&rx
    xor &ry,&ry,&ry
    slw &rw,&rd,&rs
    xor &rz,&rz,&rz
lab_SL2_end:
    ENDM
# For test writing into source registers

    csect div128_96[DS]             # Transition vector for div128_96
    dc.l .div128_96[PR]             # Address of div128_96
    dc.l TOC[tc0]                   # TOC address of div128_96

# Non volatile registers used: r26-r31, CTR (+SP). Algorithm 0
numGPRs: set 4
spaceToSave: set linkageArea + CalleesParams + CalleesLocalVars + 4*numGPRs + 8*numFPRs
# Stack must be 16 (OSX) or 8 byte aligned
# spaceToSaveAligned set ((linkageArea + CalleesParams + CalleesLocalVars + 4*numGPRs + 8*numFPRs+15) & (-16))
spaceToSaveAligned set ((linkageArea + CalleesParams + CalleesLocalVars + 4*numGPRs + 8*numFPRs+7) & (-8))

    csect .div128_96[PR]            # Code section for div128_96
# Parameters:
#   r3: Address of quad word dividend
#   r4: Address of triple word divisor
#   r5: Address of quotient (quad word)
#   r6: Address of remainder (triple word)
# Save the result addresses below SP (leaf routine, no frame established)
    stw r5,-4(r1)                   # quotient address
    stw r6,-8(r1)                   # remainder address
# Load quad word dividend argument from memory: R5,R6,R7,R8
    lwz r8,12(r3)
    lwz r7,8(r3)
    lwz r6,4(r3)
    lwz r5,0(r3)
# Load divisor from memory: R10,R11,R12
    lwz r12,8(r4)
    lwz r11,4(r4)
    lwz r10,0(r4)
# Algorithm
#----------
# (R5:R6:R7:R8) = (R5:R6:R7:R8) / (R10:R11:R12) (128b) = (128b / 96b)
#     quo             dvd              dvs
#
# Remainder is returned in R29:R30:R31.
#
# Code comment notation:
#   msw  = most-significant (high-order) word, i.e. bits 0..31
#   msw2 = most-significant (middle-order) word, i.e. bits 32..63
#   lsw1 = least-significant (middle-order) word, i.e. bits 64..95
#   lsw  = least-significant (low-order) word, i.e. bits 96..127
#   LZ   = Leading Zeroes
#   SD   = Significant Digits
#
# R5:R6:R7:R8      = dvd (input dividend), quo (output quotient)
# R10:R11:R12      = dvs (input divisor)
# R28:R29:R30:R31  = tmp (shifted dividend head / running remainder)
# Algorithm uses non volatile registers R26 to R31: save them
    stw r28,-12(r1)
    stw r29,-16(r1)
    stw r30,-20(r1)
    stw r31,-24(r1)
    stw r26,-28(r1)
    stw r27,-32(r1)
# Determine shift
# Count the number of leading 0s in the dividend (r0)
    cmpi cr7,r5,0
    cntlzw r0,r5
    cntlzw r9,r6
    bne cr7,lab_128_96_dvd          # if(dvd.msw != 0) dvd.LZ = dvd.msw.LZ
    addi r0,r9,32
    cmpi cr7,r6,0
    bne cr7,lab_128_96_dvd          # if(dvd.msw2 != 0) dvd.LZ = 32+dvd.msw2.LZ
    cntlzw r9,r7
    addi r0,r9,64
    cmpi cr7,r7,0
    bne cr7,lab_128_96_dvd          # if(dvd.lsw1 != 0) dvd.LZ = 64+dvd.lsw1.LZ
    cntlzw r9,r8
    addic r0,r9,96                  # dvd.LZ = 96+dvd.lsw.LZ
lab_128_96_dvd:
# Count the number of leading 0s in the divisor (r9).
# NOTE(review): a divisor of 0 is not guarded against — confirm callers
# never pass one.
    cmpi cr6,r10,0
    cntlzw r9,r10
    cntlzw r4,r11
    bne cr6,lab_128_96_dvs
    addi r9,r4,32
    cmpi cr6,r11,0
    bne cr6,lab_128_96_dvs
    cntlzw r4,r12
    addic r9,r4,64
lab_128_96_dvs:
# Calculate how many bits the dividend must be shifted
    addi r9,r9,32                   # extend divisor LZ count to 128 bits
    cmpw cr7,r0,r9                  # compare dvd.LZ to dvs.LZ
    addic r9,r9,1                   # ++dvs.LZ: shift is done at loop head
    subf r0,r0,r9                   # r0 = dvs.LZ - dvd.LZ + 1 = right shift / iterations
    bgt cr7, lab_128_96_end         # if(dvd.LZ > dvs.LZ) quotient = 0
    mtctr r0                        # number of iterations
    subfic r9, r0,128               # r9 = 128 - r0 = left shift of dvd
# Perform right shifting; the target registers are different from the source registers
    SHIFT_RIGHT r5,r6,r7,r8,r28,r29,r30,r31,r26,r27,r0
# Perform left shifting: target and source registers are the same
    ShiftLeft r5,r6,r7,r8,r26,r27,r9
lab128_96_11:
# Restoring division: shift and subtract loop
    li r3,-1                        # r3 = -1 (source for setting the carry bit)
    addic r7,r7,0                   # clear carry bit before loop starts (r7+0 never carries)
    li r4,0                         # constant 0 for the top-word subtraction
lab128_96_12:
# tmp:dvd is considered one large register.
# Each portion is shifted left 1 bit by adding it to itself;
# adde sums the carry from the previous add and creates a new carry.
    adde r8,r8,r8                   # shift dvd.lsw left 1 bit
    adde r7,r7,r7                   # shift dvd.lsw1 left 1 bit
    adde r6,r6,r6                   # shift dvd.msw2 left 1 bit
    adde r5,r5,r5                   # shift dvd.msw left 1 bit, carry out into tmp
    adde r31,r31,r31                # shift tmp.lsw left 1 bit
    adde r30,r30,r30                # shift tmp.sw left 1 bit
    adde r29,r29,r29                # shift tmp.msw left 1 bit
# r28 will be 0, but might get a carry into its lowest bit!
    adde r28,r28,r28
    subfc r0,r12,r31                # tmp.lsw - dvs.lsw
    subfe r9,r11,r30                # tmp.sw  - dvs.sw
    subfe r26,r10,r29               # tmp.msw - dvs.msw
# In case a bit was carried into the higher word: subtract 0 from r28
# with borrow to take it into account; the dot form sets CR0.
    subfe. r27,r4,r28               # tmp.high - 0 - borrow
    blt cr0,lab128_96_13            # if(result < 0) keep tmp; CA is already 0 (borrow)
    mr r31,r0                       # tmp = tmp - dvs
    mr r30,r9
    mr r29,r26
    mr r28,r27                      # only the lsb might be set; becomes 0 again
    addic r0,r3,1                   # set carry bit (-1 + 1 carries): quotient bit = 1
lab128_96_13:
    bdnz lab128_96_12
# Write quotient: shift in the last quotient bit (CA)
    adde r8,r8,r8                   # quo.lsw (lsb = CA)
    adde r7,r7,r7                   # quo.sw_l (lsb = CA)
    adde r6,r6,r6                   # quo.sw_u (lsb = CA)
    adde r5,r5,r5                   # quo.msw (lsb from lsw)
# Write results back into memory
# Quotient (quad word)
    lwz r26,-4(r1)
    stw r5,0(r26)
    stw r6,4(r26)
    stw r7,8(r26)
    stw r8,12(r26)
# Remainder (triple word) is the final tmp
    lwz r27,-8(r1)
    stw r29,0(r27)
    stw r30,4(r27)
    stw r31,8(r27)
# Restore non volatile registers
    lwz r28,-12(r1)
    lwz r29,-16(r1)
    lwz r30,-20(r1)
    lwz r31,-24(r1)
    lwz r26,-28(r1)
    lwz r27,-32(r1)
    blr
lab_128_96_end:
# Dividend has less significant bits than divisor:
# quotient is 0, remainder is the dividend.
# (dvd.LZ > dvs.LZ >= 32 here, so the top dividend word r5 is 0 and the
# low 3 words r6:r7:r8 hold the whole value.)
    xor r10,r10,r10
    lwz r12,-4(r1)                  # quotient address
    stw r10,0(r12)
    stw r10,4(r12)
    stw r10,8(r12)
    stw r10,12(r12)
    lwz r11,-8(r1)                  # remainder address
# BUGFIX: the remainder is a triple word stored at offsets 0/4/8
# (previously 4/8/12, which left word 0 stale and wrote one word past
# the 12-byte remainder buffer).
    stw r6,0(r11)
    stw r7,4(r11)
    stw r8,8(r11)
    blr

    end