#*****
#
# Michael Riff
# Multiprecision version 1.0 February 2022.
# Test code for the bit shifting of the divisions.
#
#****/

        dialect PowerPC
        file 'test.s'

# Only the sections can be exported (does not work with labels)
#
        export Test_Add[DS]
        export .Test_Add[PR] => 'test_add'
#
        export Test_Subt[DS]
        export .Test_Subt[PR] => 'test_sub'
#
        export test642[DS]
        export .test642[PR] => 'test_64_2'
#
        export test96[DS]
        export .test96[PR] => 'test_96'
#
        export test128[DS]
        export .test128[PR] => 'test_128'

# TOC and transition vectors only necessary if cross TOC calls are
# implemented (separate code fragments)
        toc
test_64: tc test642[TC],test642[DS]

# Stack frame layout constants (PowerPC ABI linkage area is 24 bytes)
linkageArea: equ 24
calleesParams: set 0
calleesLocalVars: set 0
numFPRs: set 0

# Tests to check the setting of overflow and carry bits in XER
# =============================================================

# Test_Add: R3 = R3 + R4 with carry/overflow recording ('addco');
# the XER SO/OV/CA bits (mask 0xE000_0000) are stored to 0(R5).
        csect Test_Add[DS]
        csect .Test_Add[PR]
# Reset OVF in XER
# NOTE(review): 0x3FFF0000 plus sign-extended 0xFFFF (= -1) yields mask
# 0x3FFEFFFF, not 0x3FFFFFFF - bit 16 is cleared too; confirm intended mask.
        addis r10,r0,0x3FFF
        addi r10,r10,0xFFFF
        mfspr r0,XER
        and r0,r0,r10
        mtspr xer,r0
        addco r3,r3,r4          # add with carry and overflow recording
# Handle overflow (and carry): keep only the top three XER bits (SO, OV, CA)
        mfspr r6,XER
        andis. r6,r6,0xE000
        stw r6,0(r5)
        blr

# Test_Subt: R3 = R3 - R4 with carry/overflow recording ('subfco');
# the XER SO/OV/CA bits (mask 0xE000_0000) are stored to 0(R5).
        csect Test_Subt[DS]
        csect .Test_Subt[PR]
# Reset OVF in XER (same mask construction as Test_Add - see note there)
        addis r10,r0,0x3FFF
        addi r10,r10,0xFFFF
        mfspr r0,XER
        and r0,r0,r10
        mtspr xer,r0
        subfco r3,r4,r3         # subtract with carry and overflow recording
# Handle overflow (and carry)
        mfspr r6,XER
        andis. r6,r6,0xE000
        stw r6,0(r5)
        blr

# Tests to check the shifting over more than 1 register as used in the
# multiprecision divisions
# =============================================================================

# For 64 bits dividend, alternative
        csect test642[DS]
# Non volatile registers used : r31, r30, CTR (+SP). Algorithm 0.
numGPRs: set 4
# NOTE(review): CalleesParams/CalleesLocalVars differ in case from the
# calleesParams/calleesLocalVars definitions above - relies on a
# case-insensitive assembler; verify.
spaceToSave: set linkageArea + CalleesParams + CalleesLocalVars + 4*numGPRs + 8*numFPRs
# Stack must be 16 (OSX) or 8 byte aligned
# spaceToSaveAligned set ((linkageArea + CalleesParams + CalleesLocalVars + 4*numGPRs + 8*numFPRs+15) & (-16))
spaceToSaveAligned set ((linkageArea + CalleesParams + CalleesLocalVars + 4*numGPRs + 8*numFPRs+7) & (-8))

# test642: shift test for the 64-bit dividend case.
# In: R3 -> 2-word dividend, R4 -> 2-word divisor, R5/R6 -> output buffers.
# Writes the shifted divisor (R5:R6) to 0/4(r30) and R9/R0 to 0/4(r31).
        csect .test642[PR]
# Code section for div642
# Save used non volatile registers
        stw r30,-4(r1)
        stw r31,-8(r1)
        mfctr r30               # CTR is volatile - saved here only for the test
        stw r30,-12(SP)
        stwu SP, -spaceToSaveAligned(SP)        # Optional as leaf routine
# Store addresses of results
        mr r31,r6               # Address of remainder
        mr r30,r5               # Address of quotient
# R5 is now free. Load divisor into R5 & R6
        lwz r6,4(r4)
        lwz r5,0(r4)
# R4 is now free. Load dividend into R3 & R4
        lwz r4,4(r3)
        lwz r3,0(r3)
# Algorithm
#----------
# count the number of leading 0s in the dividend
        cmpwi cr0,R3,0          # dvd.msw == 0?
        cntlzw R0,R3            # R0 = dvd.msw.LZ
        cntlzw R9,R4            # R9 = dvd.lsw.LZ
        bne cr0,lab2_1          # if(dvd.msw != 0) dvd.LZ = dvd.msw.LZ
        addi R0,R9,32           # dvd.LZ = dvd.lsw.LZ + 32
lab2_1:
# count the number of leading 0s in the divisor
        cmpwi cr0,R5,0          # dvs.msw == 0?
# Divisor leading-zero count continues (cr0 holds the dvs.msw == 0 compare)
        cntlzw R9,R5            # R9 = dvs.msw.LZ
        cntlzw R10,R6           # R10 = dvs.lsw.LZ
        bne cr0,lab2_2          # if(dvs.msw != 0) dvs.LZ = dvs.msw.LZ
        addi R9,R10,32          # dvs.LZ = dvs.lsw.LZ + 32
lab2_2:
# determine shift amounts to minimize the number of iterations
        cmpw cr0,R0,R9          # compare dvd.LZ to dvs.LZ
        subfic R10,R0,64        # R10 = dvd.SD
        bgt cr0,lab2_9          # if(dvs > dvd) quotient = 0
        subfic R9,R9,64         # R9 = dvs.SD
        subf R9,R9,R10          # (dvd.SD - dvs.SD) = left shift of dvsr
        mtctr R9                # number of iterations = dvd.SD - dvs.SD
# R5:R6 = R5:R6 << R9
        addi R0,R9,-32          # R0 = R9 - 32 (addi does NOT set cr0)
# NOTE(review): cr0 still holds the dvd.LZ/dvs.LZ compare at this branch, not
# R9-32 as the comment implies; `addic. R0,R9,-32` was probably intended.
        blt cr0,lab2_3          # if(R9 < 32) jump to lab2_3
        slw R5,R6,R0            # R5 = R6 << (R9 - 32)
        li R6,0
        b lab2_6
lab2_3:
        slw R5,R5,R9            # R5 <<= R9
        subfic R0,R9,32         # R0 = 32 - R9
        srw R10,R6,R0           # Save upper R9 bits of R6
        slw R6,R6,R9            # R6 <<= R9
        or R5,R5,R10            # Copy upper R9 bits from R6 into R5
lab2_6:
        mr R7,R3
        mr R8,R4
        li R3,0
        li R4,0
lab2_7:
# R5:R6 = R5:R6 >> 1 (one iteration per CTR count)
        rlwnm R9,R5,31,0,0      # Save low order bit of R5 into high bit of R9
        srwi R5,R5,1
        srwi R6,R6,1
        or R6,R6,R9             # Store low order bit of R5 into R6
        bdnz lab2_7
        b finish64_2
lab2_9:                         # Quotient is 0 (dvs > dvd)
        mr R6,R4                # rmd.lsw = dvd.lsw
        mr R5,R3                # rmd.msw = dvd.msw
        li R4,0                 # quo.lsw = 0
        li R3,0                 # quo.msw = 0
# blr                           # return
finish64_2:
# Write results back into memory (shifted divisor + shift amounts)
        stw r5,0(r30)
        stw r6,4(r30)
        stw r9,0(r31)
        stw r0,4(r31)
# Restore non volatile registers
# lwz SP,0(SP)
        lwz r30,spaceToSaveAligned-12(SP)
        mtctr r30
        lwz r30,spaceToSaveAligned-4(r1)
        lwz r31,spaceToSaveAligned-8(r1)
        addi SP,SP,spaceToSaveAligned   # Optional as leaf routine
        blr

# For 96 bits dividend
        csect test96[DS]
# This algorithm uses non volatile registers R31, R30 and R13, R14, R15
# (pure algorithm) CTR?
# and CR fields 2,3
numGPRs: set 7
spaceToSave: set linkageArea + CalleesParams + CalleesLocalVars + 4*numGPRs + 8*numFPRs
# Stack must be 16 (OSX) or 8 byte aligned
# spaceToSaveAligned set ((linkageArea + CalleesParams + CalleesLocalVars + 4*numGPRs + 8*numFPRs+15) & (-16))
spaceToSaveAligned set ((linkageArea + CalleesParams + CalleesLocalVars + 4*numGPRs + 8*numFPRs+7) & (-8))

# test96: shift test for the 96-bit dividend case (96b / 64b division shifts).
# In: R3 -> 3-word dividend, R4 -> 2-word divisor, R5/R6 -> output buffers.
# In this TEST variant r31 is redirected to the dividend buffer below.
        csect .test96[PR]
# Code section for test96
# Save used non volatile registers
        stw R30,-4(R1)
        stw R31,-8(R1)
        mfctr r31
        stw r31,-12(SP)
        mfcr r30
# stw r30,36(SP)
        stw r30,4(SP)           # Condition register may be saved in linkage area
        stwu SP, -spaceToSaveAligned(SP)        # Optional as leaf routine
# Store addresses of results
        mr r31,r6               # Address of remainder
        mr r30,r5               # Address of quotient
# TEST ONLY
# Keep reference on dividend (instead of quotient)
        mr r31,r3               # Address of dividend!!! Instead of remainder
# END
# R6 is now free. Load divisor into R6 & R7
        lwz r6,0(r4)
        lwz r7,4(r4)
# R4 is now free. Load dividend into R3, R4, R5
        lwz r5,8(r3)
        lwz r4,4(r3)
        lwz r3,0(r3)
# Algorithm
#----------
# (R3:R4:R5) = (R3:R4:R5) / (R6:R7)   (96b) = (96b / 64b)
#     quo          dvd         dvs
#
# Remainder is returned in R6:R7.
#
# Code comment notation:
#   msw  = most-significant (high-order) word, i.e. bits 0..31
#   msw2 = most-significant (middle-order) word, i.e. bits 32..63
#   lsw  = least-significant (low-order) word, i.e. bits 64..95
#          (or 32..63 for the 64 bits divisor)
#   LZ   = Leading Zeroes
#   SD   = Significant Digits
#
# R3:R4:R5 = dvd (input dividend); quo (output quotient)
# R6:R7    = dvs (input divisor);  rem (output remainder)
#
# R8:R9:R10 = tmp
# Algorithm uses non volatile registers R13 to R15
# Save them
# NOTE(review): these stores land below the just-allocated frame (negative
# offsets from the new SP, i.e. the red zone) - verify this is safe here.
        stw R13,-4(R1)
        stw R14,-8(R1)
        stw R15,-12(R1)
# Option: use a multiple word store (replace R11->R29 .. R15->R25)
# stmw R27,0(R1)
# count the number of leading 0s in the divisor (R11)
        cmpwi cr0,R6,0          # dvs.msw == 0?
# Divisor leading-zero count continues (cr0 holds the dvs.msw == 0 compare)
        cntlzw R11,R6           # R11 = dvs.msw.LZ
        cntlzw R12,R7           # R12 = dvs.lsw.LZ
        bne cr0,lab96_1         # if(dvs.msw != 0) dvs.LZ = dvs.msw.LZ
        addi R11,R12,32         # dvs.LZ = dvs.lsw.LZ + 32
lab96_1:
        addi r11,r11,32         # Extend divisor to 3 words
# count the number of leading 0s in the dividend (R0)
        cmpwi cr0,R3,0          # dvd.msw == 0?
        cmpwi cr2,R4,0          # dvd.msw2 == 0?
        cntlzw R0,R3            # R0 = dvd.msw.LZ
        bne cr0,lab96_2         # if(dvd.msw != 0) dvd.LZ = dvd.msw.LZ
# Dividend has 32 <= LZ
        cntlzw R12,R4           # R12 = dvd.msw2.LZ
        addi R0,R12,32          # dvd.LZ = dvd.msw2.LZ + 32
        bne cr2,lab96_2         # if(dvd.msw2 != 0) dvd.LZ = 32+dvd.msw2.LZ
# Dividend has 64 <= LZ
        cntlzw R12,R5           # R12 = dvd.lsw.LZ
        addi R0,R12,64          # dvd.LZ = dvd.lsw.LZ + 64
lab96_2:
# determine shift amounts to minimize the number of iterations
        cmpw cr0,R0,R11         # compare dvd.LZ to dvs.LZ
        subfic R12,R0,96        # R12 = dvd.SD
        bgt cr0,lab96_20        # if(dvs > dvd) quotient = 0
        addi R11,R11,1          # ++dvs.LZ (or --dvs.SD)
        subfic R11,R11,96       # R11 = dvs.SD
        add R0,R0,R11           # (dvd.LZ + dvs.SD) = left shift of dvd for initial dvd
# Perform the comparisons for the coming algorithm here
        cmpwi cr4,R0,32         # compare R0 to 32
        cmpwi cr5,R0,64         # compare R0 to 64
        subf R11,R11,R12        # (dvd.SD - dvs.SD) = right shift of dvd for initial tmp
        cmpwi cr6,R11,32        # compare R11 to 32
        cmpwi cr7,R11,64        # compare R11 to 64
        mtctr R11               # number of iterations = dvd.SD - dvs.SD
# TEST ONLY
# write Shifts into low quotient
# mr r8,r3
# mr r9,r4
# stw r0,4(r30)
# stw r11,8(r30)
# mr r10,r5
# b Test_96
# END
# R8:R9:R10 = R3:R4:R5 >> R11
# From here we distinguish 3 cases:
#   Shift < 32:       every register is shifted right by R11, and the R11 lower
#                     bits of the upper register transferred.
#   32 <= Shift < 64: the lowest register is pushed out; the data is shifted by
#                     R11-32, jumping one register.
#   otherwise:        the 2 lower registers are pushed out; only the R11-64
#                     upper bits of the highest register go into the lowest.
# The right shift of the 3 other registers is dependent on the left shift
# Shift left >= 64 bits
# Shift left 32 excl. to 64 incl.
# Shift left <= 32 bits
        blt cr6,lab96_4         # if(R11 < 32) jump to lab96_4
        blt cr7,lab96_5         # if(R11 < 64) jump to lab96_5
        b lab96_6               # R11 >= 64
lab96_4:                        # First case: right shift < 32
        srw R10,R5,R11          # R10 = dvd.lsw >> R11
        subfic R12,R11,32       # R12 = 32 - R11
        slw R13,R4,R12          # R13 = dvd.msw2 << (32 - R11)
        or R10,R10,R13          # tmp.lsw = R13 | R10
        srw R9,R4,R11           # R9 = dvd.msw2 >> R11
        slw R13,R3,R12          # R13 = dvd.msw << (32 - R11)
        or R9,R9,R13            # tmp.msw2 = R13 | R9
        srw R8,R3,R11           # tmp.msw = dvd.msw >> R11
        b lab96_7
lab96_5:                        # Second case: 32 <= right shift < 64
        subfic R12,R11,64       # R12 = 64 - R11
        addi R11,R11,-32        # R11 = R11 - 32
        srw R10,R4,R11          # R10 = dvd.msw2 >> (R11 - 32)
        slw R13,R3,R12          # R13 = dvd.msw << (64 - R11)
        or R10,R13,R10          # tmp.lsw = R13 | R10
        srw R9,R3,R11           # tmp.msw2 = dvd.msw >> (R11 - 32)
        li R8,0
        b lab96_7
lab96_6:                        # Third case: right shift >= 64
        addi R12,R11,-64        # R12 = R11 - 64
        srw R10,R3,R12          # R10 = dvd.msw >> (R11 - 64)
        li R8,0
        li R9,0
lab96_7:
# R3:R4:R5 = R3:R4:R5 << R0
# same 3 cases as before
        blt cr4,lab96_8         # if(R0 < 32) jump to lab96_8
        blt cr5,lab96_9         # if(R0 < 64) jump to lab96_9
        b lab96_10              # R0 >= 64
lab96_8:
# Shift left R3:R4:R5 by less than 32 bits
        slw R3,R3,R0            # R3 = dvd.msw << R0
        subfic R11,R0,32        # R11 = 32 - R0
        srw R12,R4,R11          # R12 = dvd.msw2 >> (32 - R0)
        or R3,R3,R12            # dvd.msw = R3 | R12
        slw R4,R4,R0            # dvd.msw2 = dvd.msw2 << R0
        srw R12,R5,R11          # R12 = dvd.lsw >> (32 - R0)
        or R4,R4,R12            # dvd.msw2 = R4 | R12
        slw R5,R5,R0            # dvd.lsw = dvd.lsw << R0
        b lab96_11
lab96_9:
# Shift left R3:R4:R5 by 32 incl. to 64 excl. bits
        subfic R13,R0,64        # R13 = 64 - R0
        addic R11,R0,-32        # Do not use addi!!! addi with RA=R0 reads the
                                # constant 0, not the register => R11 = -32
        slw R3,R4,R11           # R3 = dvd.msw2 << (R0 - 32)
        srw R12,R5,R13          # R12 = dvd.lsw >> (64 - R0)
        or R3,R3,R12            # R3 = R3 | R12
        slw R4,R5,R11           # R4 = R5 << (R0 - 32)
        li R5,0
        b lab96_11
lab96_10:
# Shift left R3:R4:R5 by more than 64 bits
        addic R13,R0,-64        # Do not use addi!!!
# (cont.) addi with RA=R0 would read the constant 0 => R13 = -64
        slw R3,R5,R13           # R3 = dvd.lsw << (R0 - 64)
        li R5,0
        li R4,0
lab96_11:
# TEST ONLY
# Jump over division
# b Test_96
# END
# restoring division shift and subtract loop
        li R11,-1               # R11 = -1
        addic R7,R7,0           # clear carry bit before loop starts
lab96_12:
# tmp:dvd is considered one large register;
# each portion is shifted left 1 bit by adding it to itself;
# adde sums the carry from the previous add and creates a new carry
        adde R5,R5,R5           # shift dvd.lsw left 1 bit
        adde R4,R4,R4           # shift dvd.msw2 left 1 bit
        adde R3,R3,R3           # shift dvd.msw left 1 bit
        adde R10,R10,R10        # shift tmp.lsw left 1 bit
        adde R9,R9,R9           # shift tmp.msw2 left 1 bit
        adde R8,R8,R8           # shift tmp.msw left 1 bit
# TEST ONLY: JUST SHIFT (divide step disabled)
# subfc R0,R7,R10               # tmp.lsw - dvs.lsw
# subfe. R12,R6,R9              # tmp.msw - dvs.msw
# blt cr0,lab96_13              # if(result < 0) clear carry bit
# mr R10,R0                     # move lsw
# mr R9,R12                     # move msw
# addic R0,R11,1                # set carry bit
lab96_13:
        bdnz lab96_12
# write quotient and remainder
# TEST ONLY: JUST SHIFT
# adde R5,R5,R5                 # quo.lsw (lsb = CA)
# adde R4,R4,R4                 # quo.msw2 (lsb = CA)
# adde R3,R3,R3                 # quo.msw (lsb from lsw)
# mr R7,R10                     # rem.lsw
# mr R6,R9                      # rem.msw
# blr                           # return
        b finish_96
lab96_20:                       # Quotient is 0 (dvs > dvd)
        mr R7,R5                # rmd.lsw = dvd.lsw
        mr R6,R4                # rmd.msw = dvd.msw2
        li R5,0                 # quo.lsw = 0
        li R4,0                 # quo.msw2 = 0
        li R3,0                 # quo.msw = 0
# blr                           # return
finish_96:
# Restore non volatile registers
        lwz R13,-4(R1)
        lwz R14,-8(R1)
        lwz R15,-12(R1)
# Option: use a multiple word load (replace R11->R31 ..
# (cont.) .. R15->R27)
# lmw R27,0(R1)
# Write results back into memory
        stw r3,0(r30)
        stw r4,4(r30)
        stw r5,8(r30)
        stw r6,0(r31)
        stw r7,4(r31)
# TEST ONLY
Test_96:
# Restore non volatile registers
        lwz R13,-4(R1)
        lwz R14,-8(R1)
        lwz R15,-12(R1)
# write result in output structures (shifted dividend)
        stw r8,0(r31)
        stw r9,4(r31)
        stw r10,8(r31)
        stw r3,0(r30)
        stw r4,4(r30)
        stw r5,8(r30)
# END
# Restore non volatile registers
# lwz SP,0(SP)
        lwz r31,spaceToSaveAligned+4(SP)
# NOTE(review): CR is restored via two mtcrf masks 2 and 4 - confirm these
# masks select the CR fields (2,3) this routine actually clobbers.
        mtcrf 2,r31
        mtcrf 4,r31
        lwz r30,spaceToSaveAligned-12(SP)
        mtctr r30
        lwz r30,spaceToSaveAligned-4(r1)
        lwz r31,spaceToSaveAligned-8(r1)
        addi SP,SP,spaceToSaveAligned   # Optional as leaf routine
        blr

# For 128 bits dividend
        csect test128[DS]
# This algorithm uses non volatile registers R31, R30 and R29, R28, R27
# (pure algorithm)
# and CR fields 2,3
numGPRs: set 6
spaceToSave: set linkageArea + CalleesParams + CalleesLocalVars + 4*numGPRs + 8*numFPRs
# Stack must be 16 (OSX) or 8 byte aligned
# spaceToSaveAligned set ((linkageArea + CalleesParams + CalleesLocalVars + 4*numGPRs + 8*numFPRs+15) & (-16))
spaceToSaveAligned set ((linkageArea + CalleesParams + CalleesLocalVars + 4*numGPRs + 8*numFPRs+7) & (-8))

# test128: shift test for the 128-bit dividend case (128b / 64b division).
# In: R3 -> 4-word dividend, R4 -> 2-word divisor, R5/R6 -> output buffers.
# In this TEST variant r31 is redirected to the dividend buffer below.
        csect .test128[PR]
# Code section for test128
# Save used non volatile registers
        stw R30,-4(R1)
        stw R31,-8(R1)
        mfcr r30
        stw r30,4(SP)           # Condition register may be saved in linkage area
        stwu SP, -spaceToSaveAligned(SP)        # Optional as leaf routine
# Store addresses of results
        mr r31,r6               # Address of remainder
        mr r30,r5               # Address of quotient
# TEST ONLY
# Keep reference on dividend (instead of quotient)
        mr r31,r3               # Address of dividend!!! Instead of remainder
# END
# Load divisor into R7 & R8
        lwz r7,0(r4)
        lwz r8,4(r4)
# R6 & R4 are now free. Load dividend into R3, R4, R5, R6
        lwz r6,12(r3)
        lwz r5,8(r3)
        lwz r4,4(r3)
        lwz r3,0(r3)
# Algorithm
#----------
# (R3:R4:R5:R6) = (R3:R4:R5:R6) / (R7:R8)   (128b) = (128b / 64b)
#      quo            dvd            dvs
#
# Remainder is returned in R7:R8.
#
# Code comment notation:
#   msw  = most-significant (high-order) word, i.e. bits 0..31
#   msw2 = most-significant (middle-order) word, i.e. bits 32..63
#   lsw1 = least-significant (low-order) word, i.e. bits 64..95
#   lsw  = least-significant (low-order) word, i.e. bits 96..127
#          (or 32..63 for the 64 bits divisor)
#   LZ   = Leading Zeroes
#   SD   = Significant Digits
#
# R3:R4:R5:R6 = dvd (input dividend); quo (output quotient)
# R7:R8       = dvs (input divisor);  rem (output remainder)
#
# R9:R10:R11:R12 = tmp
# Algorithm uses non volatile registers R27 to R31
# Save them
        stw R27,-4(R1)
        stw R28,-8(R1)
        stw R29,-12(R1)
        stw R30,-16(R1)
# NOTE(review): offset -18 is not word aligned and overlaps the R30 slot at
# -16; -20 looks intended. The matching lwz in the epilogue also uses -18, so
# the pair is self-consistent but still clobbers half of the saved R30.
        stw R31,-18(R1)
# count the number of leading 0s in the divisor (R29)
        cmpwi cr0,R7,0          # dvs.msw == 0?
        cntlzw R29,R7           # R29 = dvs.msw.LZ
        cntlzw R12,R8           # R12 = dvs.lsw.LZ
        bne cr0,lab128_1        # if(dvs.msw != 0) dvs.LZ = dvs.msw.LZ
        addi R29,R12,32         # dvs.LZ = dvs.lsw.LZ + 32
lab128_1:
        addi r29,r29,64         # Extend divisor to 4 words
# count the number of leading 0s in the dividend (R0)
        cmpwi cr0,R3,0          # dvd.msw == 0?
        cmpwi cr2,R4,0          # dvd.msw2 == 0?
        cntlzw R0,R3            # R0 = dvd.msw.LZ
        bne cr0,lab128_2        # if(dvd.msw != 0) dvd.LZ = dvd.msw.LZ
# Dividend has 32 <= LZ
        cmpwi cr3,R5,0          # dvd.lsw1 == 0?
# Dividend leading-zero count continues (dvd.msw == 0 on this path)
        cntlzw R12,R4           # R12 = dvd.msw2.LZ
        addi R0,R12,32          # dvd.LZ = dvd.msw2.LZ + 32
        bne cr2,lab128_2        # if(dvd.msw2 != 0) dvd.LZ = 32+dvd.msw2.LZ
# Dividend has 64 <= LZ
        cntlzw R12,R5           # R12 = dvd.lsw1.LZ
        addi R0,R12,64          # dvd.LZ = dvd.lsw1.LZ + 64
        bne cr3,lab128_2        # if(dvd.lsw1 != 0) dvd.LZ = 64+dvd.lsw1.LZ
# Dividend has LZ >= 96
        cntlzw R12,R6           # R12 = dvd.lsw.LZ
        addi R0,R12,96          # dvd.LZ = dvd.lsw.LZ + 96
lab128_2:
# determine shift amounts to minimize the number of iterations
        cmpw cr0,R0,R29         # compare dvd.LZ to dvs.LZ
        subfic R10,R0,128       # R10 = dvd.SD
        bgt cr0,lab128_14       # if(dvs > dvd) quotient = 0
        addi R29,R29,1          # ++dvs.LZ (or --dvs.SD)
        subfic R29,R29,128      # R29 = dvs.SD
        add R0,R0,R29           # (dvd.LZ + dvs.SD) = left shift of dvd for initial dvd
        subf R29,R29,R10        # (dvd.SD - dvs.SD) = right shift of dvd for initial tmp
        mtctr R29               # number of iterations = dvd.SD - dvs.SD
# TEST ONLY
# write Shifts into low quotient
# mr r9,r3
# mr r10,r4
# mr r11,r5
# mr r12,r6
# stw r0,8(r30)
# stw r29,12(r30)
# b Test_128
# END
# R9:R10:R11:R12 = R3:R4:R5:R6 >> r29
# Shifting right 4 registers wxyz by r29
#---------------------------------------
        cmpwi cr7,r29,96        # compare r29 to 96
        cmpwi cr6,r29,64        # compare r29 to 64
        cmpwi cr5,r29,32        # compare r29 to 32
        bge cr7,labSR3
        bge cr6,labSR2
        bge cr5,labSR1
# Shift < 32 bits
        srw r12,r6,r29
        subfic r28,r29,32       # r28 = 32 - r29
        slw r27,r5,r28
        or r12,r27,r12
        srw r11,r5,r29
        slw r27,r4,r28
        or r11,r27,r11
        srw r10,r4,r29
        slw r27,r3,r28
        or r10,r27,r10
        srw r9,r3,r29
        b lab128_7
labSR1:
# 32 <= Shift < 64
        subfic r28,r29,64       # r28 = 64 - r29
        addi r29,r29,-32        # r29 = r29 - 32
        srw r12,r5,r29
        slw r27,r4,r28
        or r12,r27,r12
        srw r11,r4,r29
        slw r27,r3,r28
        or r11,r27,r11
        srw r10,r3,r29
        li r9,0
        b lab128_7
labSR2:
# 64 <= Shift < 96
        subfic r28,r29,96       # r28 = 96 - r29
        addi r29,r29,-64        # r29 = r29 - 64
        srw r12,r4,r29
        slw r27,r3,r28
        or r12,r27,r12
        srw r11,r3,r29
        li R10,0
        li R9,0
        b lab128_7
labSR3:
# 96 <= Shift
        addi r29,r29,-96        # r29 = r29 - 96
        srw r12,r3,r29
        li r11,0
        li r10,0
        li r9,0
lab128_7:
# R3:R4:R5:R6 = R3:R4:R5:R6 << R0
# Shifting left 4 registers wxyz by R0
#---------------------------------------
        cmpwi cr7,R0,96         # compare R0 to 96
        cmpwi cr6,R0,64         # compare R0 to 64
        cmpwi cr5,R0,32         # compare R0 to 32
        bge cr7,labSL3
        bge cr6,labSL2
        bge cr5,labSL1
# Shift < 32 bits
        slw r3,r3,r0
        subfic r28,r0,32        # r28 = 32 - r0
        srw r27,r4,r28
        or r3,r3,r27
        slw r4,r4,r0
        srw r27,r5,r28
        or r4,r4,r27
        slw r5,r5,r0
        srw r27,r6,r28
        or r5,r5,r27
        slw r6,r6,r0
        b lab128_11
labSL1:
# 32 <= Shift < 64
        subfic r28,r0,64        # r28 = 64 - r0
        addic r0,r0,-32         # r0 = r0 - 32; do not use addi!!!
                                # (addi with RA=r0 reads the constant 0 => -32)
        slw r3,r4,r0
        srw r27,r5,r28
        or r3,r3,r27
        slw r4,r5,r0
        srw r27,r6,r28
        or r4,r4,r27
        slw r5,r6,r0
        li r6,0
        b lab128_11
labSL2:
# 64 <= Shift < 96
        subfic r28,r0,96        # r28 = 96 - r0
        addic r0,r0,-64         # r0 = r0 - 64; do not use addi!!!
                                # (addi with RA=r0 reads the constant 0 => -64)
        slw r3,r5,R0
        srw r27,r6,r28
        or r3,r3,r27
        slw r4,r6,R0
        li r5,0
        li r6,0
        b lab128_11
labSL3:
# 96 <= Shift
        addic r0,r0,-96         # r0 = r0 - 96; do not use addi!!!
                                # (addi with RA=r0 reads the constant 0 => -96)
        slw r3,r6,r0
        li R4,0
        li R5,0
        li R6,0
lab128_11:
        addic R7,R7,0           # clear carry bit before loop starts
lab128_12:
# tmp:dvd is considered one large register;
# each portion is shifted left 1 bit by adding it to itself;
# adde sums the carry from the previous add and creates a new carry
        adde R6,R6,R6           # shift dvd.lsw left 1 bit
        adde R5,R5,R5           # shift dvd.lsw1 left 1 bit
        adde R4,R4,R4           # shift dvd.msw2 left 1 bit
        adde R3,R3,R3           # shift dvd.msw left 1 bit
        adde R12,R12,R12        # shift tmp.lsw left 1 bit
        adde R11,R11,R11        # shift tmp.lsw1 left 1 bit
        adde R10,R10,R10        # shift tmp.msw2 left 1 bit
        adde R9,R9,R9           # shift tmp.msw left 1 bit
lab128_13:
        bdnz lab128_12
lab128_14:
finish_128:
# Write results back into memory
# TEST ONLY
Test_128:
# write result in output structures
        stw r9,0(r31)
        stw r10,4(r31)
        stw r11,8(r31)
        stw r12,12(r31)
        stw r3,0(r30)
        stw r4,4(r30)
        stw r5,8(r30)
        stw r6,12(r30)
# END
# Restore non volatile registers
        lwz R27,-4(R1)
        lwz R28,-8(R1)
        lwz R29,-12(R1)
        lwz R30,-16(R1)
# NOTE(review): matches the -18 store in the prologue; both look like they
# should be -20 (word alignment, and -18 overlaps the R30 slot at -16)
        lwz R31,-18(R1)
#
# Restore non volatile registers saved in the prologue
        lwz r31,spaceToSaveAligned+4(SP)
# NOTE(review): test96's epilogue restores CR with mtcrf masks 2 and 4 while
# this routine uses 2 and 3 - the two epilogues disagree; confirm which masks
# actually select CR fields 2,3.
        mtcrf 2,r31
        mtcrf 3,r31
        lwz r30,spaceToSaveAligned-4(r1)
        lwz r31,spaceToSaveAligned-8(r1)
        addi SP,SP,spaceToSaveAligned   # Optional as leaf routine
        blr

        end