# shifttest.s
# Test routines for macros that shift a 128 bit value held in four
# 32 bit registers.
        dialect PowerPC
        file    'shifttest.s'

# Only the sections can be exported (does not work with labels)
#       export  shift_l1[DS]
        export  .shift_l1[PR] => 'shift_l1'
#       export  shift_l2[DS]
        export  .shift_l2[PR] => 'shift_l2'
#       export  shift_r1[DS]
        export  .shift_r1[PR] => 'shift_r1'
#       export  shift_r2[DS]
        export  .shift_r2[PR] => 'shift_r2'

# TOC and transition vectors only necessary if cross TOC calls are
# implemented (separate code fragments)
        toc
shift_l1:       tc      shift_l1[TC],shift_l1[DS]
shift_l2:       tc      shift_l2[TC],shift_l2[DS]
shift_r1:       tc      shift_r1[TC], shift_r1[DS]
shift_r2:       tc      shift_r2[TC], shift_r2[DS]

# Stack frame bookkeeping symbols (sizes in bytes).
linkageArea:            equ     24
calleesParams:          set     0
calleesLocalVars:       set     0
numFPRs:                set     0

#
# 4 register shifting macro implementation
# ========================================
#
# ShiftRight Rw,Rx,Ry,Rz,Rnum
#---------------------------------------
# Logical right shift, in place, of the 128 bit value Rw:Rx:Ry:Rz
# (Rw is the most significant word, Rz the least significant one).
#   Rnum        shift count, range [0;128[ ; it is reduced by 32, 64 or 96
#               whenever the count is 32 or more, so it is NOT preserved
#   R10, R11    scratch registers, overwritten
#   cr5-cr7     overwritten by the three range compares
# The count is classified once, then one of four straight line sequences
# moves whole words and merges the bits that cross a word boundary.
# NOTE(review): the branch labels are plain names, so expanding the macro
# twice in one file presumably produces duplicate labels - confirm.
        MACRO
        ShiftRight &Rw,&Rx,&Ry,&Rz,&Rnum
        cmpwi   cr7,&Rnum,96            # cr7 : Rnum against 96
        cmpwi   cr6,&Rnum,64            # cr6 : Rnum against 64
        cmpwi   cr5,&Rnum,32            # cr5 : Rnum against 32
        bge     cr7,labSR3
        bge     cr6,labSR2
        bge     cr5,labSR1
        .; Shift < 32 bits
        srw     &Rz,&Rz,&Rnum
        subfic  R10,&Rnum,32            # R10 = 32 - &Rnum (carry-in width)
        slw     R11,&Ry,R10
        or      &Rz,R11,&Rz
        srw     &Ry,&Ry,&Rnum
        slw     R11,&Rx,R10
        or      &Ry,R11,&Ry
        srw     &Rx,&Rx,&Rnum
        slw     R11,&Rw,R10
        or      &Rx,R11,&Rx
        srw     &Rw,&Rw,&Rnum
        b       labSR_end
labSR1:
        .; 32 <= Shift < 64
        subfic  R10,&Rnum,64            # R10 = 64 - &Rnum
        addic   &Rnum,&Rnum,-32         # &Rnum = &Rnum - 32
        srw     &Rz,&Ry,&Rnum
        slw     R11,&Rx,R10
        or      &Rz,R11,&Rz
        srw     &Ry,&Rx,&Rnum
        slw     R11,&Rw,R10
        or      &Ry,R11,&Ry
        srw     &Rx,&Rw,&Rnum
        li      &Rw,0
        b       labSR_end
labSR2:
        .; 64 <= Shift < 96
        subfic  R10,&Rnum,96            # R10 = 96 - &Rnum
        addic   &Rnum,&Rnum,-64         # &Rnum = &Rnum - 64
        srw     &Rz,&Rx,&Rnum
        slw     R11,&Rw,R10
        or      &Rz,R11,&Rz
        srw     &Ry,&Rw,&Rnum
        li      &Rx,0
        li      &Rw,0
        b       labSR_end
labSR3:
        .; 96 <= Shift
        addic   &Rnum,&Rnum,-96         # &Rnum = &Rnum - 96
        li      &Ry,0
        li      &Rx,0
        srw     &Rz,&Rw,&Rnum
        li      &Rw,0
labSR_end:
        ENDM
#
# Shifting left 4 registers wxyz
# Source and target registers are the same
#---------------------------------------
# ShiftLeft Rw,Rx,Ry,Rz,Rnum
# Logical left shift, in place, of the 128 bit value Rw:Rx:Ry:Rz
# (Rw is the most significant word, Rz the least significant one).
#   Rnum        shift count; it is reduced by 32, 64 or 96 whenever the
#               count is 32 or more, so it is NOT preserved
#   R10, R11    scratch registers, overwritten
#   cr5-cr7     overwritten by the three range compares
# Mirror image of the right shifting macro: words move towards Rw and the
# vacated low words are cleared.
# NOTE(review): the branch labels are plain names, so expanding the macro
# twice in one file presumably produces duplicate labels - confirm.
        MACRO
        ShiftLeft &Rw,&Rx,&Ry,&Rz,&Rnum
        cmpwi   cr7,&Rnum,96            # cr7 : Rnum against 96
        cmpwi   cr6,&Rnum,64            # cr6 : Rnum against 64
        cmpwi   cr5,&Rnum,32            # cr5 : Rnum against 32
        bge     cr7,labSL3
        bge     cr6,labSL2
        bge     cr5,labSL1
        .; Shift < 32 bits
        slw     &Rw,&Rw,&Rnum
        subfic  R10,&Rnum,32            # R10 = 32 - &Rnum (carry-in width)
        srw     R11,&Rx,R10
        or      &Rw,&Rw,R11
        slw     &Rx,&Rx,&Rnum
        srw     R11,&Ry,R10
        or      &Rx,&Rx,R11
        slw     &Ry,&Ry,&Rnum
        srw     R11,&Rz,R10
        or      &Ry,&Ry,R11
        slw     &Rz,&Rz,&Rnum
        b       labSL_end
labSL1:
        .; 32 <= Shift < 64
        subfic  R10,&Rnum,64            # R10 = 64 - &Rnum
        addic   &Rnum,&Rnum,-32         # &Rnum = &Rnum - 32
        slw     &Rw,&Rx,&Rnum
        srw     R11,&Ry,R10
        or      &Rw,&Rw,R11
        slw     &Rx,&Ry,&Rnum
        srw     R11,&Rz,R10
        or      &Rx,&Rx,R11
        slw     &Ry,&Rz,&Rnum
        li      &Rz,0
        b       labSL_end
labSL2:
        .; 64 <= Shift < 96
        subfic  R10,&Rnum,96            # R10 = 96 - &Rnum
        addic   &Rnum,&Rnum,-64         # &Rnum = &Rnum - 64
        slw     &Rw,&Ry,&Rnum
        srw     R11,&Rz,R10
        or      &Rw,&Rw,R11
        slw     &Rx,&Rz,&Rnum
        li      &Ry,0
        li      &Rz,0
        b       labSL_end
labSL3:
        .; 96 <= Shift
        addic   &Rnum,&Rnum,-96         # &Rnum = &Rnum - 96
        li      &Rx,0
        li      &Ry,0
        slw     &Rw,&Rz,&Rnum
        li      &Rz,0
labSL_end:
        ENDM

# Shifting right 4 registers abcd
# Source and target registers are different
#-----------------------------------------
# Arguments ra,rb,rc,rd,ri,rj,rk,rl,r?,rs,rsh. Additionally to ri-rl modifies r?, rs and rsh!
# we may have ra,rb,rc,rd = ri,rj,rk,rl if switch ORDER_OPTIMISATION is not defined!
# Defining this switch allows reordering instructions
# 4 registers have to be shifted right by Rsh bits
# We have 4 cases right shift <32, >=32 & <64, >=64 & <96 and >=96
#
# SHIFT_RIGHT ra,rb,rc,rd,ri,rj,rk,rl,rtp,rs,rsh
#   ra..rd      source words, ra most significant
#   ri..rl      target words, ri most significant
#   rtp         scratch for the bits crossing a word boundary
#   rs          scratch holding the second shift amount
#   rsh         shift count; overwritten for counts from 32 up to 95
#   cr1,cr5,cr6 overwritten by the three range compares
# With ORDER_OPTIMISATION the targets are cleared before all sources have
# been read, hence sources and targets must be distinct in that variant.
# The other variant writes each target only after the source word sharing
# its position has been consumed.
        MACRO
        SHIFT_RIGHT &ra,&rb,&rc,&rd,&ri,&rj,&rk,&rl,&rtp,&rs,&rsh
        cmpwi   cr1,&rsh,32
        cmpwi   cr5,&rsh,64
        cmpwi   cr6,&rsh,96
        blt     cr1, lab_SR2_3
        blt     cr5, lab_SR2_2
        blt     cr6, lab_SR2_1
        IFDEF   ORDER_OPTIMISATION ; Here ra,rb,rc,rd = ri,rj,rk,rl is not allowed
        .; RiRjRkRl = RaRbRcRd >> Rsh (Shift >=96)
        addic   &rs,&rsh,-96
        xor     &ri,&ri,&ri
        xor     &rj,&rj,&rj
        xor     &rk,&rk,&rk
        srw     &rl,&ra,&rs
        b       labSR2_end
lab_SR2_1:
        .; RiRjRkRl = RaRbRcRd >> Rsh (Shift >=64)
        addic   &rs,&rsh,-64
        xor     &ri,&ri,&ri
        subfic  &rsh,&rsh,96
        xor     &rj,&rj,&rj
        srw     &rl,&rb,&rs
        slw     &rtp,&ra,&rsh
        or      &rl,&rtp,&rl
        srw     &rk,&ra,&rs
        b       labSR2_end
lab_SR2_2:
        .; RiRjRkRl = RaRbRcRd >> Rsh (Shift >=32)
        addic   &rs,&rsh,-32
        xor     &ri,&ri,&ri
        subfic  &rsh,&rsh,64
        srw     &rl,&rc,&rs
        slw     &rtp,&rb,&rsh
        or      &rl,&rtp,&rl
        srw     &rk,&rb,&rs
        slw     &rtp,&ra,&rsh
        or      &rk, &rtp,&rk
        srw     &rj,&ra,&rs
        b       labSR2_end
        ELSE
        .; RiRjRkRl = RaRbRcRd >> Rsh (Shift >=96)
        addic   &rs,&rsh,-96
        xor     &rj,&rj,&rj
        xor     &rk,&rk,&rk
        srw     &rl,&ra,&rs
        xor     &ri,&ri,&ri
        b       labSR2_end
lab_SR2_1:
        .; RiRjRkRl = RaRbRcRd >> Rsh (Shift >=64)
        addic   &rs,&rsh,-64
        subfic  &rsh,&rsh,96
        srw     &rl,&rb,&rs
        xor     &rj,&rj,&rj
        slw     &rtp,&ra,&rsh
        or      &rl,&rtp,&rl
        srw     &rk,&ra,&rs
        xor     &ri,&ri,&ri
        b       labSR2_end
lab_SR2_2:
        .; RiRjRkRl = RaRbRcRd >> Rsh (Shift >=32)
        addic   &rs,&rsh,-32
        subfic  &rsh,&rsh,64
        srw     &rl,&rc,&rs
        slw     &rtp,&rb,&rsh
        or      &rl,&rtp,&rl
        srw     &rk,&rb,&rs
        slw     &rtp,&ra,&rsh
        or      &rk, &rtp,&rk
        srw     &rj,&ra,&rs
        xor     &ri,&ri,&ri
        b       labSR2_end
        ENDIF
lab_SR2_3:
        .; RiRjRkRl = RaRbRcRd >> Rsh (Shift <32)
        subfic  &rs,&rsh,32
        srw     &rl,&rd,&rsh
        slw     &rtp,&rc,&rs
        or      &rl,&rtp,&rl
        srw     &rk,&rc,&rsh
        slw     &rtp,&rb,&rs
        or      &rk,&rtp,&rk
        srw     &rj,&rb,&rsh
        slw     &rtp,&ra,&rs
        or      &rj,&rtp,&rj
        srw     &ri,&ra,&rsh
labSR2_end:
        ENDM

# Arguments ra,rb,rc,rd,rw,rx,ry,rz,r?,rs,rsh. Additionally to rw-rz modifies r?, rs and rsh!
# we may have ra,rb,rc,rd = rw,rx,ry,rz if switch ORDER_OPTIMISATION is not defined!
# Defining this switch allows reordering instructions
# 4 registers have to be shifted left by Rsh bits
# We have 4 cases left shift <32, >=32 & <64, >=64 & <96 and >=96
#
# SHIFT_LEFT ra,rb,rc,rd,rw,rx,ry,rz,rtp,rs,rsh
#   ra..rd      source words, ra most significant
#   rw..rz      target words, rw most significant
#   rtp         scratch for the bits crossing a word boundary
#   rs          scratch holding the second shift amount
#   rsh         shift count; overwritten for counts from 32 up to 95
#   cr1,cr5,cr6 overwritten by the three range compares
# The two arms of the ORDER_OPTIMISATION conditional currently hold the same
# instruction sequence, so the switch does not change the generated code.
        MACRO
        SHIFT_LEFT &ra,&rb,&rc,&rd,&rw,&rx,&ry,&rz,&rtp,&rs,&rsh
        cmpwi   cr6,&rsh,96
        cmpwi   cr5,&rsh,64
        cmpwi   cr1,&rsh,32
        bge     cr6, lab_SL2_3
        bge     cr5, lab_SL2_2
        bge     cr1, lab_SL2_1
        .; RwRxRyRz = RaRbRcRd << (Rsh) (Shift <32)
        subfic  &rs,&rsh,32
        slw     &rw,&ra,&rsh
        srw     &rtp,&rb,&rs
        or      &rw,&rtp,&rw
        slw     &rx,&rb,&rsh
        srw     &rtp,&rc,&rs
        or      &rx,&rtp,&rx
        slw     &ry,&rc,&rsh
        srw     &rtp,&rd,&rs
        or      &ry,&rtp,&ry
        slw     &rz,&rd,&rsh
        b       labSL2_end
        IFDEF   ORDER_OPTIMISATION ; Arm meant for distinct source and target registers
lab_SL2_1:
        .; RwRxRyRz = RaRbRcRd << (Rsh) (32<= Shift <64)
        addic   &rsh,&rsh,-32
        subfic  &rs,&rsh,32
        slw     &rw,&rb,&rsh
        srw     &rtp,&rc,&rs
        or      &rw,&rtp,&rw
        slw     &rx,&rc,&rsh
        srw     &rtp,&rd,&rs
        or      &rx,&rtp,&rx
        slw     &ry,&rd,&rsh
        xor     &rz,&rz,&rz
        b       labSL2_end
lab_SL2_2:
        .; RwRxRyRz = RaRbRcRd << (Rsh) (64<= Shift <96)
        addic   &rsh,&rsh,-64
        subfic  &rs,&rsh,32
        slw     &rw,&rc,&rsh
        srw     &rtp,&rd,&rs
        or      &rw,&rtp,&rw
        slw     &rx,&rd,&rsh
        xor     &ry,&ry,&ry
        xor     &rz,&rz,&rz
        b       labSL2_end
        ELSE
lab_SL2_1:
        .; RwRxRyRz = RaRbRcRd << (Rsh) (32<= Shift <64)
        addic   &rsh,&rsh,-32
        subfic  &rs,&rsh,32
        slw     &rw,&rb,&rsh
        srw     &rtp,&rc,&rs
        or      &rw,&rtp,&rw
        slw     &rx,&rc,&rsh
        srw     &rtp,&rd,&rs
        or      &rx,&rtp,&rx
        slw     &ry,&rd,&rsh
        xor     &rz,&rz,&rz
        b       labSL2_end
lab_SL2_2:
        .; RwRxRyRz = RaRbRcRd << (Rsh) (64<= Shift <96)
        addic   &rsh,&rsh,-64
        subfic  &rs,&rsh,32
        slw     &rw,&rc,&rsh
        srw     &rtp,&rd,&rs
        or      &rw,&rtp,&rw
        slw     &rx,&rd,&rsh
        xor     &ry,&ry,&ry
        xor     &rz,&rz,&rz
        b       labSL2_end
        ENDIF
lab_SL2_3:
        .; RwRxRyRz = RaRbRcRd << (Rsh) (Shift >=96)
        addic   &rs,&rsh,-96
        xor     &rx,&rx,&rx
        xor     &ry,&ry,&ry
        slw     &rw,&rd,&rs
        xor     &rz,&rz,&rz
labSL2_end:
        ENDM

# shift_r1 : test of the in-place right shifting macro
# (results are written back into the source registers)
        csect   shift_r1[DS]            # Transition vector for shift_r1
        dc.l    .shift_r1[PR]           # Address of shift_r1
        dc.l    TOC[tc0]                # TOC address of shift_r1
#
# Frame size symbols for shift_r1. The code section below does not reference
# them: it allocates no stack frame and touches only r0 and r3-r12.
# NOTE(review): the expressions spell CalleesParams / CalleesLocalVars with a
# capital C while the definitions use a lower case c - confirm the assembler
# treats both spellings as the same symbol.
numGPRs:        set     4
spaceToSave:    set     linkageArea + CalleesParams + CalleesLocalVars + 4*numGPRs + 8*numFPRs
# Stack must be 16 (OSX) or 8 byte aligned
# spaceToSaveAligned set ((linkageArea + CalleesParams + CalleesLocalVars + 4*numGPRs + 8*numFPRs+15) & (-16))
spaceToSaveAligned set ((linkageArea + CalleesParams + CalleesLocalVars + 4*numGPRs + 8*numFPRs+7) & (-8))

        csect   .shift_r1[PR]           # Code section for shift_r1
# shift_r1
#   r3  address of the quad word (4 words, most significant first) to shift;
#       the shifted value is written back to the same place
#   r4  address of the divisor (3 words, most significant first)
#   r5  address receiving the shift count
# Shift count = (leading zeros of the divisor + 32) - leading zeros of the
# quad word. When the quad word has more leading zeros than that, the routine
# returns without storing anything.
        stw     r5,-4(r1)               # park the result address below the stack pointer
# Fetch the quad word (dividend) : r5 = most significant word ... r8 = least
        lwz     r8,12(r3)
        lwz     r7,8(r3)
        lwz     r6,4(r3)
        lwz     r5,0(r3)
# Fetch the divisor : r10 = most significant word ... r12 = least
        lwz     r12,8(r4)
        lwz     r11,4(r4)
        lwz     r10,0(r4)
# Leading zero count of the dividend -> r0.
# Each step adds 32 for a word that turned out to be zero and moves on.
        cmpi    cr7,r5,0
        cntlzw  r0,r5
        cntlzw  r9,r6
        bne     cr7,lab_sr_1_1
        addi    r0,r9,32
        cmpi    cr7,r6,0
        bne     cr7,lab_sr_1_1
        cntlzw  r9,r7
        addi    r0,r9,64
        cmpi    cr7,r7,0
        bne     cr7,lab_sr_1_1
        cntlzw  r9,r8
        addic   r0,r9,96
lab_sr_1_1:
# Leading zero count of the divisor -> r9 (r4 is reused as scratch)
        cmpi    cr6,r10,0
        cntlzw  r9,r10
        cntlzw  r4,r11
        bne     cr6,lab_sr_1_2
        addi    r9,r4,32
        cmpi    cr6,r11,0
        bne     cr6,lab_sr_1_2
        cntlzw  r4,r12
        addic   r9,r4,64
lab_sr_1_2:
# Number of bits the dividend has to be shifted
        addi    r9,r9,32                # Extend to 128 bits
        cmpw    cr7,r0,r9
#       addic   r9,r9,1
        subf    r0,r0,r9                # r0 = r9 - r0
        bgt     cr7, lab_sr1_end        # dividend zeros exceed divisor zeros : nothing to do
# Report the shift count to the caller
        lwz     r12,-4(r1)
        stw     r0,0(r12)
# Shift under test
#       ShiftRight Rw,Rx,Ry,Rz,Rnum
        ShiftRight R5,R6,R7,R8,R0
# Store the shifted quad word over the input
        stw     r5,0(r3)
        stw     r6,4(r3)
        stw     r7,8(r3)
        stw     r8,12(r3)
        blr
lab_sr1_end:
        blr

# shift_l1 : test of the in-place left shifting macro
# (results are written back into the source registers)
        csect   shift_l1[DS]            # Transition vector for shift_l1
        dc.l    .shift_l1[PR]           # Address of shift_l1
        dc.l    TOC[tc0]                # TOC address of shift_l1
# Frame size symbols for shift_l1 (same expressions as for shift_r1)
numGPRs:        set     4
spaceToSave:    set     linkageArea + CalleesParams + CalleesLocalVars + 4*numGPRs + 8*numFPRs
# Stack must be 16 (OSX) or 8
# byte aligned
# spaceToSaveAligned set ((linkageArea + CalleesParams + CalleesLocalVars + 4*numGPRs + 8*numFPRs+15) & (-16))
spaceToSaveAligned set ((linkageArea + CalleesParams + CalleesLocalVars + 4*numGPRs + 8*numFPRs+7) & (-8))

        csect   .shift_l1[PR]           # Code section for shift_l1
# shift_l1
#   r3  address of the quad word (4 words, most significant first) to shift;
#       the shifted value is written back to the same place
#   r4  shift count
# Fetch the quad word : r5 = most significant word ... r8 = least
        lwz     r8,12(r3)
        lwz     r7,8(r3)
        lwz     r6,4(r3)
        lwz     r5,0(r3)
# Shift count goes into r0, the register handed to the macro
        mr      r0,r4
# Shift under test
#       ShiftLeft Rw,Rx,Ry,Rz,Rnum
        ShiftLeft R5,R6,R7,R8,R0
# Store the shifted quad word over the input
        stw     r5,0(r3)
        stw     r6,4(r3)
        stw     r7,8(r3)
        stw     r8,12(r3)
        blr
lab_sl1_end:
#       xor     r11,r11,r11
#       stw     r11,0(r3)
        blr

# shift_r2 : test of the right shifting macro with separate source and
# target operands, invoked here with the targets equal to the sources
        csect   shift_r2[DS]            # Transition vector for shift_r2
        dc.l    .shift_r2[PR]           # Address of shift_r2
        dc.l    TOC[tc0]                # TOC address of shift_r2
# Frame size symbols for shift_r2. The code section below does not reference
# them: it allocates no stack frame and touches only r0 and r3-r12.
numGPRs:        set     4
spaceToSave:    set     linkageArea + CalleesParams + CalleesLocalVars + 4*numGPRs + 8*numFPRs
# Stack must be 16 (OSX) or 8 byte aligned
# spaceToSaveAligned set ((linkageArea + CalleesParams + CalleesLocalVars + 4*numGPRs + 8*numFPRs+15) & (-16))
spaceToSaveAligned set ((linkageArea + CalleesParams + CalleesLocalVars + 4*numGPRs + 8*numFPRs+7) & (-8))

        csect   .shift_r2[PR]           # Code section for shift_r2
# shift_r2
#   r3  address of the quad word (4 words, most significant first) to shift;
#       the shifted value is written back to the same place
#   r4  address of the divisor (3 words, most significant first)
#   r5  address receiving the shift count
# Shift count = (leading zeros of the divisor + 32) - leading zeros of the
# quad word. When the quad word has more leading zeros than that, the routine
# returns without storing anything.
        stw     r5,-4(r1)               # park the result address below the stack pointer
# Fetch the quad word (dividend) : r5 = most significant word ... r8 = least
        lwz     r8,12(r3)
        lwz     r7,8(r3)
        lwz     r6,4(r3)
        lwz     r5,0(r3)
# Fetch the divisor : r10 = most significant word ... r12 = least
        lwz     r12,8(r4)
        lwz     r11,4(r4)
        lwz     r10,0(r4)
# Leading zero count of the dividend -> r0.
# Each step adds 32 for a word that turned out to be zero and moves on.
        cmpi    cr7,r5,0
        cntlzw  r0,r5
        cntlzw  r9,r6
        bne     cr7,lab_sr_2_1
        addi    r0,r9,32
        cmpi    cr7,r6,0
        bne     cr7,lab_sr_2_1
        cntlzw  r9,r7
        addi    r0,r9,64
        cmpi    cr7,r7,0
        bne     cr7,lab_sr_2_1
        cntlzw  r9,r8
        addic   r0,r9,96
lab_sr_2_1:
# Leading zero count of the divisor -> r9 (r4 is reused as scratch)
        cmpi    cr6,r10,0
        cntlzw  r9,r10
        cntlzw  r4,r11
        bne     cr6,lab_sr_2_2
        addi    r9,r4,32
        cmpi    cr6,r11,0
        bne     cr6,lab_sr_2_2
        cntlzw  r4,r12
        addic   r9,r4,64
lab_sr_2_2:
# Number of bits the dividend has to be shifted
        addi    r9,r9,32                # Extend to 128 bits
        cmpw    cr7,r0,r9
#       addic   r9,r9,1
        subf    r0,r0,r9                # r0 = r9 - r0
        bgt     cr7, lab_sr2_end        # dividend zeros exceed divisor zeros : nothing to do
# Report the shift count to the caller
        lwz     r12,-4(r1)
        stw     r0,0(r12)
# Shift under test (r11 and r12 serve as the macro scratch registers)
#       SHIFT_RIGHT ra,rb,rc,rd,ri,rj,rk,rl,r?,rs,rsh
        SHIFT_RIGHT r5,r6,r7,r8,r5,r6,r7,r8,r11,r12,r0
#       SHIFT_RIGHT r5,r6,r7,r8,ri,rj,rk,rl,r11,r12,r0
# Store the shifted quad word over the input
        stw     r5,0(r3)
        stw     r6,4(r3)
        stw     r7,8(r3)
        stw     r8,12(r3)
        blr
lab_sr2_end:
        blr

# shift_l2 : test of the left shifting macro with separate source and
# target operands, invoked here with the targets equal to the sources
        csect   shift_l2[DS]            # Transition vector for shift_l2
        dc.l    .shift_l2[PR]           # Address of shift_l2
        dc.l    TOC[tc0]                # TOC address of shift_l2
# Frame size symbols for shift_l2. The code section below does not reference
# them: it allocates no stack frame.
numGPRs:        set     4
spaceToSave:    set     linkageArea + CalleesParams + CalleesLocalVars + 4*numGPRs + 8*numFPRs
# Stack must be 16 (OSX) or 8 byte aligned
# spaceToSaveAligned set ((linkageArea + CalleesParams + CalleesLocalVars + 4*numGPRs + 8*numFPRs+15) & (-16))
spaceToSaveAligned set ((linkageArea + CalleesParams + CalleesLocalVars + 4*numGPRs + 8*numFPRs+7) & (-8))

        csect   .shift_l2[PR]           # Code section for shift_l2
# shift_l2
#   r3  address of the quad word (4 words, most significant first) to shift;
#       the shifted value is written back to the same place
#   r4  shift count
# Fetch the quad word : r5 = most significant word ... r8 = least
        lwz     r8,12(r3)
        lwz     r7,8(r3)
        lwz     r6,4(r3)
        lwz     r5,0(r3)
# Shift count goes into r0, the register handed to the macro
        mr      r0,r4
# Shift under test (r11 and r12 serve as the macro scratch registers)
#       SHIFT_LEFT ra,rb,rc,rd,ri,rj,rk,rl,r?,rs,rsh
        SHIFT_LEFT r5,r6,r7,r8,r5,r6,r7,r8,r11,r12,r0
#       SHIFT_LEFT r5,r6,r7,r8,ri,rj,rk,rl,r11,r12,r0
# Store the shifted quad word over the input
        stw     r5,0(r3)
        stw     r6,4(r3)
        stw     r7,8(r3)
        stw     r8,12(r3)
        blr
lab_sl2_end:
        blr
        END