#	Michael Riff
#	Assembler rounding software version 1.0  March 2024.

# Actual library. The rounding routines return directly and integer type result
# instead of a floating point value.
# For each rounding, 2 routines are defined: 1 checnging the settings in register
# FPSCR and using instruction fctiw, the other not touching the resgister and
# using instruction fctiwz.
# At the end a routine reading out t rouding slection from FPSCR, and a routine
# changing FPSCR to round to nearrest and converting an array of values.

				dialect	PowerPC

					file	'floor.s'

# Only the sections can be exported (dos not work with labels)
#				export my_floor1[DS]
				export .my_floor1[PR] => 'floor1_s'
#				export my_floor2[DS]
				export .my_floor2[PR] => 'floor2_s'
#				export my_ceil1[DS]
				export .my_ceil1[PR] => 'ceil1_s'
#				export my_ceil2[DS]
				export .my_ceil2[PR] => 'ceil2_s'
#				export my_round1[DS]
				export .my_round1[PR] => 'round1_s'
#				export my_round2[DS]
				export .my_round2[PR] => 'round2_s'
#				export my_test[DS]
				export .my_test[PR] => 'test_s'
#				export rndsel[DS]
				export .rndsel[PR] => 'roundSetting'
#				export rndarray[DS]
				export .rndarray[PR] => 'roundArray'

Flt_Cst:	equ	1

# TOC and transition vectors only necessary if cross TOC calls are implemented (separate code fragments)	
			toc
start1:			tc		my_floor1[TC], my_floor1[DS]
start2:			tc		my_floor2[TC], my_floor2[DS]
	IFDEF Flt_Cst
Csts:			tc		my_const[TC], my_const[RW]
	ENDIF


	linkageArea:		equ		24
	calleesParams:		set		0
	calleesLocalVars:	set		0
	numFPRs:		set		0


##########################
# FLOOR


# Floor with change of FPSCR lower bits
		csect	my_floor1[DS]
				dc.l	.my_floor1[PR]
				dc.l	TOC[tc0]
				
		csect	.my_floor1[PR]
my_floor1:
# Get 64 bits aligned address and 8 bytes on the stack into r12
	addi	r12, r1, -15
	rlwinm	r12, r12, 0, 0, 28
# Save FPSCR at r12 address 
	mffs	fp13
	stfd	fp13, 0(r12)
# If lower 2 bits already 11 round directly
	lwz	r11, 4(r12)
#	stwx	r11, 8(r12)
#	mr	r10, r11
	rlwinm	r10, r11, 0, 30, 31
	cmpi	cr7,1,r10,3
	bc	12,30, Round1	# beq	Round1
# Set lower 2 bits of FPSCR (round towards -INF)
	ori	r11,r11,3
	stw	r11, 4(r12)
	lfd	fp12, 0(r12)
	mtfsf	255, fp12
	fctiw	fp11, fp1
	stfd	fp11, -8(r12)
	lwz		r3, -4(r12)
# Restore FPSCR
#	ldwx	r10, 8(r12)
#	stwx	r10, 4(r12)
#	lfd	fp13, 0(r12)
	mtfsf	255, fp13
	blr
Round1:
	fctiw	fp11, fp1
	stfd	fp11, -8(r12)
	lwz		r3, -4(r12)
	blr


# Floor without change of FPSCR lower bits
		csect	my_floor2[DS]
				dc.l	.my_floor2[PR]
				dc.l	TOC[tc0]

	IFDEF Flt_Cst
		csect	my_const[RW]	; [RO] does not work!
Zero:			dc.d	"0.000000"
Min:			dc.d	"-1.000000"
Pls:			dc.d	"1.000000"
	ENDIF
	
		csect	.my_floor2[PR]
my_floor2:
# Load 1/-1 and 0 into a Fpr
	IFDEF Flt_Cst
	lwz		r10, my_const[TC](r2)		# r10 points to constant 0.0
	lfd		fp12, Zero(r10)	# fp12 = 0
	ENDIF
# Get 64 bits aligned address and 16 bytes on the stack into r11
	addi	r11, r1, -23
	rlwinm	r11, r11, 0, 0, 28
	IFNDEF Flt_Cst
# Set fp12 to 0 using integer registers and memory
#	addi	r12, r0, 0	# r12 = 0
	xor		r12, r12, r12	# r12 = 0
	stw		r12, 0(r11)
	stw		r12, 4(r11)
	lfd		fp12, 0(r11)	# = 0
	ENDIF
# Perform rounding	
	fctiwz	fp10, fp1
	stfd	fp10, 0(r11)	# r11 points to rounded value
# If rounded argument != argument and argument <0 subtract 1
# Check if argument is negative
	fcmpo	cr6, fp1, fp12
	bc		4,24,Ret1		# bge
# Argument is negative. Check if the fractional part is nonzero
	addi	r12,r11,8		# r12 points to original value
	stfd	fp1, 0(r12)
	lwz		r10, 0(r12)
	rlwinm	r9,r10,12,21,31
	addi	r9, r9, -1023	# = exp

# Handling of exponents <=-1  (-1,0 < float values < 0,0)
# The result is always -1
	cmpi	cr7,0,r9,0
	bc		4,28,PositExp1
	xor		r8, r8, r8
	not		r3, r8
	blr

PositExp1:
	cmpi	cr5,0,r9,20
	bc		4,20,LowPart1	# bge cr5
	subfic	r8,r9, 20
	addi	r7, r0,1
	slw		r7, r7, r8
	addi	r7, r7, -1	# mask for 20-exp lower bits
	and.	r10,r10,r7
	bc		4,2,Correct1	# bne: fractional part nonzero
# Process all 32 bits of the lower part
	lwz		r10, 4(r12)
	cmpi	cr7,0,r10,0
	bc		4,30,Correct1	# bne cr7
	b		Ret1
LowPart1:
	lwz		r10, 4(r12)
# Case were the complete 32 bits must be checked
	bc		12,21,Partly1	# bgt cr5
	cmpi	cr7, r10, 0
	bc		12, 30, Ret1		# beq cr7
	b		Correct1
Partly1:
	subfic	r8,r9, 52	# r8 < 32
	addi	r7, r0, 1
	slw		r7, r7, r8
	addi	r7, r7, -1	# mask for 52-exp lower bits
	and.	r10,r10,r7
	bc		12,2,Ret1	# beq: fractional part zero
# Fractional part is nonzero, subtract 1 from rounded result
Correct1:
	lwz		r3, 4(r11)
	addi	r3, r3, -1
	blr
Ret1:
	lwz		r3, 4(r11)
	blr


##########################
# CEIL


# Ceil with change of FPSCR lower bits
		csect	my_ceil1[DS]
				dc.l	.my_ceil1[PR]
				dc.l	TOC[tc0]
				
		csect	.my_ceil1[PR]
my_ceil1:
# Get 64 bits aligned address and 8 bytes on the stack into r12
	addi	r12, r1, -15
	rlwinm	r12, r12, 0, 0, 28
# Save FPSCR at r12 address 
	mffs	fp13
	stfd	fp13, 0(r12)
# If lower 2 bits already 11 round directly
	lwz	r11, 4(r12)
#	stwx	r11, 8(r12)
#	mr	r10, r11
	rlwinm	r10, r11, 0, 30, 31
	cmpi	cr7,1,r10,2
	bc	12,30, Round2	# beq	Round2
# Set lower 2 bits of FPSCR to 10 (round towards +INF)
	addi	r9,0,3
	andc	r11,r11,r9
	ori		r11,r11,2
	stw	r11, 4(r12)
	lfd	fp12, 0(r12)
	mtfsf	255, fp12
	fctiw	fp11, fp1
	stfd	fp11, -8(r12)
	lwz		r3, -4(r12)
# Restore FPSCR
#	ldwx	r10, 8(r12)
#	stwx	r10, 4(r12)
#	lfd	fp13, 0(r12)
	mtfsf	255, fp13
	blr
Round2:
	fctiw	fp11, fp1
	stfd	fp11, -8(r12)
	lwz		r3, -4(r12)
	blr


# Ceil without change of FPSCR lower bits
		csect	my_ceil2[DS]
				dc.l	.my_ceil2[PR]
				dc.l	TOC[tc0]
	
		csect	.my_ceil2[PR]
my_ceil2:
# Load 1/-1 and 0 into a Fpr
	IFDEF Flt_Cst
	lwz		r10, my_const[TC](r2)		# r10 points to constant 0.0
	lfd		fp12, Zero(r10)	# fp12 = 0
	ENDIF
# Get 64 bits aligned address and 16 bytes on the stack into r11
	addi	r11, r1, -23
	rlwinm	r11, r11, 0, 0, 28
	IFNDEF Flt_Cst
# Set fp12 to 0 using integer registers and memory
#	addi	r12, r0, 0	# r12 = 0
	xor		r12, r12, r12	# r12 = 0
	stw		r12, 0(r11)
	stw		r12, 4(r11)
	lfd		fp12, 0(r11)	# = 0
	ENDIF
# Perform rounding	
	fctiwz	fp10, fp1
	stfd	fp10, 0(r11)	# r11 points to rounded value
# If rounded argument != argument and argument >0 add 1
# Check if argument is positive
	fcmpo	cr6, fp1, fp12
	bc		4,25,Ret2		# ble
# Argument is positive and not zero. Check if the fractional part is nonzero
	addi	r12,r11,8		# r12 points to original value
	stfd	fp1, 0(r12)
	lwz		r10, 0(r12)
	rlwinm	r9,r10,12,21,31
	addi	r9, r9, -1023	# = exp
	
# Handling of exponents <=-1  (0,0 <= float values < 1,0)
# The result is always +1
	cmpi	cr7,0,r9,0
	bc		4,28,PositExp2
	addi	r3, r0,1
	blr

PositExp2:
	cmpi	cr5,0,r9,20
	bc		4,20,LowPart2	# bge cr5
	subfic	r8,r9, 20
	addi	r7, r0,1
	slw		r7, r7, r8
	addi	r7, r7, -1	# mask for 20-exp lower bits
	and.	r10,r10,r7
	bc		4,2,Correct2	# bne: fractional part nonzero
# Process all 32 bits of the lower part
	lwz		r10, 4(r12)
	cmpi	cr7,0,r10,0
	bc		4,30,Correct2	# bne cr7
	b		Ret2
LowPart2:
	lwz		r10, 4(r12)
# Case were the complete 32 bits must be checked
	bc		12,21,Partly2	# bgt cr5
	cmpi	cr7, r10, 0
	bc		12, 30, Ret2		# beq cr7
	b		Correct2
Partly2:
	subfic	r8,r9, 52	# r8 < 32
	addi	r7, r0, 1
	slw		r7, r7, r8
	addi	r7, r7, -1	# mask for 52-exp lower bits
	and.	r10,r10,r7
	bc		12,2,Ret2	# beq: fractional part zero
# Fractional part is nonzero, subtract 1 from rounded result
Correct2:
	lwz		r3, 4(r11)
	addi	r3, r3, 1
	blr
Ret2:
	lwz		r3, 4(r11)
	blr


##########################
# ROUND	TO NEAREST


# Round with change of FPSCR lower bits
		csect	my_round1[DS]
				dc.l	.my_round1[PR]
				dc.l	TOC[tc0]
				
		csect	.my_round1[PR]
my_round1:
# Get 64 bits aligned address and 8 bytes on the stack into r12
	addi	r12, r1, -15
	rlwinm	r12, r12, 0, 0, 28
# Save FPSCR at r12 address 
	mffs	fp13
	stfd	fp13, 0(r12)
# If lower 2 bits already 11 round directly
	lwz	r11, 4(r12)
#	stwx	r11, 8(r12)
#	mr	r10, r11
	rlwinm	r10, r11, 0, 30, 31
	cmpi	cr7,1,r10,0
	bc	12,30, Round3	# beq	Round2
# Set lower 2 bits of FPSCR to 00 (round to nearest)
	addi	r9,0,3
	andc	r11,r11,r9
	stw	r11, 4(r12)
	lfd	fp12, 0(r12)
	mtfsf	255, fp12
	fctiw	fp11, fp1
	stfd	fp11, -8(r12)
	lwz		r3, -4(r12)
# Restore FPSCR
#	ldwx	r10, 8(r12)
#	stwx	r10, 4(r12)
#	lfd	fp13, 0(r12)
	mtfsf	255, fp13
	blr	
Round3:
	fctiw	fp11, fp1
	stfd	fp11, -8(r12)
	lwz		r3, -4(r12)
	blr


# Round without change of FPSCR lower bits
# Note : to make things simpler, 0,5 is rounded towards higher mantissa (away from zero)
# integer value(negative -> down and positive -> up)
		csect	my_round2[DS]
				dc.l	.my_round2[PR]
				dc.l	TOC[tc0]
	
		csect	.my_round2[PR]
my_round:
# Load 1/-1 and 0 into a Fpr
	IFDEF Flt_Cst
	lwz		r10, my_const[TC](r2)		# r10 points to constant 0.0
	lfd		fp12, Zero(r10)	# fp12 = 0
	ENDIF
# Get 64 bits aligned address and 16 bytes on the stack into r11
	addi	r11, r1, -23
	rlwinm	r11, r11, 0, 0, 28
	IFNDEF Flt_Cst
# Set fp12 to 0 using integer registers and memory
#	addi	r12, r0, 0	# r12 = 0
	xor		r12, r12, r12	# r12 = 0
	stw		r12, 0(r11)
	stw		r12, 4(r11)
	lfd		fp12, 0(r11)	# = 0
	ENDIF
# Perform rounding	
	fctiwz	fp10, fp1
	stfd	fp10, 0(r11)	# r11 points to rounded value
# Check if the first bit of the fractional part is 1
	addi	r12,r11,8		# r12 points to original value
	stfd	fp1, 0(r12)
	lwz		r10, 0(r12)
	rlwinm	r9,r10,12,21,31
	addi	r9, r9, -1023	# = exp

# Handling of exponents <=-1  (1,0 < float values < 1,0)
	cmpi	cr7,0,r9,0
	bc		4,28,PositExp3
# If exp = -1 rounded val is +-1 else is 0
	addi	r8,r9, 2		# <= 1
	srawi	r7,r8,31		# if <0 = 0xFFFFFFF other =0
	andc	r6,r8,r7		# is now 0 or 1 (if exp <-1 resp =-1)
# Add sign of argument
	srawi	r9,r10,31		# -1 if arg <0 0 otherwise
	slwi	r10,r9,1
	addi	r10,r10,1
	mullw	r3,r10,r6
	blr
	
PositExp3:
	cmpi	cr5,0,r9,20
	bc		4,20,LowPart3	# bge cr5
	subfic	r8,r9, 19
	addi	r7, r0,1
	slw		r7, r7, r8	# mask for bit 20-exp-1 (start with 0)
	and.	r10,r10,r7
	bc		4,2,Correct3	# bne: fractional first bit nonzero
	b	Ret3
LowPart3:
# Exponent is >=20, first bit of fractional part is in lower 32 bits
	lwz		r10, 4(r12)
	subfic	r8,r9, 51	# r8 < 32
	addi	r7, r0, 1
	slw		r7, r7, r8	# mask for bit 52-exp	(start with 0)
	and.	r10,r10,r7
	bc		12,2,Ret3
Correct3:
# Fractional part>0,5. If argument positive, add 1, if negative subtract 1
	lwz		r3, 4(r11)
# Check if argument is positive (or 0)
	fcmpo	cr6, fp1, fp12
	bc		4,24,Correct4		# bge cr6
# Argument is negative
	addi	r3, r3, -1
	blr
Correct4:
	addi	r3, r3, 1
	blr
Ret3:
	lwz		r3, 4(r11)
	blr	

	
##########################
# TEST ROUTINES


# Routine returning the rounding selection of FPSCR
		csect	rndsel[DS]
				dc.l	.rndsel[PR]
				dc.l	TOC[tc0]
		csect	.rndsel[PR]
rndsel:
# Get 64 bits aligned address and 8 bytes on the stack into r12
	addi	r12, r1, -15
	rlwinm	r12, r12, 0, 0, 28
# Save FPSCR at r12 address 
	mffs	fp13
	stfd	fp13, 0(r12)
# If lower 2 bits already 11 round directly
	lwz	r11, 4(r12)
#	stwx	r11, 8(r12)
#	mr	r10, r11
	rlwinm	r3, r11, 0, 30, 31
	blr


# Routine performing the rounding of a set of floating values
		csect	rndarray[DS]
				dc.l	.rndarray[PR]
				dc.l	TOC[tc0]
		csect	.rndarray[PR]
rndarray:
# Get 64 bits aligned address and 8 bytes on the stack into r12
	addi	r12, r1, -15
	rlwinm	r12, r12, 0, 0, 28
# Save FPSCR at r12 address 
	mffs	fp13
	stfd	fp13, 0(r12)
# Set lower 2 bits of FPSCR to 00 (round to nearest)
	lwz	r11, 4(r12)
	addi	r9,0,3
	andc	r11,r11,r9
	stw	r11, 4(r12)
	lfd	fp12, 0(r12)
	mtfsf	255, fp12
# Convert array
	mtctr	r3
	xor	r9,r9,r9
	xor	r10,r10,r10
loop:
	lfdx	fp10,r9,r4
	fctiw	fp9,fp10
	stfd	fp9,-8(r12)
	lwz	r8,-4(r12)
	stwx	r8,r10,r5
	addi	r9,r9,8
	addi	r10,r10,4
	bc	17,0,loop		# Dec ctr, branch if /=0
# Restore FPSCR
#	ldwx	r10, 8(r12)
#	stwx	r10, 4(r12)
#	lfd	fp13, 0(r12)
	mtfsf	255, fp13
	blr

	End