From acd1233bf2c7d9129647306c6fefa96fa3ec4d0a Mon Sep 17 00:00:00 2001
From: uz <uz@b7a2c559-68d2-44c3-8de9-860c34a00d81>
Date: Mon, 17 Aug 2009 20:48:28 +0000
Subject: [PATCH] Greatly improved multiplication routine. Optimized the
 generic 16x16 one and added special cases for 8x16 and 8x8. The former is
 directly called by the compiler as tosmula0 and tosumula0 resp.

git-svn-id: svn://svn.cc65.org/cc65/trunk@4036 b7a2c559-68d2-44c3-8de9-860c34a00d81
---
 libsrc/runtime/Makefile |  1 +
 libsrc/runtime/mul.s    | 92 ++++++++++++++++++++++++++---------------
 libsrc/runtime/mul8.s   | 63 ++++++++++++++++++++++++++++
 3 files changed, 123 insertions(+), 33 deletions(-)
 create mode 100644 libsrc/runtime/mul8.s

diff --git a/libsrc/runtime/Makefile b/libsrc/runtime/Makefile
index 14158398e..10a5f0e8d 100644
--- a/libsrc/runtime/Makefile
+++ b/libsrc/runtime/Makefile
@@ -152,6 +152,7 @@ OBJS = 	add.o		\
        	makebool.o	\
        	mod.o  		\
        	mul.o  		\
+        mul8.o          \
 	mulax3.o	\
 	mulax5.o	\
 	mulax6.o	\
diff --git a/libsrc/runtime/mul.s b/libsrc/runtime/mul.s
index 67caf52cf..eaf1fb97b 100644
--- a/libsrc/runtime/mul.s
+++ b/libsrc/runtime/mul.s
@@ -1,43 +1,69 @@
 ;
-; Ullrich von Bassewitz, 07.08.1998
+; Ullrich von Bassewitz, 2009-08-17
 ;
 ; CC65 runtime: multiplication for ints
 ;
 
-       	.export		tosumula0, tosumulax, tosmula0, tosmulax
-	.import		popsreg
-	.importzp	sreg, tmp1, ptr4
+       	.export		tosumulax, tosmulax
+        .import         mul8x16, mul8x16a       ; in mul8.s
+    	.import		popsreg
+    	.importzp	sreg, tmp1, ptr4
+
+
+;---------------------------------------------------------------------------
+; 16x16 multiplication routine
 
-tosmula0:
-tosumula0:
-	ldx	#0
 tosmulax:
 tosumulax:
-mul16:	sta	ptr4
-      	stx	ptr4+1 	       	; Save right operand
-      	jsr	popsreg	       	; Get left operand
-
-; Do ptr4*sreg --> AX (see mult-div.s from "The Fridge").
-
-	lda	#0
-   	sta	tmp1
-	ldx	sreg+1	       	; Get into register for speed
-   	ldy    	#16 	       	; Number of bits
-L0:	lsr	tmp1
-   	ror	a
-   	ror	ptr4+1
-   	ror	ptr4
-   	bcc	L1
-   	clc
-   	adc	sreg
-   	pha
+        sta	ptr4            ; Save low byte of right operand
+        txa                     ; Test high byte of right operand
+        beq     @L3             ; Do 8x16 multiplication if high byte zero
+       	stx	ptr4+1 	       	; Save high byte of right operand
+       	jsr	popsreg	       	; Get left operand
+
+; Do ptr4:ptr4+1 * sreg:sreg+1 --> AX (product is built in tmp1:A:ptr4+1:ptr4)
+
+       	lda	#0              ; Clear byte 2 of the product
+       	ldx	sreg+1	       	; Get high byte of left op into register for speed
+        beq     @L4             ; High byte zero -> we can do 8x16 after swap
+       	sta	tmp1            ; Clear byte 3 of the product
+       	ldy    	#16 	       	; Number of bits
+
+        lsr     ptr4+1
+        ror     ptr4            ; Get first bit of right op into carry
+@L0:    bcc     @L1             ; Bit clear -> no add, shift only
+
+      	clc
+      	adc	sreg            ; Add low byte of left op to byte 2
+      	pha                     ; Save byte 2 while A is used for the high add
        	txa	    	       	; hi byte of left op
-   	adc	tmp1
-   	sta	tmp1
-   	pla
-L1:	dey
-       	bpl    	L0
-	lda	ptr4	       	; Load the result
-	ldx	ptr4+1
-	rts			; Done
+      	adc	tmp1            ; Add byte 3, including carry from the low add
+      	sta	tmp1
+      	pla                     ; Restore byte 2, carry is preserved
+
+@L1:    ror     tmp1            ; Rotate tmp1:A:ptr4+1:ptr4 right by one: carry
+     	ror	a               ; enters at the top, the next bit of the
+     	ror	ptr4+1          ; right op leaves at the bottom
+     	ror	ptr4
+        dey
+        bne     @L0             ; Repeat for all 16 bits
+
+      	lda	ptr4	       	; Load the result
+      	ldx	ptr4+1
+      	rts	    		; Done
+
+; High byte of rhs is zero, jump to the 8x16 routine instead
+
+@L3:    jmp     mul8x16
+
+; If the high byte of lhs is zero, swap the operands and use the 8x16
+; routine. On entry, A and X are zero
+
+@L4:    ldy     sreg            ; Save left operand (8 bit)
+        ldx     ptr4            ; Copy low byte of 16 bit right operand
+        stx     sreg            ; to where mul8x16a reads it
+        ldx     ptr4+1          ; High byte stays in X for mul8x16a
+        sty     ptr4            ; 8 bit left operand becomes the multiplier
+        ldy     #8              ; Number of bits
+        jmp     mul8x16a
 
diff --git a/libsrc/runtime/mul8.s b/libsrc/runtime/mul8.s
new file mode 100644
index 000000000..3287e2155
--- /dev/null
+++ b/libsrc/runtime/mul8.s
@@ -0,0 +1,63 @@
+;
+; Ullrich von Bassewitz, 2009-08-17
+;
+; CC65 runtime: multiplication for ints. Short versions.
+;
+
+       	.export		tosumula0, tosmula0
+        .export         mul8x16, mul8x16a
+    	.import		popsreg
+    	.importzp	sreg, ptr4
+
+
+;---------------------------------------------------------------------------
+; 8x16 routine with external entry points used by the 16x16 routine in mul.s
+
+tosmula0:
+tosumula0:
+        sta     ptr4            ; Right operand (8 bit) is the multiplier
+mul8x16:
+        jsr     popsreg         ; Get left operand, read from sreg below
+        ldy     #8              ; Loop counter: 8 multiplier bits
+        lda     #0              ; Product byte 1 starts at zero
+        ldx     sreg+1          ; High byte of left op is kept in X
+        beq     mul8x8          ; High byte zero: 8x8 is sufficient
+mul8x16a:
+        sta     ptr4+1          ; Product byte 2 starts at zero (A is 0)
+
+        lsr     ptr4            ; Lowest multiplier bit -> carry
+@test:  bcc     @shift          ; Bit clear: nothing to add
+
+        clc
+        adc     sreg            ; Byte 1 += low byte of left op
+        pha                     ; Park byte 1 while A does the high add
+        txa
+        adc     ptr4+1          ; Byte 2 += high byte of left op + carry
+        sta     ptr4+1
+        pla                     ; Byte 1 back in A, carry untouched
+
+@shift: ror     ptr4+1          ; Rotate ptr4+1:A:ptr4 right by one; the
+        ror     a               ; next multiplier bit drops out into carry
+        ror     ptr4
+        dey
+        bne     @test
+        tax                     ; X = high byte of result (byte 1)
+        lda     ptr4            ; A = low byte of result (byte 0)
+        rts
+
+;---------------------------------------------------------------------------
+; 8x8 multiplication routine
+
+mul8x8:                         ; Reached from mul8x16 with A = 0 and Y = 8
+        lsr     ptr4            ; Lowest multiplier bit -> carry
+@test:  bcc     @shift          ; Bit clear: skip the add
+        clc
+        adc     sreg            ; Upper product byte += left operand
+@shift: ror     a               ; Rotate A:ptr4 right, carry enters on top
+        ror     ptr4            ; Next multiplier bit drops into carry
+        dey
+        bne     @test
+        tax                     ; X = high byte of result
+        lda     ptr4            ; A = low byte of result
+        rts
+
-- 
2.39.5