git.sur5r.net Git - cc65/commitdiff
Greatly improved multiplication routine. Optimized the generic 16x16 one and
author uz <uz@b7a2c559-68d2-44c3-8de9-860c34a00d81>
Mon, 17 Aug 2009 20:48:28 +0000 (20:48 +0000)
committer uz <uz@b7a2c559-68d2-44c3-8de9-860c34a00d81>
Mon, 17 Aug 2009 20:48:28 +0000 (20:48 +0000)
added special cases for 8x16 and 8x8. The former is directly called by the
compiler as tosmula0 and tosumula0 resp.

git-svn-id: svn://svn.cc65.org/cc65/trunk@4036 b7a2c559-68d2-44c3-8de9-860c34a00d81

libsrc/runtime/Makefile
libsrc/runtime/mul.s
libsrc/runtime/mul8.s [new file with mode: 0644]

index 14158398e36ef063595a6cfdf03f02e59f306a8f..10a5f0e8daf1a849575e48fc51fb97aff7b5da41 100644 (file)
@@ -152,6 +152,7 @@ OBJS =      add.o           \
                makebool.o      \
                mod.o           \
                mul.o           \
+        mul8.o          \
        mulax3.o        \
        mulax5.o        \
        mulax6.o        \
index 67caf52cf5a5419cfbc06ed333d1341ff324151a..eaf1fb97b86144a0da3b869f5c168dd896a45a09 100644 (file)
@@ -1,43 +1,69 @@
 ;
-; Ullrich von Bassewitz, 07.08.1998
+; Ullrich von Bassewitz, 2009-08-17
 ;
 ; CC65 runtime: multiplication for ints
 ;
 
-               .export         tosumula0, tosumulax, tosmula0, tosmulax
-       .import         popsreg
-       .importzp       sreg, tmp1, ptr4
+               .export         tosumulax, tosmulax
+        .import         mul8x16, mul8x16a       ; in mul8.s
+       .import         popsreg
+       .importzp       sreg, tmp1, ptr4
+
+
+;---------------------------------------------------------------------------
+; 16x16 multiplication routine. Right operand in A/X, left operand fetched by popsreg (read from sreg); 16 bit product returned in A/X
 
-tosmula0:
-tosumula0:
-       ldx     #0
 tosmulax:
 tosumulax:
-mul16: sta     ptr4
-       stx     ptr4+1          ; Save right operand
-       jsr     popsreg         ; Get left operand
-
-; Do ptr4*sreg --> AX (see mult-div.s from "The Fridge").
-
-       lda     #0
-       sta     tmp1
-       ldx     sreg+1          ; Get into register for speed
-       ldy     #16             ; Number of bits
-L0:    lsr     tmp1
-       ror     a
-       ror     ptr4+1
-       ror     ptr4
-       bcc     L1
-       clc
-       adc     sreg
-       pha
+        sta    ptr4            ; Save low byte of right operand
+        txa                     ; Test high byte of right operand (sets Z, leaves A = X)
+        beq     @L3             ; Do 8x16 multiplication if high byte zero
+               stx     ptr4+1          ; Save right operand
+               jsr     popsreg         ; Get left operand
+
+; Do ptr4:ptr4+1 * sreg:sreg+1 --> AX
+
+               lda     #0              ; A = byte 2 of the intermediate result
+               ldx     sreg+1          ; Get high byte into register for speed
+        beq     @L4             ; -> we can do 8x16 after swap
+               sta     tmp1            ; tmp1 = byte 3 of the intermediate result
+               ldy     #16             ; Number of bits
+
+        lsr     ptr4+1
+        ror     ptr4            ; Get first bit into carry
+@L0:    bcc     @L1             ; Multiplier bit clear: skip the add
+
+       clc
+       adc     sreg            ; Add low byte of left op to byte 2
+       pha                     ; Park byte 2, A is needed for the high byte add
                 txa                     ; hi byte of left op
-       adc     tmp1
-       sta     tmp1
-       pla
-L1:    dey
-               bpl     L0
-       lda     ptr4            ; Load the result
-       ldx     ptr4+1
-       rts                     ; Done
+       adc     tmp1            ; Add to byte 3 including the carry of the low byte add
+       sta     tmp1
+       pla                     ; Restore byte 2 (pla leaves the carry untouched)
+
+@L1:    ror     tmp1            ; Rotate tmp1:A:ptr4+1:ptr4 right, carry comes in at the top
+       ror     a
+       ror     ptr4+1
+       ror     ptr4            ; Next multiplier bit drops into carry
+        dey
+        bne     @L0
+
+       lda     ptr4            ; Load the result
+       ldx     ptr4+1
+       rts                     ; Done
+
+; High byte of rhs is zero (low byte is already saved in ptr4), jump to the 8x16 routine instead
+
+@L3:    jmp     mul8x16
+
+; If the high byte of lhs (sreg+1) is zero, swap the operands and use the 8x16
+; routine. On entry, A and X are zero
+
+@L4:    ldy     sreg            ; Save low byte of left operand (8 bit)
+        ldx     ptr4            ; Copy low byte of the 16 bit right operand to sreg
+        stx     sreg
+        ldx     ptr4+1          ; High byte of the 16 bit operand stays in X, the 8x16 loop reads it from there
+        sty     ptr4            ; Low 8 bit of left op become the 8 bit multiplier
+        ldy     #8              ; Number of bits
+        jmp     mul8x16a
 
diff --git a/libsrc/runtime/mul8.s b/libsrc/runtime/mul8.s
new file mode 100644 (file)
index 0000000..3287e21
--- /dev/null
@@ -0,0 +1,63 @@
+;
+; Ullrich von Bassewitz, 2009-08-17
+;
+; CC65 runtime: multiplication for ints. Short versions.
+;
+
+               .export         tosumula0, tosmula0
+        .export         mul8x16, mul8x16a
+       .import         popsreg
+       .importzp       sreg, ptr4
+
+
+;---------------------------------------------------------------------------
+; 8x16 routine with external entry points used by the 16x16 routine in mul.s
+; 8 bit operand in A (tosmula0/tosumula0) or already in ptr4 (mul8x16); 16 bit product returned in A/X
+tosmula0:
+tosumula0:
+        sta    ptr4            ; Save the 8 bit right operand
+mul8x16:jsr    popsreg         ; Get left operand
+
+       lda     #0              ; Clear byte 1
+               ldy     #8              ; Number of bits
+       ldx     sreg+1          ; Get into register for speed
+        beq     mul8x8          ; Do 8x8 multiplication if high byte zero
+mul8x16a:                       ; Entry from mul.s: A = 0, Y = 8, X = high byte of the 16 bit operand, sreg = its low byte, ptr4 = 8 bit operand
+       sta     ptr4+1          ; Clear byte 2
+
+        lsr     ptr4            ; Get first bit into carry
+@L0:    bcc     @L1             ; Multiplier bit clear: skip the add
+
+       clc
+       adc     sreg            ; Add low byte of left op to byte 1
+       pha                     ; Park byte 1, A is needed for the high byte add
+               txa                     ; hi byte of left op
+       adc     ptr4+1          ; Add to byte 2 including the carry of the low byte add
+       sta     ptr4+1
+       pla                     ; Restore byte 1 (pla leaves the carry untouched)
+
+@L1:    ror            ptr4+1          ; Rotate ptr4+1:A:ptr4 right, carry comes in at the top
+       ror     a
+       ror     ptr4            ; Next multiplier bit drops into carry
+        dey
+        bne     @L0
+        tax                     ; Byte 1 of the product is the high byte of the result
+        lda     ptr4            ; Load the result
+        rts
+
+;---------------------------------------------------------------------------
+; 8x8 multiplication routine
+; Reached from mul8x16 with A = 0 and Y = 8; multiplies ptr4 by sreg, 16 bit product returned in A (low) / X (high)
+mul8x8:
+        lsr     ptr4            ; Get first bit into carry
+@L0:    bcc     @L1             ; Multiplier bit clear: skip the add
+        clc
+        adc     sreg            ; Add the other operand to the high byte of the product
+@L1:    ror                     ; Implied accumulator: rotate A:ptr4 right, carry of the add comes in at bit 7
+        ror     ptr4            ; Next multiplier bit drops into carry
+        dey
+        bne     @L0
+        tax                     ; High byte of the product
+       lda     ptr4            ; Load the result
+       rts                     ; Done
+