From acd1233bf2c7d9129647306c6fefa96fa3ec4d0a Mon Sep 17 00:00:00 2001
From: uz
Date: Mon, 17 Aug 2009 20:48:28 +0000
Subject: [PATCH] Greatly improved multiplication routine. Optimized the
 generic 16x16 one and added special cases for 8x16 and 8x8. The former is
 directly called by the compiler as tosmula0 and tosumula0 resp.

git-svn-id: svn://svn.cc65.org/cc65/trunk@4036 b7a2c559-68d2-44c3-8de9-860c34a00d81
---
 libsrc/runtime/Makefile |  1 +
 libsrc/runtime/mul.s    | 92 ++++++++++++++++++++++++++---------------
 libsrc/runtime/mul8.s   | 63 ++++++++++++++++++++++++++++
 3 files changed, 123 insertions(+), 33 deletions(-)
 create mode 100644 libsrc/runtime/mul8.s

diff --git a/libsrc/runtime/Makefile b/libsrc/runtime/Makefile
index 14158398e..10a5f0e8d 100644
--- a/libsrc/runtime/Makefile
+++ b/libsrc/runtime/Makefile
@@ -152,6 +152,7 @@ OBJS = add.o \
 	makebool.o \
 	mod.o \
 	mul.o \
+	mul8.o \
 	mulax3.o \
 	mulax5.o \
 	mulax6.o \
diff --git a/libsrc/runtime/mul.s b/libsrc/runtime/mul.s
index 67caf52cf..eaf1fb97b 100644
--- a/libsrc/runtime/mul.s
+++ b/libsrc/runtime/mul.s
@@ -1,43 +1,69 @@
 ;
-; Ullrich von Bassewitz, 07.08.1998
+; Ullrich von Bassewitz, 2009-08-17
 ;
 ; CC65 runtime: multiplication for ints
 ;
 
-        .export         tosumula0, tosumulax, tosmula0, tosmulax
-        .import         popsreg
-        .importzp       sreg, tmp1, ptr4
+        .export         tosumulax, tosmulax
+        .import         mul8x16, mul8x16a       ; in mul8.s
+        .import         popsreg
+        .importzp       sreg, tmp1, ptr4
+
+
+;---------------------------------------------------------------------------
+; 16x16 multiplication routine: rhs in A/X, lhs via popsreg, result in A/X
 
-tosmula0:
-tosumula0:
-        ldx     #0
 tosmulax:
 tosumulax:
-mul16:  sta     ptr4
-        stx     ptr4+1          ; Save right operand
-        jsr     popsreg         ; Get left operand
-
-; Do ptr4*sreg --> AX (see mult-div.s from "The Fridge").
-
-        lda     #0
-        sta     tmp1
-        ldx     sreg+1          ; Get into register for speed
-        ldy     #16             ; Number of bits
-L0:     lsr     tmp1
-        ror     a
-        ror     ptr4+1
-        ror     ptr4
-        bcc     L1
-        clc
-        adc     sreg
-        pha
+        sta     ptr4            ; Save low byte of right operand
+        txa                     ; High byte zero?
+        beq     @L3             ; Do 8x16 multiplication if high byte zero
+        stx     ptr4+1          ; Save high byte of right operand
+        jsr     popsreg         ; Get left operand
+
+; Do ptr4:ptr4+1 * sreg:sreg+1 --> AX
+
+        lda     #0              ; A and tmp1 collect the upper product bytes
+        ldx     sreg+1          ; Get high byte into register for speed
+        beq     @L4             ; -> we can do 8x16 after swap
+        sta     tmp1
+        ldy     #16             ; Number of bits
+
+        lsr     ptr4+1
+        ror     ptr4            ; Get first bit into carry
+@L0:    bcc     @L1             ; Bit clear: shift only, nothing to add
+
+        clc
+        adc     sreg            ; Add lo byte of left op
+        pha                     ; Keep lo byte of sum, A is needed for hi
         txa                     ; hi byte of left op
-        adc     tmp1
-        sta     tmp1
-        pla
-L1:     dey
-        bpl     L0
-        lda     ptr4            ; Load the result
-        ldx     ptr4+1
-        rts                     ; Done
+        adc     tmp1
+        sta     tmp1
+        pla                     ; Get lo byte of sum back, carry is kept
+
+@L1:    ror     tmp1            ; Shift tmp1:A:ptr4+1:ptr4 right, carry in
+        ror     a
+        ror     ptr4+1
+        ror     ptr4            ; Next bit of right operand into carry
+        dey
+        bne     @L0
+
+        lda     ptr4            ; Load the result
+        ldx     ptr4+1
+        rts                     ; Done
+
+; High byte of rhs is zero, jump to the 8x16 routine instead
+
+@L3:    jmp     mul8x16
+
+; If the high byte of lhs is zero, swap the operands and use the 8x16
+; routine. On entry, A and X are zero
+
+@L4:    ldy     sreg            ; Save left operand (8 bit)
+        ldx     ptr4            ; Copy right 16 bit operand to left
+        stx     sreg
+        ldx     ptr4+1          ; Don't store, mul8x16a wants it in X
+        sty     ptr4            ; Copy low 8 bit of left op to right
+        ldy     #8              ; Number of bits
+        jmp     mul8x16a
 
diff --git a/libsrc/runtime/mul8.s b/libsrc/runtime/mul8.s
new file mode 100644
index 000000000..3287e2155
--- /dev/null
+++ b/libsrc/runtime/mul8.s
@@ -0,0 +1,63 @@
+;
+; Ullrich von Bassewitz, 2009-08-17
+;
+; CC65 runtime: multiplication for ints. Short versions.
+;
+
+        .export         tosumula0, tosmula0
+        .export         mul8x16, mul8x16a
+        .import         popsreg
+        .importzp       sreg, ptr4
+
+
+;---------------------------------------------------------------------------
+; 8x16 routine with external entry points used by the 16x16 routine in mul.s
+
+tosmula0:
+tosumula0:
+        sta     ptr4            ; Save right operand (8 bit)
+mul8x16:jsr     popsreg         ; Get left operand
+
+        lda     #0              ; Clear byte 1
+        ldy     #8              ; Number of bits
+        ldx     sreg+1          ; Get hi byte of left op into register for speed
+        beq     mul8x8          ; Do 8x8 multiplication if high byte zero
+mul8x16a:                       ; Entry: A = 0, Y = 8, X = hi byte of left op
+        sta     ptr4+1          ; Clear byte 2
+
+        lsr     ptr4            ; Get first bit into carry
+@L0:    bcc     @L1             ; Bit clear: shift only, nothing to add
+
+        clc
+        adc     sreg            ; Add lo byte of left op
+        pha                     ; Keep lo byte of sum, A is needed for hi
+        txa                     ; hi byte of left op
+        adc     ptr4+1
+        sta     ptr4+1
+        pla                     ; Get lo byte of sum back, carry is kept
+
+@L1:    ror     ptr4+1          ; Shift ptr4+1:A:ptr4 right, carry in
+        ror     a
+        ror     ptr4            ; Next bit of right operand into carry
+        dey
+        bne     @L0
+        tax                     ; Byte 1 is the high byte of the result
+        lda     ptr4            ; Load the result
+        rts
+
+;---------------------------------------------------------------------------
+; 8x8 multiplication routine (entered from mul8x16 with A = 0 and Y = 8)
+
+mul8x8:
+        lsr     ptr4            ; Get first bit into carry
+@L0:    bcc     @L1
+        clc
+        adc     sreg
+@L1:    ror     a               ; Explicit accumulator, like the other loops
+        ror     ptr4
+        dey
+        bne     @L0
+        tax                     ; High byte of the result
+        lda     ptr4            ; Load the result
+        rts                     ; Done
+
-- 
2.39.5