From cc6559c3f6a81e3b4ad676c2242505466e6e9ec1 Mon Sep 17 00:00:00 2001
From: IrgendwerA8 <c.krueger.b@web.de>
Date: Thu, 28 Feb 2019 17:30:34 +0100
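Subject: [PATCH] Minor math optimizations

Use X instead of the stack as temporary storage for the accumulator
(tax/txa in place of pha/pla) in ludiv.s, udiv32by16r16.s and
umul16x16r32.s. In umul8x16r24.s, clear ptr1+1 and sreg with stz when
assembling for a CPU that has the 65SC02 instruction set.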
---
 libsrc/runtime/ludiv.s         | 8 ++++----
 libsrc/runtime/udiv32by16r16.s | 8 ++++----
 libsrc/runtime/umul16x16r32.s  | 4 ++--
 libsrc/runtime/umul8x16r24.s   | 6 ++++++
 4 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/libsrc/runtime/ludiv.s b/libsrc/runtime/ludiv.s
index 54af4780e..8a3126d72 100644
--- a/libsrc/runtime/ludiv.s
+++ b/libsrc/runtime/ludiv.s
@@ -78,7 +78,7 @@ L0:     asl     ptr1
 ; Do a subtraction. we do not have enough space to store the intermediate
 ; result, so we may have to do the subtraction twice.
 
-        pha
+        tax                     ; Keep byte 0 in X, A is reused by the compare
         cmp     ptr3
         lda     ptr2+1
         sbc     ptr3+1
@@ -91,9 +91,9 @@ L0:     asl     ptr1
 ; Overflow, do the subtraction again, this time store the result
 
         sta     tmp4            ; We have the high byte already
-        pla
+        txa                     ; Byte 0 back in A, txa leaves carry untouched
         sbc     ptr3            ; byte 0
-        pha
+        tax                     ; Park the new byte 0 in X again
         lda     ptr2+1
         sbc     ptr3+1
         sta     ptr2+1          ; byte 1
@@ -102,7 +102,7 @@ L0:     asl     ptr1
         sta     tmp3            ; byte 2
         inc     ptr1            ; Set result bit
 
-L1:     pla
+L1:     txa                     ; A = byte 0 again (next round / final store)
         dey
         bne     L0
         sta     ptr2
diff --git a/libsrc/runtime/udiv32by16r16.s b/libsrc/runtime/udiv32by16r16.s
index 9897f9908..27f1176dd 100644
--- a/libsrc/runtime/udiv32by16r16.s
+++ b/libsrc/runtime/udiv32by16r16.s
@@ -34,19 +34,19 @@ L0:     asl     ptr1
         rol     a
         rol     sreg+1
 
-        pha
+        tax                     ; Save A in X, A is reused by the compare
         cmp     ptr3
         lda     sreg+1
         sbc     ptr3+1
         bcc     L1
 
         sta     sreg+1
-        pla
+        txa                     ; Saved byte back in A, carry is still set
         sbc     ptr3
-        pha
+        tax                     ; Keep the subtracted value in X
         inc     ptr1
 
-L1:     pla
+L1:     txa                     ; A = saved byte on both paths
         dey
         bne     L0
         sta     sreg
diff --git a/libsrc/runtime/umul16x16r32.s b/libsrc/runtime/umul16x16r32.s
index 9ecd1596e..cd2dae351 100644
--- a/libsrc/runtime/umul16x16r32.s
+++ b/libsrc/runtime/umul16x16r32.s
@@ -42,11 +42,11 @@ umul16x16r16m:
 
         clc
         adc     ptr3
-        pha
+        tax                     ; Hold the sum in X, carry stays for the adc
         lda     ptr3+1
         adc     sreg+1
         sta     sreg+1
-        pla
+        txa                     ; Sum back in A, carry intact for the ror
 
 @L1:    ror     sreg+1
         ror     a
diff --git a/libsrc/runtime/umul8x16r24.s b/libsrc/runtime/umul8x16r24.s
index ff7d0bae6..c006082a4 100644
--- a/libsrc/runtime/umul8x16r24.s
+++ b/libsrc/runtime/umul8x16r24.s
@@ -9,6 +9,7 @@
 
         .include        "zeropage.inc"
 
+        .macpack        cpu     ; CPU_ISET_xxx constants for testing .cpu
 
 ;---------------------------------------------------------------------------
 ; 8x16 => 24 unsigned multiplication routine. Because the overhead for a
@@ -30,9 +31,14 @@ umul8x16r16:
 
 umul8x16r24m:
 umul8x16r16m:
+.if (.cpu .bitand ::CPU_ISET_65SC02)    ; Target has the 65SC02 instruction set
+        stz     ptr1+1          ; Store zero directly, no register needed
+        stz     sreg
+.else                           ; No stz available: clear via X
         ldx     #0
         stx     ptr1+1
         stx     sreg
+.endif
 
         ldy     #8              ; Number of bits
         ldx     ptr3            ; Get into register for speed
-- 
2.39.2