From 0e91f15ea85fc6cf5a0c91d8691d92db0472f79c Mon Sep 17 00:00:00 2001
From: uz
Date: Sun, 20 Sep 2009 14:32:25 +0000
Subject: [PATCH] Improved memset/memcpy/memmove functions by Christian Krueger.

git-svn-id: svn://svn.cc65.org/cc65/trunk@4200 b7a2c559-68d2-44c3-8de9-860c34a00d81
---
 libsrc/common/memcpy.s  | 90 +++++++++++++++++++++++------------------
 libsrc/common/memmove.s | 69 +++++++++++++++++++------------
 libsrc/common/memset.s  | 76 ++++++++++++++++++++++------------
 3 files changed, 143 insertions(+), 92 deletions(-)

diff --git a/libsrc/common/memcpy.s b/libsrc/common/memcpy.s
index ff2c4bdde..d432ffa0e 100644
--- a/libsrc/common/memcpy.s
+++ b/libsrc/common/memcpy.s
@@ -1,5 +1,7 @@
 ;
 ; Ullrich von Bassewitz, 2003-08-20
+; Performance increase (about 20%) by
+; Christian Krueger, 2009-09-13
 ;
 ; void* __fastcall__ memcpy (void* dest, const void* src, size_t n);
 ;
@@ -10,61 +12,69 @@

        .export         _memcpy, memcpy_upwards, memcpy_getparams
        .import         popax
-       .importzp       ptr1, ptr2, ptr3, tmp1
+       .importzp       sp, ptr1, ptr2, ptr3

 ; ----------------------------------------------------------------------
 _memcpy:
        jsr     memcpy_getparams

-memcpy_upwards:
-       ldy     #0
-       ldx     ptr3            ; Get low counter byte
+memcpy_upwards:                 ; assert Y = 0
+       ldx     ptr3+1          ; Get high byte of n
+       beq     L2              ; Jump if zero

-; Copy loop
+L1:    .repeat 2               ; Unroll this a bit to make it faster...
+       lda     (ptr1),Y        ; copy a byte
+       sta     (ptr2),Y
+       iny
+       .endrepeat
+       bne     L1
+       inc     ptr1+1
+       inc     ptr2+1
+       dex                     ; Next 256 byte block
+       bne     L1              ; Repeat if any

-@L1:   inx                     ; Bump low counter byte
-       beq     @L3             ; Jump on overflow
-@L2:   lda     (ptr1),y
-       sta     (ptr2),y
-       iny
-       bne     @L1
-       inc     ptr1+1          ; Bump pointers
-       inc     ptr2+1
-       bne     @L1             ; Branch always
-@L3:   inc     ptr3+1          ; Bump high counter byte
-       bne     @L2
+       ; The following section could be 10% faster if we were able to copy
+       ; back to front - unfortunately we are forced to copy strictly from
+       ; low to high, since this function is also used for memmove and the
+       ; blocks could be overlapping!
+       ; {
+L2:                             ; assert Y = 0
+       ldx     ptr3            ; Get the low byte of n
+       beq     done            ; nothing to copy -> done

-; Done. The low byte of dest is still in ptr2
+L3:    lda     (ptr1),Y        ; copy a byte
+       sta     (ptr2),Y
+       iny
+       dex
+       bne     L3

-done:  lda     ptr2
-       ldx     tmp1            ; get function result (dest)
-       rts
+       ; }
+
+done:  jmp     popax           ; Pop ptr and return as result

 ; ----------------------------------------------------------------------
 ; Get the parameters from stack as follows:
 ;
-;      -(size-1)       --> ptr3
+;      size            --> ptr3
 ;      src             --> ptr1
 ;      dest            --> ptr2
-;      high(dest)      --> tmp1
-;
-; dest is returned in a/x.
+;
+; First argument (dest) will remain on stack and is returned in a/x!

-memcpy_getparams:
-       eor     #$FF
-       sta     ptr3
-       txa
-       eor     #$FF
-       sta     ptr3+1          ; Save -(size-1)
-
-       jsr     popax           ; src
-       sta     ptr1
-       stx     ptr1+1
-
-       jsr     popax           ; dest
-       sta     ptr2
-       stx     ptr2+1          ; Save work copy
-       stx     tmp1            ; Save for function result
+memcpy_getparams:               ; IMPORTANT! Function has to leave with Y=0!
+       sta     ptr3
+       stx     ptr3+1          ; save n to ptr3

-       rts
+       jsr     popax
+       sta     ptr1
+       stx     ptr1+1          ; save src to ptr1

+                               ; save dest to ptr2
+       ldy     #1              ; (direct stack access is three cycles faster
+                               ; (total cycle count with return))
+       lda     (sp),y
+       tax
+       stx     ptr2+1          ; save high byte of ptr2
+       dey                     ; Y = 0
+       lda     (sp),y          ; Get ptr2 low
+       sta     ptr2
+       rts
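
Note on the memcpy change: the rewritten memcpy_upwards copies all full
256-byte pages first, with the inner loop unrolled twice, and only then the
remaining n % 256 bytes, always walking from low to high addresses so that
memmove can branch into it for upward moves. The C sketch below models that
structure; it is an illustration only (the name copy_upwards is invented here
and is not code from the patch):

    #include <stddef.h>

    static void *copy_upwards(unsigned char *dest, const unsigned char *src,
                              size_t n)
    {
        unsigned char *d = dest;
        const unsigned char *s = src;
        size_t pages = n >> 8;              /* high byte of n: full pages */
        size_t rest  = n & 0xFF;            /* low byte of n: leftover bytes */
        unsigned i;

        while (pages--) {                   /* page loop, label L1 above */
            for (i = 0; i < 256; i += 2) {  /* unrolled twice, like .repeat 2 */
                d[i]     = s[i];
                d[i + 1] = s[i + 1];
            }
            d += 256;                       /* inc ptr1+1 / inc ptr2+1 */
            s += 256;
        }
        for (i = 0; i < rest; i++) {        /* remainder, label L3 above */
            d[i] = s[i];
        }
        return dest;
    }

The speedup comes from dropping the old per-byte 16-bit counter bookkeeping
(inx/beq on every byte) in favor of a plain Y-indexed walk across each page.
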
diff --git a/libsrc/common/memmove.s b/libsrc/common/memmove.s
index f344f9df8..983b97277 100644
--- a/libsrc/common/memmove.s
+++ b/libsrc/common/memmove.s
@@ -1,5 +1,7 @@
 ;
 ; Ullrich von Bassewitz, 2003-08-20
+; Performance increase (about 20%) by
+; Christian Krueger, 2009-09-13
 ;
 ; void* __fastcall__ memmove (void* dest, const void* src, size_t size);
 ;
@@ -7,7 +9,7 @@
 ;

        .export         _memmove
-       .import         memcpy_getparams, memcpy_upwards
+       .import         memcpy_getparams, memcpy_upwards, popax
        .importzp       ptr1, ptr2, ptr3, ptr4, tmp1

        .macpack        generic
@@ -15,9 +17,6 @@

 ; ----------------------------------------------------------------------
 _memmove:
-       sta     ptr4
-       stx     ptr4+1          ; Size -> ptr4
-
        jsr     memcpy_getparams

 ; Check for the copy direction. If dest < src, we must copy upwards (start at
@@ -33,35 +32,53 @@

 ; Copy downwards. Adjust the pointers to the end of the memory regions.

        lda     ptr1+1
-       add     ptr4+1
+       add     ptr3+1
        sta     ptr1+1

        lda     ptr2+1
-       add     ptr4+1
+       add     ptr3+1
        sta     ptr2+1

-; Load the low offset into Y, and the counter low byte into X.
-
-       ldy     ptr4
-       ldx     ptr3
-       jmp     @L2
-
-; Copy loop
-
-@L1:   dey
+; handle the fraction of a page first
+
+       ldy     ptr3            ; count, low byte
+       bne     @entry          ; something to copy?
+       beq     PageSizeCopy    ; branch always (like bra)
+
+@copyByte:
+       lda     (ptr1),y
+       sta     (ptr2),y
+@entry:
+       dey
+       bne     @copyByte
+       lda     (ptr1),y        ; copy remaining byte
+       sta     (ptr2),y
+
+PageSizeCopy:                   ; assert Y = 0
+       ldx     ptr3+1          ; number of pages
+       beq     done            ; none? -> done
+
+@initBase:
+       dec     ptr1+1          ; adjust base...
+       dec     ptr2+1
+       dey                     ; in entry case: 0 -> FF
+       lda     (ptr1),y        ; need to copy this 'intro byte' so that we
+       sta     (ptr2),y        ; 'land' on Y = 0 later (a result of the '.repeat' block!)
+       dey                     ; FF -> FE
+@copyBytes:
+       .repeat 2               ; Unroll this a bit to make it faster...
        lda     (ptr1),y
        sta     (ptr2),y
-
-@L2:   inx                     ; Bump counter low byte
-       bne     @L1
-       dec     ptr1+1
-       dec     ptr2+1
-       inc     ptr3+1          ; Bump counter high byte
-       bne     @L1
+       dey
+       .endrepeat
+@copyEntry:                     ; in entry case: 0 -> FF
+       bne     @copyBytes
+       lda     (ptr1),y        ; Y = 0, copy last byte
+       sta     (ptr2),y
+       dex                     ; one page less to copy
+       bne     @initBase       ; still a page to copy?

 ; Done, return dest

-done:  lda     ptr2
-       ldx     tmp1            ; get function result (dest)
-       rts
-
+done:  jmp     popax           ; Pop ptr and return as result
+
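
Note on the memmove change: for a downward (dest > src, possibly overlapping)
move the code above first adds the page count, the high byte of n, to both
pointer high bytes, copies the n % 256 fragment back-to-front, and then copies
whole pages back-to-front while stepping both bases down one page at a time.
A C model of that order (again only a sketch; move_downwards is an
illustrative name, not part of the library):

    #include <stddef.h>

    static void *move_downwards(unsigned char *dest, const unsigned char *src,
                                size_t n)
    {
        size_t pages = n >> 8;                      /* full 256-byte pages */
        size_t frag  = n & 0xFF;                    /* partial page, done first */
        unsigned char *d = dest + pages * 256;      /* 'add ptr3+1' to the */
        const unsigned char *s = src + pages * 256; /* pointer high bytes */

        while (frag--) {                            /* fragment, back-to-front */
            d[frag] = s[frag];
        }
        while (pages--) {                           /* then each full page, */
            size_t i = 256;                         /* highest page first */
            d -= 256;                               /* dec ptr1+1 / dec ptr2+1 */
            s -= 256;
            while (i--) {
                d[i] = s[i];
            }
        }
        return dest;
    }

Copying the highest addresses first is what keeps the overlapping case safe;
the 'intro byte' step in @initBase exists only so that the twice-unrolled loop
always ends exactly on Y = 0.
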
diff --git a/libsrc/common/memset.s b/libsrc/common/memset.s
index 62c83fb5e..fcdbd98bc 100644
--- a/libsrc/common/memset.s
+++ b/libsrc/common/memset.s
@@ -1,9 +1,11 @@
 ;
-; void* memset (void* ptr, int c, size_t n);
-; void* _bzero (void* ptr, size_t n);
-; void bzero (void* ptr, size_t n);
+; void* __fastcall__ memset (void* ptr, int c, size_t n);
+; void* __fastcall__ _bzero (void* ptr, size_t n);
+; void __fastcall__ bzero (void* ptr, size_t n);
 ;
 ; Ullrich von Bassewitz, 29.05.1998
+; Performance increase (about 20%) by
+; Christian Krueger, 12.09.2009
 ;
 ; NOTE: bzero will return it's first argument as memset does. It is no problem
 ; to declare the return value as void, since it may be ignored. _bzero and
@@ -15,57 +17,79 @@

        .export         _memset, _bzero, __bzero
        .import         popax
-       .importzp       sp, ptr1, ptr2, ptr3, tmp1
+       .importzp       sp, ptr1, ptr2, ptr3

 _bzero:
 __bzero:
        sta     ptr3
        stx     ptr3+1          ; Save n
-       lda     #0              ; Fill with zeros
+       ldx     #0              ; Fill with zeros
        beq     common
-
+
 _memset:
        sta     ptr3            ; Save n
        stx     ptr3+1
        jsr     popax           ; Get c
+       tax

 ; Common stuff for memset and bzero from here

-common: sta    tmp1            ; Save the fill value
-       ldy     #1
+common:                         ; Fill value is in X!
+       ldy     #1
        lda     (sp),y
-       tax
-       dey
+       sta     ptr1+1          ; save high byte of ptr
+       dey                     ; Y = 0
        lda     (sp),y          ; Get ptr
        sta     ptr1
-       stx     ptr1+1          ; Save work copy
-       lda     tmp1            ; Load fill value
-       ldy     #0
+       lsr     ptr3+1          ; divide number of
+       ror     ptr3            ; bytes by two to increase
+       bcc     evenCount       ; speed (ptr3 = ptr3/2)
+oddCount:
+       ; y is still 0 here
+       txa                     ; restore fill value
+       sta     (ptr1),y        ; save value and increase
+       inc     ptr1            ; dest. pointer
+       bne     evenCount
+       inc     ptr1+1
+evenCount:
+       lda     ptr1            ; build second pointer section
+       clc
+       adc     ptr3            ; ptr2 = ptr1 + (length/2) <- ptr3
+       sta     ptr2
+       lda     ptr1+1
+       adc     ptr3+1
+       sta     ptr2+1
+
+       txa                     ; restore fill value
        ldx     ptr3+1          ; Get high byte of n
        beq     L2              ; Jump if zero

-; Set 256 byte blocks
-
+; Set 256/512 byte blocks
+       ; y is still 0 here
 L1:    .repeat 2               ; Unroll this a bit to make it faster
-       sta     (ptr1),y        ; Set one byte
-       iny
+       sta     (ptr1),y        ; Set byte in lower section
+       sta     (ptr2),y        ; Set byte in upper section
+       iny
        .endrepeat
        bne     L1
        inc     ptr1+1
+       inc     ptr2+1
        dex                     ; Next 256 byte block
        bne     L1              ; Repeat if any

 ; Set the remaining bytes if any

-L2:    ldx     ptr3            ; Get the low byte of n
-       beq     L9              ; Low byte is zero
-
-L3:    sta     (ptr1),y        ; Set one byte
-       iny
-       dex                     ; Done?
-       bne     L3
-
-L9:    jmp     popax           ; Pop ptr and return as result
+L2:    ldy     ptr3            ; Get the low byte of n
+       bne     L3              ; something to set?
+       jmp     popax           ; no -> Pop ptr and return as result
+
+L3a:   sta     (ptr1),y        ; set bytes in low
+       sta     (ptr2),y        ; and high section
+L3:    dey
+       bne     L3a
+       sta     (ptr1),y        ; Set remaining byte(s)
+       sta     (ptr2),y
+       jmp     popax           ; Pop ptr and return as result
+
-- 
2.39.5
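
Note on the memset change: the gain comes from a second write pointer. The
count is halved (lsr ptr3+1 / ror ptr3); if the carry shows the count was odd,
one byte is stored up front; then ptr2 is set to ptr1 + n/2, so every pass of
the fill loop stores one byte into the lower and one into the upper half of
the region. A C model of the idea (sketch only; fill_split is an invented
name):

    #include <stddef.h>

    static void *fill_split(unsigned char *ptr, int c, size_t n)
    {
        unsigned char *lo = ptr;
        unsigned char *hi;
        size_t half = n >> 1;           /* lsr ptr3+1 / ror ptr3 */
        size_t i;

        if (n & 1) {                    /* carry set: odd count */
            *lo++ = (unsigned char)c;   /* store the odd byte first */
        }
        hi = lo + half;                 /* ptr2 = ptr1 + n/2 */
        for (i = 0; i < half; i++) {
            lo[i] = (unsigned char)c;   /* lower section */
            hi[i] = (unsigned char)c;   /* upper section */
        }
        return ptr;
    }

Halving the count means the loop overhead (iny/bne and the page stepping) is
paid once for two stores instead of one, which, together with the unrolled
page loop, is presumably where the quoted ~20% comes from.
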