From 0e91f15ea85fc6cf5a0c91d8691d92db0472f79c Mon Sep 17 00:00:00 2001
From: uz
Date: Sun, 20 Sep 2009 14:32:25 +0000
Subject: [PATCH] Improved memset/memcpy/memmove functions by Christian Krueger.

git-svn-id: svn://svn.cc65.org/cc65/trunk@4200 b7a2c559-68d2-44c3-8de9-860c34a00d81
---
 libsrc/common/memcpy.s  | 90 +++++++++++++++++++++++------------------
 libsrc/common/memmove.s | 69 +++++++++++++++++++------------
 libsrc/common/memset.s  | 76 ++++++++++++++++++++++------------
 3 files changed, 143 insertions(+), 92 deletions(-)

diff --git a/libsrc/common/memcpy.s b/libsrc/common/memcpy.s
index ff2c4bdde..d432ffa0e 100644
--- a/libsrc/common/memcpy.s
+++ b/libsrc/common/memcpy.s
@@ -1,5 +1,7 @@
 ;
 ; Ullrich von Bassewitz, 2003-08-20
+; Performance increase (about 20%) by
+; Christian Krueger, 2009-09-13
 ;
 ; void* __fastcall__ memcpy (void* dest, const void* src, size_t n);
 ;
@@ -10,61 +12,69 @@

        .export         _memcpy, memcpy_upwards, memcpy_getparams
        .import         popax
-       .importzp       ptr1, ptr2, ptr3, tmp1
+       .importzp       sp, ptr1, ptr2, ptr3

 ; ----------------------------------------------------------------------
 _memcpy:
        jsr     memcpy_getparams

-memcpy_upwards:
-       ldy     #0
-       ldx     ptr3            ; Get low counter byte
+memcpy_upwards:                 ; assert Y = 0
+       ldx     ptr3+1          ; Get high byte of n
+       beq     L2              ; Jump if zero

-; Copy loop
+L1:    .repeat 2               ; Unroll this a bit to make it faster...
+       lda     (ptr1),Y        ; copy a byte
+       sta     (ptr2),Y
+       iny
+       .endrepeat
+       bne     L1
+       inc     ptr1+1
+       inc     ptr2+1
+       dex                     ; Next 256 byte block
+       bne     L1              ; Repeat if any

-@L1:   inx                     ; Bump low counter byte
-       beq     @L3             ; Jump on overflow
-@L2:   lda     (ptr1),y
-       sta     (ptr2),y
-       iny
-       bne     @L1
-       inc     ptr1+1          ; Bump pointers
-       inc     ptr2+1
-       bne     @L1             ; Branch always
-@L3:   inc     ptr3+1          ; Bump high counter byte
-       bne     @L2
+       ; The following section could be 10% faster if we were able to copy
+       ; back to front - unfortunately we are forced to copy strictly from
+       ; low to high, since this function is also used for memmove and the
+       ; blocks could be overlapping!
+       ; {
+L2:                             ; assert Y = 0
+       ldx     ptr3            ; Get the low byte of n
+       beq     done            ; nothing to copy -> done

-; Done. The low byte of dest is still in ptr2
+L3:    lda     (ptr1),Y        ; copy a byte
+       sta     (ptr2),Y
+       iny
+       dex
+       bne     L3

-done:  lda     ptr2
-       ldx     tmp1            ; get function result (dest)
-       rts
+       ; }
+
+done:  jmp     popax           ; Pop ptr and return as result

 ; ----------------------------------------------------------------------
 ; Get the parameters from stack as follows:
 ;
-;      -(size-1)       --> ptr3
+;      size            --> ptr3
 ;      src             --> ptr1
 ;      dest            --> ptr2
-;      high(dest)      --> tmp1
-;
-; dest is returned in a/x.
+;
+; First argument (dest) will remain on stack and is returned in a/x!

-memcpy_getparams:
-       eor     #$FF
-       sta     ptr3
-       txa
-       eor     #$FF
-       sta     ptr3+1          ; Save -(size-1)
-
-       jsr     popax           ; src
-       sta     ptr1
-       stx     ptr1+1
-
-       jsr     popax           ; dest
-       sta     ptr2
-       stx     ptr2+1          ; Save work copy
-       stx     tmp1            ; Save for function result
+memcpy_getparams:               ; IMPORTANT! Function has to leave with Y=0!
+       sta     ptr3
+       stx     ptr3+1          ; save n to ptr3

-       rts
+       jsr     popax
+       sta     ptr1
+       stx     ptr1+1          ; save src to ptr1

+                               ; save dest to ptr2
+       ldy     #1              ; (direct stack access is three cycles faster
+                               ; (total cycle count with return))
+       lda     (sp),y
+       tax
+       stx     ptr2+1          ; save high byte of ptr2
+       dey                     ; Y = 0
+       lda     (sp),y          ; Get ptr2 low
+       sta     ptr2
+       rts
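
Note on the memcpy change: the rewritten memcpy_upwards copies all full
256-byte pages first, with the inner loop unrolled twice, and only then the
remaining n % 256 bytes, always walking from low to high addresses so that
memmove can branch into it for upward moves. The C sketch below models that
structure; it is an illustration only (the name copy_upwards is invented here
and is not code from the patch):

    #include <stddef.h>

    static void *copy_upwards(unsigned char *dest, const unsigned char *src,
                              size_t n)
    {
        unsigned char *d = dest;
        const unsigned char *s = src;
        size_t pages = n >> 8;              /* high byte of n: full pages */
        size_t rest  = n & 0xFF;            /* low byte of n: leftover bytes */
        unsigned i;

        while (pages--) {                   /* page loop, label L1 above */
            for (i = 0; i < 256; i += 2) {  /* unrolled twice, like .repeat 2 */
                d[i]     = s[i];
                d[i + 1] = s[i + 1];
            }
            d += 256;                       /* inc ptr1+1 / inc ptr2+1 */
            s += 256;
        }
        for (i = 0; i < rest; i++) {        /* remainder, label L3 above */
            d[i] = s[i];
        }
        return dest;
    }

The speedup comes from dropping the old per-byte 16-bit counter bookkeeping
(inx/beq on every byte) in favor of a plain Y-indexed walk across each page.
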
diff --git a/libsrc/common/memmove.s b/libsrc/common/memmove.s
index f344f9df8..983b97277 100644
--- a/libsrc/common/memmove.s
+++ b/libsrc/common/memmove.s
@@ -1,5 +1,7 @@
 ;
 ; Ullrich von Bassewitz, 2003-08-20
+; Performance increase (about 20%) by
+; Christian Krueger, 2009-09-13
 ;
 ; void* __fastcall__ memmove (void* dest, const void* src, size_t size);
 ;
@@ -7,7 +9,7 @@
 ;

        .export         _memmove
-       .import         memcpy_getparams, memcpy_upwards
+       .import         memcpy_getparams, memcpy_upwards, popax
        .importzp       ptr1, ptr2, ptr3, ptr4, tmp1

        .macpack        generic
@@ -15,9 +17,6 @@

 ; ----------------------------------------------------------------------
 _memmove:
-       sta     ptr4
-       stx     ptr4+1          ; Size -> ptr4
-
        jsr     memcpy_getparams

 ; Check for the copy direction. If dest < src, we must copy upwards (start at
@@ -33,35 +32,53 @@

 ; Copy downwards. Adjust the pointers to the end of the memory regions.

        lda     ptr1+1
-       add     ptr4+1
+       add     ptr3+1
        sta     ptr1+1

        lda     ptr2+1
-       add     ptr4+1
+       add     ptr3+1
        sta     ptr2+1

-; Load the low offset into Y, and the counter low byte into X.
-
-       ldy     ptr4
-       ldx     ptr3
-       jmp     @L2
-
-; Copy loop
-
-@L1:   dey
+; handle the fraction of a page first
+
+       ldy     ptr3            ; count, low byte
+       bne     @entry          ; something to copy?
+       beq     PageSizeCopy    ; branch always (like bra)
+
+@copyByte:
+       lda     (ptr1),y
+       sta     (ptr2),y
+@entry:
+       dey
+       bne     @copyByte
+       lda     (ptr1),y        ; copy remaining byte
+       sta     (ptr2),y
+
+PageSizeCopy:                   ; assert Y = 0
+       ldx     ptr3+1          ; number of pages
+       beq     done            ; none? -> done
+
+@initBase:
+       dec     ptr1+1          ; adjust base...
+       dec     ptr2+1
+       dey                     ; in entry case: 0 -> FF
+       lda     (ptr1),y        ; need to copy this 'intro byte' so that we
+       sta     (ptr2),y        ; 'land' on Y = 0 later (a result of the '.repeat' block!)
+       dey                     ; FF -> FE
+@copyBytes:
+       .repeat 2               ; Unroll this a bit to make it faster...
        lda     (ptr1),y
        sta     (ptr2),y
-
-@L2:   inx                     ; Bump counter low byte
-       bne     @L1
-       dec     ptr1+1
-       dec     ptr2+1
-       inc     ptr3+1          ; Bump counter high byte
-       bne     @L1
+       dey
+       .endrepeat
+@copyEntry:                     ; in entry case: 0 -> FF
+       bne     @copyBytes
+       lda     (ptr1),y        ; Y = 0, copy last byte
+       sta     (ptr2),y
+       dex                     ; one page less to copy
+       bne     @initBase       ; still a page to copy?

 ; Done, return dest

-done:  lda     ptr2
-       ldx     tmp1            ; get function result (dest)
-       rts
-
+done:  jmp     popax           ; Pop ptr and return as result
+
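
Note on the memmove change: for a downward (dest > src, possibly overlapping)
move the code above first adds the page count, the high byte of n, to both
pointer high bytes, copies the n % 256 fragment back-to-front, and then copies
whole pages back-to-front while stepping both bases down one page at a time.
A C model of that order (again only a sketch; move_downwards is an
illustrative name, not part of the library):

    #include <stddef.h>

    static void *move_downwards(unsigned char *dest, const unsigned char *src,
                                size_t n)
    {
        size_t pages = n >> 8;                      /* full 256-byte pages */
        size_t frag  = n & 0xFF;                    /* partial page, done first */
        unsigned char *d = dest + pages * 256;      /* 'add ptr3+1' to the */
        const unsigned char *s = src + pages * 256; /* pointer high bytes */

        while (frag--) {                            /* fragment, back-to-front */
            d[frag] = s[frag];
        }
        while (pages--) {                           /* then each full page, */
            size_t i = 256;                         /* highest page first */
            d -= 256;                               /* dec ptr1+1 / dec ptr2+1 */
            s -= 256;
            while (i--) {
                d[i] = s[i];
            }
        }
        return dest;
    }

Copying the highest addresses first is what keeps the overlapping case safe;
the 'intro byte' step in @initBase exists only so that the twice-unrolled loop
always ends exactly on Y = 0.
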
diff --git a/libsrc/common/memset.s b/libsrc/common/memset.s
index 62c83fb5e..fcdbd98bc 100644
--- a/libsrc/common/memset.s
+++ b/libsrc/common/memset.s
@@ -1,9 +1,11 @@
 ;
-; void* memset (void* ptr, int c, size_t n);
-; void* _bzero (void* ptr, size_t n);
-; void bzero (void* ptr, size_t n);
+; void* __fastcall__ memset (void* ptr, int c, size_t n);
+; void* __fastcall__ _bzero (void* ptr, size_t n);
+; void __fastcall__ bzero (void* ptr, size_t n);
 ;
 ; Ullrich von Bassewitz, 29.05.1998
+; Performance increase (about 20%) by
+; Christian Krueger, 12.09.2009
 ;
 ; NOTE: bzero will return it's first argument as memset does. It is no problem
 ; to declare the return value as void, since it may be ignored. _bzero and
@@ -15,57 +17,79 @@

        .export         _memset, _bzero, __bzero
        .import         popax
-       .importzp       sp, ptr1, ptr2, ptr3, tmp1
+       .importzp       sp, ptr1, ptr2, ptr3

 _bzero:
 __bzero:
        sta     ptr3
        stx     ptr3+1          ; Save n
-       lda     #0              ; Fill with zeros
+       ldx     #0              ; Fill with zeros
        beq     common
-
+
 _memset:
        sta     ptr3            ; Save n
        stx     ptr3+1
        jsr     popax           ; Get c
+       tax

 ; Common stuff for memset and bzero from here

-common: sta    tmp1            ; Save the fill value
-       ldy     #1
+common:                         ; Fill value is in X!
+       ldy     #1
        lda     (sp),y
-       tax
-       dey
+       sta     ptr1+1          ; save high byte of ptr
+       dey                     ; Y = 0
        lda     (sp),y          ; Get ptr
        sta     ptr1
-       stx     ptr1+1          ; Save work copy
-       lda     tmp1            ; Load fill value
-       ldy     #0
+       lsr     ptr3+1          ; divide number of
+       ror     ptr3            ; bytes by two to increase
+       bcc     evenCount       ; speed (ptr3 = ptr3/2)
+oddCount:
+       ; y is still 0 here
+       txa                     ; restore fill value
+       sta     (ptr1),y        ; save value and increase
+       inc     ptr1            ; dest. pointer
+       bne     evenCount
+       inc     ptr1+1
+evenCount:
+       lda     ptr1            ; build second pointer section
+       clc
+       adc     ptr3            ; ptr2 = ptr1 + (length/2) <- ptr3
+       sta     ptr2
+       lda     ptr1+1
+       adc     ptr3+1
+       sta     ptr2+1
+
+       txa                     ; restore fill value
        ldx     ptr3+1          ; Get high byte of n
        beq     L2              ; Jump if zero

-; Set 256 byte blocks
-
+; Set 256/512 byte blocks
+       ; y is still 0 here
 L1:    .repeat 2               ; Unroll this a bit to make it faster
-       sta     (ptr1),y        ; Set one byte
-       iny
+       sta     (ptr1),y        ; Set byte in lower section
+       sta     (ptr2),y        ; Set byte in upper section
+       iny
        .endrepeat
        bne     L1
        inc     ptr1+1
+       inc     ptr2+1
        dex                     ; Next 256 byte block
        bne     L1              ; Repeat if any

 ; Set the remaining bytes if any

-L2:    ldx     ptr3            ; Get the low byte of n
-       beq     L9              ; Low byte is zero
-
-L3:    sta     (ptr1),y        ; Set one byte
-       iny
-       dex                     ; Done?
-       bne     L3
-
-L9:    jmp     popax           ; Pop ptr and return as result
+L2:    ldy     ptr3            ; Get the low byte of n
+       bne     L3              ; something to set?
+       jmp     popax           ; no -> Pop ptr and return as result
+
+L3a:   sta     (ptr1),y        ; set bytes in low
+       sta     (ptr2),y        ; and high section
+L3:    dey
+       bne     L3a
+       sta     (ptr1),y        ; Set remaining byte(s)
+       sta     (ptr2),y
+       jmp     popax           ; Pop ptr and return as result
+
-- 
2.39.5
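
Note on the memset change: the gain comes from a second write pointer. The
count is halved (lsr ptr3+1 / ror ptr3); if the carry shows the count was odd,
one byte is stored up front; then ptr2 is set to ptr1 + n/2, so every pass of
the fill loop stores one byte into the lower and one into the upper half of
the region. A C model of the idea (sketch only; fill_split is an invented
name):

    #include <stddef.h>

    static void *fill_split(unsigned char *ptr, int c, size_t n)
    {
        unsigned char *lo = ptr;
        unsigned char *hi;
        size_t half = n >> 1;           /* lsr ptr3+1 / ror ptr3 */
        size_t i;

        if (n & 1) {                    /* carry set: odd count */
            *lo++ = (unsigned char)c;   /* store the odd byte first */
        }
        hi = lo + half;                 /* ptr2 = ptr1 + n/2 */
        for (i = 0; i < half; i++) {
            lo[i] = (unsigned char)c;   /* lower section */
            hi[i] = (unsigned char)c;   /* upper section */
        }
        return ptr;
    }

Halving the count means the loop overhead (iny/bne and the page stepping) is
paid once for two stores instead of one, which, together with the unrolled
page loop, is presumably where the quoted ~20% comes from.
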