;
; Ullrich von Bassewitz, 2003-08-20
+; Performance increase (about 20%) by
+; Christian Krueger, 2009-09-13
;
; void* __fastcall__ memcpy (void* dest, const void* src, size_t n);
;
.export _memcpy, memcpy_upwards, memcpy_getparams
.import popax
- .importzp ptr1, ptr2, ptr3, tmp1
+ .importzp sp, ptr1, ptr2, ptr3
; ----------------------------------------------------------------------
; Entry point. The patched (+) version fetches the arguments through
; memcpy_getparams (n -> ptr3, src -> ptr1, dest -> ptr2, Y = 0), copies
; strictly from low to high addresses and leaves via popax at 'done'.
_memcpy:
jsr memcpy_getparams
-memcpy_upwards:
- ldy #0
- ldx ptr3 ; Get low counter byte
+memcpy_upwards: ; assert Y = 0
+ ldx ptr3+1 ; Get high byte of n (number of full 256 byte blocks)
+ beq L2 ; Jump if zero: no full block to copy
-; Copy loop
+L1: .repeat 2 ; Unroll this a bit to make it faster...
+ lda (ptr1),Y ; copy a byte
+ sta (ptr2),Y
+ iny
+ .endrepeat
+ bne L1 ; Loop until Y wraps to 0 (one 256 byte block copied)
+ inc ptr1+1 ; Advance both pointers to the next page
+ inc ptr2+1
+ dex ; Next 256 byte block
+ bne L1 ; Repeat if any
-@L1: inx ; Bump low counter byte
- beq @L3 ; Jump on overflow
-@L2: lda (ptr1),y
- sta (ptr2),y
- iny
- bne @L1
- inc ptr1+1 ; Bump pointers
- inc ptr2+1
- bne @L1 ; Branch always
-@L3: inc ptr3+1 ; Bump high counter byte
- bne @L2
+ ; the following section could be 10% faster if we were able to copy
+ ; back to front - unfortunately we are forced to copy strictly from
+ ; low to high since this function is also used for
+ ; memmove and blocks could be overlapping!
+ ; {
+L2: ; assert Y = 0
+ ldx ptr3 ; Get the low byte of n
+ beq done ; Nothing left to copy -> done
-; Done. The low byte of dest is still in ptr2
+L3: lda (ptr1),Y ; copy a byte
+ sta (ptr2),Y
+ iny
+ dex ; One byte less to go
+ bne L3
-done: lda ptr2
- ldx tmp1 ; get function result (dest)
- rts
+ ; }
+
+done: jmp popax ; Pop ptr and return as result
; ----------------------------------------------------------------------
; Get the parameters from stack as follows:
;
-; -(size-1) --> ptr3
+; size --> ptr3
; src --> ptr1
; dest --> ptr2
-; high(dest) --> tmp1
-;
-; dest is returned in a/x.
-
-memcpy_getparams:
- eor #$FF
- sta ptr3
- txa
- eor #$FF
- sta ptr3+1 ; Save -(size-1)
-
- jsr popax ; src
- sta ptr1
- stx ptr1+1
+; First argument (dest) will remain on stack and is returned in a/x!
- jsr popax ; dest
- sta ptr2
- stx ptr2+1 ; Save work copy
- stx tmp1 ; Save for function result
+memcpy_getparams: ; IMPORTANT! Function has to leave with Y=0!
+ sta ptr3 ; n arrives in a/x
+ stx ptr3+1 ; save n to ptr3
- rts
+ jsr popax ; pop src into a/x
+ sta ptr1
+ stx ptr1+1 ; save src to ptr1
+ ; save dest to ptr2
+ ldy #1 ; (direct stack access is three cycles faster
+ ; (total cycle count with return))
+ lda (sp),y ; high byte of dest, read in place (not popped)
+ tax ; keep it in X so that a/x = dest on return
+ stx ptr2+1 ; save high byte of ptr2
+ dey ; Y = 0
+ lda (sp),y ; Get ptr2 low
+ sta ptr2
+ rts ; leaves with a/x = dest and Y = 0
;
; Ullrich von Bassewitz, 2003-08-20
+; Performance increase (about 20%) by
+; Christian Krueger, 2009-09-13
;
; void* __fastcall__ memmove (void* dest, const void* src, size_t size);
;
;
.export _memmove
- .import memcpy_getparams, memcpy_upwards
+ .import memcpy_getparams, memcpy_upwards, popax
.importzp ptr1, ptr2, ptr3, ptr4, tmp1
.macpack generic
; ----------------------------------------------------------------------
; The patched (+) lines below implement the downward copy: first the
; (n mod 256) bytes above the last full page offset, then whole pages with
; X counting the pages. The result (dest) is popped from the stack at 'done'.
_memmove:
- sta ptr4
- stx ptr4+1 ; Size -> ptr4
-
jsr memcpy_getparams
; Check for the copy direction. If dest < src, we must copy upwards (start at
; Copy downwards. Adjust the pointers to the end of the memory regions.
lda ptr1+1 ; src pointer: add the high byte of n to its high byte
- add ptr4+1
+ add ptr3+1
sta ptr1+1
lda ptr2+1 ; dest pointer: same adjustment
- add ptr4+1
+ add ptr3+1
sta ptr2+1
-; Load the low offset into Y, and the counter low byte into X.
-
- ldy ptr4
- ldx ptr3
- jmp @L2
-
-; Copy loop
-
-@L1: dey
+; handle fractions of a page size first
+
+ ldy ptr3 ; count, low byte
+ bne @entry ; something to copy?
+ beq PageSizeCopy ; always taken here (Z is set), acts like bra
+
+@copyByte:
+ lda (ptr1),y
+ sta (ptr2),y
+@entry:
+ dey
+ bne @copyByte ; copy offsets (low byte of n)-1 down to 1
+ lda (ptr1),y ; Y = 0: copy remaining byte
+ sta (ptr2),y
+
+PageSizeCopy: ; assert Y = 0
+ ldx ptr3+1 ; number of pages
+ beq done ; none? -> done
+
+@initBase:
+ dec ptr1+1 ; adjust base...
+ dec ptr2+1
+ dey ; in entry case: 0 -> FF
+ lda (ptr1),y ; need to copy this 'intro byte'
+ sta (ptr2),y ; to 'land' later on Y=0! (as a result of the '.repeat'-block!)
+ dey ; FF -> FE
+@copyBytes:
+ .repeat 2 ; Unroll this a bit to make it faster...
lda (ptr1),y
sta (ptr2),y
-
-@L2: inx ; Bump counter low byte
- bne @L1
- dec ptr1+1
- dec ptr2+1
- inc ptr3+1 ; Bump counter high byte
- bne @L1
+ dey
+ .endrepeat
+@copyEntry: ; loop until Y reaches 0 (label is not referenced in the visible code)
+ bne @copyBytes
+ lda (ptr1),y ; Y = 0, copy last byte
+ sta (ptr2),y
+ dex ; one page to copy less
+ bne @initBase ; still a page to copy?
; Done, return dest
-done: lda ptr2
- ldx tmp1 ; get function result (dest)
- rts
-
+done: jmp popax ; Pop ptr and return as result
+
;
-; void* memset (void* ptr, int c, size_t n);
-; void* _bzero (void* ptr, size_t n);
-; void bzero (void* ptr, size_t n);
+; void* __fastcall__ memset (void* ptr, int c, size_t n);
+; void* __fastcall__ _bzero (void* ptr, size_t n);
+; void __fastcall__ bzero (void* ptr, size_t n);
;
; Ullrich von Bassewitz, 29.05.1998
+; Performance increase (about 20%) by
+; Christian Krueger, 12.09.2009
;
; NOTE: bzero will return its first argument as memset does. It is no problem
; to declare the return value as void, since it may be ignored. _bzero
.export _memset, _bzero, __bzero
.import popax
- .importzp sp, ptr1, ptr2, ptr3, tmp1
+ .importzp sp, ptr1, ptr2, ptr3
_bzero:
__bzero:
sta ptr3
stx ptr3+1 ; Save n
- lda #0 ; Fill with zeros
+ ldx #0 ; Fill with zeros
beq common ; Always taken: loading zero has set the Z flag
-
+
_memset:
sta ptr3 ; Save n
stx ptr3+1
jsr popax ; Get c
+ tax ; Keep the fill value in X
; Common stuff for memset and bzero from here
-common: sta tmp1 ; Save the fill value
- ldy #1
+common: ; Fill value is in X!
+ ldy #1
lda (sp),y ; High byte of ptr, read from the stack without popping
- tax
- dey
+ sta ptr1+1 ; save high byte of ptr
+ dey ; Y = 0
lda (sp),y ; Get ptr
sta ptr1
- stx ptr1+1 ; Save work copy
- lda tmp1 ; Load fill value
- ldy #0
+ lsr ptr3+1 ; divide number of
+ ror ptr3 ; bytes by two to increase
+ bcc evenCount ; speed (ptr3 = ptr3/2), carry = odd count
+oddCount:
+ ; y is still 0 here
+ txa ; restore fill value
+ sta (ptr1),y ; save value and increase
+ inc ptr1 ; dest. pointer
+ bne evenCount
+ inc ptr1+1 ; low byte wrapped -> bump high byte
+evenCount:
+ lda ptr1 ; build second pointer section
+ clc
+ adc ptr3 ; ptr2 = ptr1 + (length/2) <- ptr3
+ sta ptr2
+ lda ptr1+1
+ adc ptr3+1
+ sta ptr2+1
+
+ txa ; restore fill value
ldx ptr3+1 ; Get high byte of the count in ptr3
beq L2 ; Jump if zero
-; Set 256 byte blocks
-
+; Set 256/512 byte blocks
+ ; y is still 0 here
L1: .repeat 2 ; Unroll this a bit to make it faster
- sta (ptr1),y ; Set one byte
- iny
+ sta (ptr1),y ; Set byte in lower section
+ sta (ptr2),y ; Set byte in upper section
+ iny
.endrepeat
bne L1 ; Loop until Y wraps to 0
inc ptr1+1
+ inc ptr2+1
dex ; Next 256 byte block
bne L1 ; Repeat if any
; Set the remaining bytes if any
-L2: ldx ptr3 ; Get the low byte of n
- beq L9 ; Low byte is zero
-
-L3: sta (ptr1),y ; Set one byte
- iny
- dex ; Done?
- bne L3
-
-L9: jmp popax ; Pop ptr and return as result
+L2: ldy ptr3 ; Get the low byte of n/2
+ bne L3 ; something to set?
+ jmp popax ; no -> Pop ptr and return as result
+L3a: sta (ptr1),y ; set bytes in low
+ sta (ptr2),y ; and high section
+L3: dey
+ bne L3a
+ sta (ptr1),y ; Y = 0: set the last byte of each section
+ sta (ptr2),y
+ jmp popax ; Pop ptr and return as result
+