They are smaller and faster because they take advantage of the pce CPU's block-copy instructions.
Also, made a small improvement to the common memmove(), so that it is similar to the pce version.
;
-; Ullrich von Bassewitz, 2003-08-20
-; Performance increase (about 20%) by
-; Christian Krueger, 2009-09-13
+; 2003-08-20, Ullrich von Bassewitz
+; 2009-09-13, Christian Krueger -- performance increase (about 20%)
+; 2015-10-23, Greg King
;
; void* __fastcall__ memmove (void* dest, const void* src, size_t size);
;
; low addresses and increase pointers), otherwise we must copy downwards
; (start at high addresses and decrease pointers).
- sec
- sbc ptr1
+ cmp ptr1
txa
sbc ptr1+1
jcc memcpy_upwards ; Branch if dest < src (upwards copy)
; Done, return dest
done: jmp popax ; Pop ptr and return as result
-
--- /dev/null
+;
+; This file, instead of "common/memcpy.s", will be assembled for the pce
+; target. This version is smaller and faster because it uses the HuC6280's
+; block-copy instructions.
+;
+; 2003-08-20, Ullrich von Bassewitz
+; 2015-10-11, Greg King
+;
+; void* __fastcall__ memcpy (void* dest, const void* src, size_t size);
+;
+; NOTE: This function contains entry points for memmove, which will resort
+; to memcpy for an incrementing copy. Don't change this module without looking
+; at "pce/memmove.s"!
+;
+
+ .export _memcpy
+ .export memcpy_increment, memcpy_transfer, memcpy_getparams
+
+ .import popax
+ .importzp sp, ptr1, ptr2, ptr3
+
+
+; The structure of the transfer instructions
+
+ .struct
+opcode .byte
+source .addr
+destination .addr
+length .word
+ .endstruct
+
+; ----------------------------------------------------------------------
+_memcpy:
+ jsr memcpy_getparams ; ptr1 = src, ptr2 = dest, ptr3 = size
+
+; Entry point for an incrementing copy (exported; memmove jumps here).
+
+memcpy_increment:
+ ldy #$73 ; opcode byte of TII
+
+; Entry point with the wanted transfer opcode already in .Y (exported).
+; Patch the opcode and the three 16-bit operands into the instruction
+; template, one byte at a time; then, execute the template.
+
+memcpy_transfer:
+ sty transfer+opcode
+
+ lda ptr3 ; number of bytes
+ sta transfer+length
+ lda ptr3+1
+ sta transfer+length+1
+
+ lda ptr2 ; address to write to
+ sta transfer+destination
+ lda ptr2+1
+ sta transfer+destination+1
+
+ lda ptr1 ; address to read from
+ sta transfer+source
+ lda ptr1+1
+ sta transfer+source+1
+
+ jmp transfer ; run the patched instruction
+
+; ----------------------------------------------------------------------
+; Get the parameters from the stack, as follows:
+;
+; size --> ptr3
+; src --> ptr1
+; dest --> ptr2
+;
+; The first argument (dest) will remain on the stack; and, is returned in .XA!
+;
+; If size is zero, this routine does NOT return to the code that called it.
+; It drops its own return address and both stacked arguments; then, it
+; returns dest directly to the caller of memcpy/memmove. (The HuC6280's
+; transfer instructions can't copy $0000 bytes; they would copy $10000 [64K]
+; bytes instead.) Therefore, this routine must be called by a JSR that is
+; made directly from the C-callable function, with nothing else pushed on
+; the CPU stack.
+
+memcpy_getparams:
+ sta ptr3
+ stx ptr3+1 ; save size
+ ora ptr3+1 ; (low | high) is zero only if size is zero
+ bne @L1
+
+; The size is zero; copy nothing; just return the dest address.
+
+ ply ; drop our own return address
+ plx
+ jsr popax ; drop src
+ jmp popax ; get dest; and, return it as result
+
+@L1: jsr popax
+ sta ptr1
+ stx ptr1+1 ; save src
+
+; (Direct stack access is four cycles faster [total cycle count].)
+
+ ldy #1 ; save dest
+ lda (sp),y ; get high byte
+ tax
+ lda (sp) ; get low byte
+ sta ptr2
+ stx ptr2+1
+ rts ; return dest address (for memmove)
+
+; ----------------------------------------------------------------------
+; The transfer instructions use inline arguments.
+; Therefore, we must build the instruction in the DATA segment.
+
+.data
+
+transfer: ; instruction template; memcpy_transfer overwrites its opcode and operands
+ tii $FFFF, $FFFF, $0001 ; placeholder operands (source, destination, length)
+ jmp popax ; get pointer; and, return it as result
--- /dev/null
+;
+; This file, instead of "common/memmove.s", will be assembled for the pce
+; target. This version is smaller and faster because it uses the HuC6280's
+; block-copy instructions.
+;
+; 2003-08-20, Ullrich von Bassewitz
+; 2015-10-23, Greg King
+;
+; void* __fastcall__ memmove (void* dest, const void* src, size_t size);
+;
+; NOTE: This function uses entry points from "pce/memcpy.s"!
+;
+
+ .export _memmove
+
+ .import memcpy_getparams, memcpy_increment, memcpy_transfer
+ .importzp ptr1, ptr2, ptr3
+
+ .macpack generic
+ .macpack longbranch
+
+
+; ----------------------------------------------------------------------
+_memmove:
+ jsr memcpy_getparams ; ptr1 = src, ptr2 = dest, ptr3 = size; .XA = dest
+
+; Pick the copy direction with a 16-bit unsigned compare of dest (.XA)
+; against src (ptr1). If dest < src, copy incrementing (start at the low
+; addresses, and increase pointers); otherwise, copy decrementing (start at
+; the high addresses, and decrease pointers).
+
+ cmp ptr1 ; low bytes: sets carry for the SBC below
+ txa
+ sbc ptr1+1 ; high bytes: carry is clear if dest < src
+ jcc memcpy_increment
+
+; Copy decrementing; move both pointers to the last byte of their regions
+; (pointer + size - 1).
+
+ clc ; dest += size
+ lda ptr2
+ adc ptr3
+ sta ptr2
+ lda ptr2+1
+ adc ptr3+1
+ sta ptr2+1
+
+ lda ptr2 ; dest -= 1 (last byte of target)
+ bne @dest_lo
+ dec ptr2+1 ; low byte is zero; borrow from high byte
+@dest_lo: dec ptr2
+
+ clc ; src += size
+ lda ptr1
+ adc ptr3
+ sta ptr1
+ lda ptr1+1
+ adc ptr3+1
+ sta ptr1+1
+
+ lda ptr1 ; src -= 1 (last byte of source)
+ bne @src_lo
+ dec ptr1+1 ; low byte is zero; borrow from high byte
+@src_lo: dec ptr1
+
+ ldy #$C3 ; opcode byte of TDD
+ jmp memcpy_transfer