From: Alexey Brodkin
Date: Wed, 24 Jan 2018 20:22:33 +0000 (+0300)
Subject: arc: Get rid of handwritten string routines
X-Git-Tag: v2018.05-rc1~10^2~25
X-Git-Url: https://git.sur5r.net/?a=commitdiff_plain;h=2178817c4a105ead044e054bf902f256e6d589c2;p=u-boot

arc: Get rid of handwritten string routines

U-Boot is a bit of a special piece of software because, unlike an
operating system, it is executed only once on power-on. That's why we
don't care much about performance optimizations; instead we're more
concerned about size. And up-to-date compilers may produce much smaller
code than the performance-optimized routines copy-pasted from the Linux
kernel.

Here's an example:
------------------------------->8--------------------------
--- size_asm_strings.txt
+++ size_c_strings.txt
@@ -1,2 +1,2 @@
    text	   data	    bss	    dec	    hex	filename
- 121260	   3784	   3308	 128352	  1f560	u-boot
+ 120448	   3784	   3308	 127540	  1f234	u-boot
------------------------------->8--------------------------

See, we were able to shave ~800 bytes off the .text section.

Also, using string routines implemented in C gives us the ability to
support more HW flavors for free: the generated instructions will match
our target as long as the correct compiler options are used.

Signed-off-by: Alexey Brodkin
---

diff --git a/arch/arc/include/asm/string.h b/arch/arc/include/asm/string.h
index 909129c333..8b13789179 100644
--- a/arch/arc/include/asm/string.h
+++ b/arch/arc/include/asm/string.h
@@ -1,27 +1 @@
-/*
- * Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. All rights reserved.
- *
- * SPDX-License-Identifier:	GPL-2.0+
- */
-#ifndef __ASM_ARC_STRING_H
-#define __ASM_ARC_STRING_H
-
-#define __HAVE_ARCH_MEMSET
-#define __HAVE_ARCH_MEMCPY
-#define __HAVE_ARCH_MEMCMP
-#define __HAVE_ARCH_STRCHR
-#define __HAVE_ARCH_STRCPY
-#define __HAVE_ARCH_STRCMP
-#define __HAVE_ARCH_STRLEN
-
-extern void *memset(void *ptr, int, __kernel_size_t);
-extern void *memcpy(void *, const void *, __kernel_size_t);
-extern void memzero(void *ptr, __kernel_size_t n);
-extern int memcmp(const void *, const void *, __kernel_size_t);
-extern char *strchr(const char *s, int c);
-extern char *strcpy(char *dest, const char *src);
-extern int strcmp(const char *cs, const char *ct);
-extern __kernel_size_t strlen(const char *);
-
-#endif /* __ASM_ARC_STRING_H */
+
diff --git a/arch/arc/lib/Makefile b/arch/arc/lib/Makefile
index 12097bf3be..6b7fb0fdff 100644
--- a/arch/arc/lib/Makefile
+++ b/arch/arc/lib/Makefile
@@ -10,13 +10,6 @@ obj-y += cache.o
 obj-y += cpu.o
 obj-y += interrupts.o
 obj-y += relocate.o
-obj-y += strchr-700.o
-obj-y += strcmp.o
-obj-y += strcpy-700.o
-obj-y += strlen.o
-obj-y += memcmp.o
-obj-y += memcpy-700.o
-obj-y += memset.o
 obj-y += reset.o
 obj-y += ints_low.o
 obj-y += init_helpers.o
diff --git a/arch/arc/lib/memcmp.S b/arch/arc/lib/memcmp.S
deleted file mode 100644
index 87bccab51d..0000000000
--- a/arch/arc/lib/memcmp.S
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * Copyright (C) 2004, 2007-2010, 2011-2014 Synopsys, Inc. All rights reserved.
- * - * SPDX-License-Identifier: GPL-2.0+ - */ - -#ifdef __LITTLE_ENDIAN__ -#define WORD2 r2 -#define SHIFT r3 -#else /* __BIG_ENDIAN__ */ -#define WORD2 r3 -#define SHIFT r2 -#endif /* _ENDIAN__ */ - -.global memcmp -.align 4 -memcmp: - or %r12, %r0, %r1 - asl_s %r12, %r12, 30 - sub %r3, %r2, 1 - brls %r2, %r12, .Lbytewise - ld %r4, [%r0, 0] - ld %r5, [%r1, 0] - lsr.f %lp_count, %r3, 3 - lpne .Loop_end - ld_s WORD2, [%r0, 4] - ld_s %r12, [%r1, 4] - brne %r4, %r5, .Leven - ld.a %r4, [%r0, 8] - ld.a %r5, [%r1, 8] - brne WORD2, %r12, .Lodd - nop -.Loop_end: - asl_s SHIFT, SHIFT, 3 - bhs_s .Last_cmp - brne %r4, %r5, .Leven - ld %r4, [%r0, 4] - ld %r5, [%r1, 4] -#ifdef __LITTLE_ENDIAN__ - nop_s - /* one more load latency cycle */ -.Last_cmp: - xor %r0, %r4, %r5 - bset %r0, %r0, SHIFT - sub_s %r1, %r0, 1 - bic_s %r1, %r1, %r0 - norm %r1, %r1 - b.d .Leven_cmp - and %r1, %r1, 24 -.Leven: - xor %r0, %r4, %r5 - sub_s %r1, %r0, 1 - bic_s %r1, %r1, %r0 - norm %r1, %r1 - /* slow track insn */ - and %r1, %r1, 24 -.Leven_cmp: - asl %r2, %r4, %r1 - asl %r12, %r5, %r1 - lsr_s %r2, %r2, 1 - lsr_s %r12, %r12, 1 - j_s.d [%blink] - sub %r0, %r2, %r12 - .balign 4 -.Lodd: - xor %r0, WORD2, %r12 - sub_s %r1, %r0, 1 - bic_s %r1, %r1, %r0 - norm %r1, %r1 - /* slow track insn */ - and %r1, %r1, 24 - asl_s %r2, %r2, %r1 - asl_s %r12, %r12, %r1 - lsr_s %r2, %r2, 1 - lsr_s %r12, %r12, 1 - j_s.d [%blink] - sub %r0, %r2, %r12 -#else /* __BIG_ENDIAN__ */ -.Last_cmp: - neg_s SHIFT, SHIFT - lsr %r4, %r4, SHIFT - lsr %r5, %r5, SHIFT - /* slow track insn */ -.Leven: - sub.f %r0, %r4, %r5 - mov.ne %r0, 1 - j_s.d [%blink] - bset.cs %r0, %r0, 31 -.Lodd: - cmp_s WORD2, %r12 - - mov_s %r0, 1 - j_s.d [%blink] - bset.cs %r0, %r0, 31 -#endif /* _ENDIAN__ */ - .balign 4 -.Lbytewise: - breq %r2, 0, .Lnil - ldb %r4, [%r0, 0] - ldb %r5, [%r1, 0] - lsr.f %lp_count, %r3 - lpne .Lbyte_end - ldb_s %r3, [%r0, 1] - ldb %r12, [%r1, 1] - brne %r4, %r5, .Lbyte_even - ldb.a %r4, [%r0, 2] - ldb.a %r5, [%r1, 2] - brne %r3, %r12, .Lbyte_odd - nop -.Lbyte_end: - bcc .Lbyte_even - brne %r4, %r5, .Lbyte_even - ldb_s %r3, [%r0, 1] - ldb_s %r12, [%r1, 1] -.Lbyte_odd: - j_s.d [%blink] - sub %r0, %r3, %r12 -.Lbyte_even: - j_s.d [%blink] - sub %r0, %r4, %r5 -.Lnil: - j_s.d [%blink] - mov %r0, 0 diff --git a/arch/arc/lib/memcpy-700.S b/arch/arc/lib/memcpy-700.S deleted file mode 100644 index 51dd73ab8f..0000000000 --- a/arch/arc/lib/memcpy-700.S +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (C) 2004, 2007-2010, 2011-2014 Synopsys, Inc. All rights reserved. 
- * - * SPDX-License-Identifier: GPL-2.0+ - */ - -.global memcpy -.align 4 -memcpy: - or %r3, %r0, %r1 - asl_s %r3, %r3, 30 - mov_s %r5, %r0 - brls.d %r2, %r3, .Lcopy_bytewise - sub.f %r3, %r2, 1 - ld_s %r12, [%r1, 0] - asr.f %lp_count, %r3, 3 - bbit0.d %r3, 2, .Lnox4 - bmsk_s %r2, %r2, 1 - st.ab %r12, [%r5, 4] - ld.a %r12, [%r1, 4] -.Lnox4: - lppnz .Lendloop - ld_s %r3, [%r1, 4] - st.ab %r12, [%r5, 4] - ld.a %r12, [%r1, 8] - st.ab %r3, [%r5, 4] -.Lendloop: - breq %r2, 0, .Last_store - ld %r3, [%r5, 0] -#ifdef __LITTLE_ENDIAN__ - add3 %r2, -1, %r2 - /* uses long immediate */ - xor_s %r12, %r12, %r3 - bmsk %r12, %r12, %r2 - xor_s %r12, %r12, %r3 -#else /* __BIG_ENDIAN__ */ - sub3 %r2, 31, %r2 - /* uses long immediate */ - xor_s %r3, %r3, %r12 - bmsk %r3, %r3, %r2 - xor_s %r12, %r12, %r3 -#endif /* _ENDIAN__ */ -.Last_store: - j_s.d [%blink] - st %r12, [%r5, 0] - - .balign 4 -.Lcopy_bytewise: - jcs [%blink] - ldb_s %r12, [%r1, 0] - lsr.f %lp_count, %r3 - bhs_s .Lnox1 - stb.ab %r12, [%r5, 1] - ldb.a %r12, [%r1, 1] -.Lnox1: - lppnz .Lendbloop - ldb_s %r3, [%r1, 1] - stb.ab %r12, [%r5, 1] - ldb.a %r12, [%r1, 2] - stb.ab %r3, [%r5, 1] -.Lendbloop: - j_s.d [%blink] - stb %r12, [%r5, 0] diff --git a/arch/arc/lib/memset.S b/arch/arc/lib/memset.S deleted file mode 100644 index 017e8af0e8..0000000000 --- a/arch/arc/lib/memset.S +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (C) 2004, 2007-2010, 2011-2014 Synopsys, Inc. All rights reserved. - * - * SPDX-License-Identifier: GPL-2.0+ - */ - -#define SMALL 7 /* Must be at least 6 to deal with alignment/loop issues. */ - -.global memset -.align 4 -memset: - mov_s %r4, %r0 - or %r12, %r0, %r2 - bmsk.f %r12, %r12, 1 - extb_s %r1, %r1 - asl %r3, %r1, 8 - beq.d .Laligned - or_s %r1, %r1, %r3 - brls %r2, SMALL, .Ltiny - add %r3, %r2, %r0 - stb %r1, [%r3, -1] - bclr_s %r3, %r3, 0 - stw %r1, [%r3, -2] - bmsk.f %r12, %r0, 1 - add_s %r2, %r2, %r12 - sub.ne %r2, %r2, 4 - stb.ab %r1, [%r4, 1] - and %r4, %r4, -2 - stw.ab %r1, [%r4, 2] - and %r4, %r4, -4 - - .balign 4 -.Laligned: - asl %r3, %r1, 16 - lsr.f %lp_count, %r2, 2 - or_s %r1, %r1, %r3 - lpne .Loop_end - st.ab %r1, [%r4, 4] -.Loop_end: - j_s [%blink] - - .balign 4 -.Ltiny: - mov.f %lp_count, %r2 - lpne .Ltiny_end - stb.ab %r1, [%r4, 1] -.Ltiny_end: - j_s [%blink] - -/* - * memzero: @r0 = mem, @r1 = size_t - * memset: @r0 = mem, @r1 = char, @r2 = size_t - */ - -.global memzero -.align 4 -memzero: - /* adjust bzero args to memset args */ - mov %r2, %r1 - mov %r1, 0 - /* tail call so need to tinker with blink */ - b memset diff --git a/arch/arc/lib/strchr-700.S b/arch/arc/lib/strchr-700.S deleted file mode 100644 index 55fcc9fb00..0000000000 --- a/arch/arc/lib/strchr-700.S +++ /dev/null @@ -1,141 +0,0 @@ -/* - * Copyright (C) 2004, 2007-2010, 2011-2014 Synopsys, Inc. All rights reserved. - * - * SPDX-License-Identifier: GPL-2.0+ - */ - -/* - * ARC700 has a relatively long pipeline and branch prediction, so we want - * to avoid branches that are hard to predict. On the other hand, the - * presence of the norm instruction makes it easier to operate on whole - * words branch-free. 
- */ - -.global strchr -.align 4 -strchr: - extb_s %r1, %r1 - asl %r5, %r1, 8 - bmsk %r2, %r0, 1 - or %r5, %r5, %r1 - mov_s %r3, 0x01010101 - breq.d %r2, %r0, .Laligned - asl %r4, %r5, 16 - sub_s %r0, %r0, %r2 - asl %r7, %r2, 3 - ld_s %r2, [%r0] -#ifdef __LITTLE_ENDIAN__ - asl %r7, %r3, %r7 -#else /* __BIG_ENDIAN__ */ - lsr %r7, %r3, %r7 -#endif /* _ENDIAN__ */ - or %r5, %r5, %r4 - ror %r4, %r3 - sub %r12, %r2, %r7 - bic_s %r12, %r12, %r2 - and %r12, %r12, %r4 - brne.d %r12, 0, .Lfound0_ua - xor %r6, %r2, %r5 - ld.a %r2, [%r0, 4] - sub %r12, %r6, %r7 - bic %r12, %r12, %r6 -#ifdef __LITTLE_ENDIAN__ - and %r7, %r12, %r4 - /* For speed, we want this branch to be unaligned. */ - breq %r7, 0, .Loop - /* Likewise this one */ - b .Lfound_char -#else /* __BIG_ENDIAN__ */ - and %r12, %r12, %r4 - /* For speed, we want this branch to be unaligned. */ - breq %r12, 0, .Loop - lsr_s %r12, %r12, 7 - bic %r2, %r7, %r6 - b.d .Lfound_char_b - and_s %r2, %r2, %r12 -#endif /* _ENDIAN__ */ - /* We require this code address to be unaligned for speed... */ -.Laligned: - ld_s %r2, [%r0] - or %r5, %r5, %r4 - ror %r4, %r3 - /* ... so that this code address is aligned, for itself and ... */ -.Loop: - sub %r12, %r2, %r3 - bic_s %r12, %r12, %r2 - and %r12, %r12, %r4 - brne.d %r12, 0, .Lfound0 - xor %r6, %r2, %r5 - ld.a %r2, [%r0, 4] - sub %r12, %r6, %r3 - bic %r12, %r12, %r6 - and %r7, %r12, %r4 - breq %r7, 0, .Loop - /* - *... so that this branch is unaligned. - * Found searched-for character. - * r0 has already advanced to next word. - */ -#ifdef __LITTLE_ENDIAN__ - /* - * We only need the information about the first matching byte - * (i.e. the least significant matching byte) to be exact, - * hence there is no problem with carry effects. - */ -.Lfound_char: - sub %r3, %r7, 1 - bic %r3, %r3, %r7 - norm %r2, %r3 - sub_s %r0, %r0, 1 - asr_s %r2, %r2, 3 - j.d [%blink] - sub_s %r0, %r0, %r2 - - .balign 4 -.Lfound0_ua: - mov %r3, %r7 -.Lfound0: - sub %r3, %r6, %r3 - bic %r3, %r3, %r6 - and %r2, %r3, %r4 - or_s %r12, %r12, %r2 - sub_s %r3, %r12, 1 - bic_s %r3, %r3, %r12 - norm %r3, %r3 - add_s %r0, %r0, 3 - asr_s %r12, %r3, 3 - asl.f 0, %r2, %r3 - sub_s %r0, %r0, %r12 - j_s.d [%blink] - mov.pl %r0, 0 -#else /* __BIG_ENDIAN__ */ -.Lfound_char: - lsr %r7, %r7, 7 - - bic %r2, %r7, %r6 -.Lfound_char_b: - norm %r2, %r2 - sub_s %r0, %r0, 4 - asr_s %r2, %r2, 3 - j.d [%blink] - add_s %r0, %r0, %r2 - -.Lfound0_ua: - mov_s %r3, %r7 -.Lfound0: - asl_s %r2, %r2, 7 - or %r7, %r6, %r4 - bic_s %r12, %r12, %r2 - sub %r2, %r7, %r3 - or %r2, %r2, %r6 - bic %r12, %r2, %r12 - bic.f %r3, %r4, %r12 - norm %r3, %r3 - - add.pl %r3, %r3, 1 - asr_s %r12, %r3, 3 - asl.f 0, %r2, %r3 - add_s %r0, %r0, %r12 - j_s.d [%blink] - mov.mi %r0, 0 -#endif /* _ENDIAN__ */ diff --git a/arch/arc/lib/strcmp.S b/arch/arc/lib/strcmp.S deleted file mode 100644 index 8cb7d2f18c..0000000000 --- a/arch/arc/lib/strcmp.S +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright (C) 2004, 2007-2010, 2011-2014 Synopsys, Inc. All rights reserved. - * - * SPDX-License-Identifier: GPL-2.0+ - */ - -/* - * This is optimized primarily for the ARC700. - * It would be possible to speed up the loops by one cycle / word - * respective one cycle / byte by forcing double source 1 alignment, unrolling - * by a factor of two, and speculatively loading the second word / byte of - * source 1; however, that would increase the overhead for loop setup / finish, - * and strcmp might often terminate early. 
- */ - -.global strcmp -.align 4 -strcmp: - or %r2, %r0, %r1 - bmsk_s %r2, %r2, 1 - brne %r2, 0, .Lcharloop - mov_s %r12, 0x01010101 - ror %r5, %r12 -.Lwordloop: - ld.ab %r2, [%r0, 4] - ld.ab %r3, [%r1, 4] - nop_s - sub %r4, %r2, %r12 - bic %r4, %r4, %r2 - and %r4, %r4, %r5 - brne %r4, 0, .Lfound0 - breq %r2 ,%r3, .Lwordloop -#ifdef __LITTLE_ENDIAN__ - xor %r0, %r2, %r3 /* mask for difference */ - sub_s %r1, %r0, 1 - bic_s %r0, %r0, %r1 /* mask for least significant difference bit */ - sub %r1, %r5, %r0 - xor %r0, %r5, %r1 /* mask for least significant difference byte */ - and_s %r2, %r2, %r0 - and_s %r3, %r3, %r0 -#endif /* _ENDIAN__ */ - cmp_s %r2, %r3 - mov_s %r0, 1 - j_s.d [%blink] - bset.lo %r0, %r0, 31 - - .balign 4 -#ifdef __LITTLE_ENDIAN__ -.Lfound0: - xor %r0, %r2, %r3 /* mask for difference */ - or %r0, %r0, %r4 /* or in zero indicator */ - sub_s %r1, %r0, 1 - bic_s %r0, %r0, %r1 /* mask for least significant difference bit */ - sub %r1, %r5, %r0 - xor %r0, %r5, %r1 /* mask for least significant difference byte */ - and_s %r2, %r2, %r0 - and_s %r3, %r3, %r0 - sub.f %r0, %r2, %r3 - mov.hi %r0, 1 - j_s.d [%blink] - bset.lo %r0, %r0, 31 -#else /* __BIG_ENDIAN__ */ - /* - * The zero-detection above can mis-detect 0x01 bytes as zeroes - * because of carry-propagateion from a lower significant zero byte. - * We can compensate for this by checking that bit0 is zero. - * This compensation is not necessary in the step where we - * get a low estimate for r2, because in any affected bytes - * we already have 0x00 or 0x01, which will remain unchanged - * when bit 7 is cleared. - */ - .balign 4 -.Lfound0: - lsr %r0, %r4, 8 - lsr_s %r1, %r2 - bic_s %r2, %r2, %r0 /* get low estimate for r2 and get ... */ - bic_s %r0, %r0, %r1 /* */ - or_s %r3, %r3, %r0 /* ... high estimate r3 so that r2 > r3 will */ - cmp_s %r3, %r2 /* ... be independent of trailing garbage */ - or_s %r2, %r2, %r0 /* likewise for r3 > r2 */ - bic_s %r3, %r3, %r0 - rlc %r0, 0 /* r0 := r2 > r3 ? 1 : 0 */ - cmp_s %r2, %r3 - j_s.d [%blink] - bset.lo %r0, %r0, 31 -#endif /* _ENDIAN__ */ - - .balign 4 -.Lcharloop: - ldb.ab %r2,[%r0,1] - ldb.ab %r3,[%r1,1] - nop_s - breq %r2, 0, .Lcmpend - breq %r2, %r3, .Lcharloop -.Lcmpend: - j_s.d [%blink] - sub %r0, %r2, %r3 diff --git a/arch/arc/lib/strcpy-700.S b/arch/arc/lib/strcpy-700.S deleted file mode 100644 index 41bb53e501..0000000000 --- a/arch/arc/lib/strcpy-700.S +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (C) 2004, 2007-2010, 2011-2014 Synopsys, Inc. All rights reserved. - * - * SPDX-License-Identifier: GPL-2.0+ - */ - -/* - * If dst and src are 4 byte aligned, copy 8 bytes at a time. - * If the src is 4, but not 8 byte aligned, we first read 4 bytes to get - * it 8 byte aligned. Thus, we can do a little read-ahead, without - * dereferencing a cache line that we should not touch. - * Note that short and long instructions have been scheduled to avoid - * branch stalls. - * The beq_s to r3z could be made unaligned & long to avoid a stall - * there, but it is not likely to be taken often, and it would also be likely - * to cost an unaligned mispredict at the next call. 
- */ - -.global strcpy -.align 4 -strcpy: - or %r2, %r0, %r1 - bmsk_s %r2, %r2, 1 - brne.d %r2, 0, charloop - mov_s %r10, %r0 - ld_s %r3, [%r1, 0] - mov %r8, 0x01010101 - bbit0.d %r1, 2, loop_start - ror %r12, %r8 - sub %r2, %r3, %r8 - bic_s %r2, %r2, %r3 - tst_s %r2,%r12 - bne r3z - mov_s %r4,%r3 - .balign 4 -loop: - ld.a %r3, [%r1, 4] - st.ab %r4, [%r10, 4] -loop_start: - ld.a %r4, [%r1, 4] - sub %r2, %r3, %r8 - bic_s %r2, %r2, %r3 - tst_s %r2, %r12 - bne_s r3z - st.ab %r3, [%r10, 4] - sub %r2, %r4, %r8 - bic %r2, %r2, %r4 - tst %r2, %r12 - beq loop - mov_s %r3, %r4 -#ifdef __LITTLE_ENDIAN__ -r3z: bmsk.f %r1, %r3, 7 - lsr_s %r3, %r3, 8 -#else /* __BIG_ENDIAN__ */ -r3z: lsr.f %r1, %r3, 24 - asl_s %r3, %r3, 8 -#endif /* _ENDIAN__ */ - bne.d r3z - stb.ab %r1, [%r10, 1] - j_s [%blink] - - .balign 4 -charloop: - ldb.ab %r3, [%r1, 1] - brne.d %r3, 0, charloop - stb.ab %r3, [%r10, 1] - j [%blink] diff --git a/arch/arc/lib/strlen.S b/arch/arc/lib/strlen.S deleted file mode 100644 index 666e22c0d5..0000000000 --- a/arch/arc/lib/strlen.S +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright (C) 2004, 2007-2010, 2011-2014 Synopsys, Inc. All rights reserved. - * - * SPDX-License-Identifier: GPL-2.0+ - */ - -.global strlen -.align 4 -strlen: - or %r3, %r0, 7 - ld %r2, [%r3, -7] - ld.a %r6, [%r3, -3] - mov %r4, 0x01010101 - /* uses long immediate */ -#ifdef __LITTLE_ENDIAN__ - asl_s %r1, %r0, 3 - btst_s %r0, 2 - asl %r7, %r4, %r1 - ror %r5, %r4 - sub %r1, %r2, %r7 - bic_s %r1, %r1, %r2 - mov.eq %r7, %r4 - sub %r12, %r6, %r7 - bic %r12, %r12, %r6 - or.eq %r12, %r12, %r1 - and %r12, %r12, %r5 - brne %r12, 0, .Learly_end -#else /* __BIG_ENDIAN__ */ - ror %r5, %r4 - btst_s %r0, 2 - mov_s %r1, 31 - sub3 %r7, %r1, %r0 - sub %r1, %r2, %r4 - bic_s %r1, %r1, %r2 - bmsk %r1, %r1, %r7 - sub %r12, %r6, %r4 - bic %r12, %r12, %r6 - bmsk.ne %r12, %r12, %r7 - or.eq %r12, %r12, %r1 - and %r12, %r12, %r5 - brne %r12, 0, .Learly_end -#endif /* _ENDIAN__ */ - -.Loop: - ld_s %r2, [%r3, 4] - ld.a %r6, [%r3, 8] - /* stall for load result */ - sub %r1, %r2, %r4 - bic_s %r1, %r1, %r2 - sub %r12, %r6, %r4 - bic %r12, %r12, %r6 - or %r12, %r12, %r1 - and %r12, %r12, %r5 - breq %r12, 0, .Loop -.Lend: - and.f %r1, %r1, %r5 - sub.ne %r3, %r3, 4 - mov.eq %r1, %r12 -#ifdef __LITTLE_ENDIAN__ - sub_s %r2, %r1, 1 - bic_s %r2, %r2, %r1 - norm %r1, %r2 - sub_s %r0, %r0, 3 - lsr_s %r1, %r1, 3 - sub %r0, %r3, %r0 - j_s.d [%blink] - sub %r0, %r0, %r1 -#else /* __BIG_ENDIAN__ */ - lsr_s %r1, %r1, 7 - mov.eq %r2, %r6 - bic_s %r1, %r1, %r2 - norm %r1, %r1 - sub %r0, %r3, %r0 - lsr_s %r1, %r1, 3 - j_s.d [%blink] - add %r0, %r0, %r1 -#endif /* _ENDIAN */ -.Learly_end: - b.d .Lend - sub_s.ne %r1, %r1, %r1
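
---

For reference, the generic C routines the build now falls back to look
roughly like the sketch below. This is only an illustrative
approximation of U-Boot's generic implementations (see lib/string.c in
the U-Boot tree), not the exact code:

------------------------------->8--------------------------
#include <stddef.h>

/*
 * Illustrative sketch only -- not the exact lib/string.c code.
 * The compiler turns these plain byte loops into instructions that
 * match whatever CPU is selected (e.g. -mcpu=arc700 or -mcpu=archs),
 * which is what lets a single C implementation cover all ARC flavors.
 */
void *memset(void *s, int c, size_t n)
{
	unsigned char *p = s;

	while (n--)
		*p++ = (unsigned char)c;

	return s;
}

int memcmp(const void *cs, const void *ct, size_t count)
{
	const unsigned char *s1 = cs, *s2 = ct;

	for (; count; count--, s1++, s2++)
		if (*s1 != *s2)
			return *s1 - *s2;

	return 0;
}
------------------------------->8--------------------------

Since no __HAVE_ARCH_* macros are defined for ARC anymore, the common
code picks these generic versions up automatically.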