These are library functions used by ARC700 architecture.
Signed-off-by: Alexey Brodkin <abrodkin@synopsys.com>
Cc: Vineet Gupta <vgupta@synopsys.com>
Cc: Francois Bedard <fbedard@synopsys.com>
Cc: Wolfgang Denk <wd@denx.de>
Cc: Heiko Schocher <hs@denx.de>
--- /dev/null
+#
+# Copyright (C) 2013-2014 Synopsys, Inc. All rights reserved.
+#
+# SPDX-License-Identifier: GPL-2.0+
+#
+
+obj-y += sections.o
+obj-y += relocate.o
+obj-y += strchr-700.o
+obj-y += strcmp.o
+obj-y += strcpy-700.o
+obj-y += strlen.o
+obj-y += memcmp.o
+obj-y += memcpy-700.o
+obj-y += memset.o
+obj-$(CONFIG_CMD_BOOTM) += bootm.o
--- /dev/null
+/*
+ * Copyright (C) 2013-2014 Synopsys, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: GPL-2.0+
+ */
+
+#include <common.h>
+
+DECLARE_GLOBAL_DATA_PTR;
+
+static ulong get_sp(void)
+{
+ ulong ret;
+
+ asm("mov %0, sp" : "=r"(ret) : );
+ return ret;
+}
+
+void arch_lmb_reserve(struct lmb *lmb)
+{
+ ulong sp;
+
+ /*
+ * Booting a (Linux) kernel image
+ *
+ * Allocate space for command line and board info - the
+ * address should be as high as possible within the reach of
+ * the kernel (see CONFIG_SYS_BOOTMAPSZ settings), but in unused
+ * memory, which means far enough below the current stack
+ * pointer.
+ */
+ sp = get_sp();
+ debug("## Current stack ends at 0x%08lx ", sp);
+
+ /* adjust sp by 4K to be safe */
+ sp -= 4096;
+ lmb_reserve(lmb, sp, (CONFIG_SYS_SDRAM_BASE + gd->ram_size - sp));
+}
+
+static int cleanup_before_linux(void)
+{
+ disable_interrupts();
+ flush_dcache_all();
+ invalidate_icache_all();
+
+ return 0;
+}
+
+/* Subcommand: PREP */
+static void boot_prep_linux(bootm_headers_t *images)
+{
+ if (image_setup_linux(images))
+ hang();
+}
+
+/* Subcommand: GO */
+static void boot_jump_linux(bootm_headers_t *images, int flag)
+{
+ void (*kernel_entry)(int zero, int arch, uint params);
+ unsigned int r0, r2;
+ int fake = (flag & BOOTM_STATE_OS_FAKE_GO);
+
+ kernel_entry = (void (*)(int, int, uint))images->ep;
+
+ debug("## Transferring control to Linux (at address %08lx)...\n",
+ (ulong) kernel_entry);
+ bootstage_mark(BOOTSTAGE_ID_RUN_OS);
+
+ printf("\nStarting kernel ...%s\n\n", fake ?
+ "(fake run for tracing)" : "");
+ bootstage_mark_name(BOOTSTAGE_ID_BOOTM_HANDOFF, "start_kernel");
+
+ cleanup_before_linux();
+
+ if (IMAGE_ENABLE_OF_LIBFDT && images->ft_len) {
+ r0 = 2;
+ r2 = (unsigned int)images->ft_addr;
+ } else {
+ r0 = 1;
+ r2 = (unsigned int)getenv("bootargs");
+ }
+
+ if (!fake)
+ kernel_entry(r0, 0, r2);
+}
+
+int do_bootm_linux(int flag, int argc, char *argv[], bootm_headers_t *images)
+{
+ /* No need for those on ARC */
+ if ((flag & BOOTM_STATE_OS_BD_T) || (flag & BOOTM_STATE_OS_CMDLINE))
+ return -1;
+
+ if (flag & BOOTM_STATE_OS_PREP) {
+ boot_prep_linux(images);
+ return 0;
+ }
+
+ if (flag & (BOOTM_STATE_OS_GO | BOOTM_STATE_OS_FAKE_GO)) {
+ boot_jump_linux(images, flag);
+ return 0;
+ }
+
+ boot_prep_linux(images);
+ boot_jump_linux(images, flag);
+ return 0;
+}
--- /dev/null
+/*
+ * Copyright (C) 2004, 2007-2010, 2011-2014 Synopsys, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: GPL-2.0+
+ */
+
+#ifdef __LITTLE_ENDIAN__
+#define WORD2 r2
+#define SHIFT r3
+#else /* __BIG_ENDIAN__ */
+#define WORD2 r3
+#define SHIFT r2
+#endif /* _ENDIAN__ */
+
+.global memcmp
+.align 4
+memcmp:
+ or %r12, %r0, %r1
+ asl_s %r12, %r12, 30
+ sub %r3, %r2, 1
+ brls %r2, %r12, .Lbytewise
+ ld %r4, [%r0, 0]
+ ld %r5, [%r1, 0]
+ lsr.f %lp_count, %r3, 3
+ lpne .Loop_end
+ ld_s WORD2, [%r0, 4]
+ ld_s %r12, [%r1, 4]
+ brne %r4, %r5, .Leven
+ ld.a %r4, [%r0, 8]
+ ld.a %r5, [%r1, 8]
+ brne WORD2, %r12, .Lodd
+.Loop_end:
+ asl_s SHIFT, SHIFT, 3
+ bhs_s .Last_cmp
+ brne %r4, %r5, .Leven
+ ld %r4, [%r0, 4]
+ ld %r5, [%r1, 4]
+#ifdef __LITTLE_ENDIAN__
+ nop_s
+ /* one more load latency cycle */
+.Last_cmp:
+ xor %r0, %r4, %r5
+ bset %r0, %r0, SHIFT
+ sub_s %r1, %r0, 1
+ bic_s %r1, %r1, %r0
+ norm %r1, %r1
+ b.d .Leven_cmp
+ and %r1, %r1, 24
+.Leven:
+ xor %r0, %r4, %r5
+ sub_s %r1, %r0, 1
+ bic_s %r1, %r1, %r0
+ norm %r1, %r1
+ /* slow track insn */
+ and %r1, %r1, 24
+.Leven_cmp:
+ asl %r2, %r4, %r1
+ asl %r12, %r5, %r1
+ lsr_s %r2, %r2, 1
+ lsr_s %r12, %r12, 1
+ j_s.d [%blink]
+ sub %r0, %r2, %r12
+ .balign 4
+.Lodd:
+ xor %r0, WORD2, %r12
+ sub_s %r1, %r0, 1
+ bic_s %r1, %r1, %r0
+ norm %r1, %r1
+ /* slow track insn */
+ and %r1, %r1, 24
+ asl_s %r2, %r2, %r1
+ asl_s %r12, %r12, %r1
+ lsr_s %r2, %r2, 1
+ lsr_s %r12, %r12, 1
+ j_s.d [%blink]
+ sub %r0, %r2, %r12
+#else /* __BIG_ENDIAN__ */
+.Last_cmp:
+ neg_s SHIFT, SHIFT
+ lsr %r4, %r4, SHIFT
+ lsr %r5, %r5, SHIFT
+ /* slow track insn */
+.Leven:
+ sub.f %r0, %r4, %r5
+ mov.ne %r0, 1
+ j_s.d [%blink]
+ bset.cs %r0, %r0, 31
+.Lodd:
+ cmp_s WORD2, %r12
+
+ mov_s %r0, 1
+ j_s.d [%blink]
+ bset.cs %r0, %r0, 31
+#endif /* _ENDIAN__ */
+ .balign 4
+.Lbytewise:
+ breq %r2, 0, .Lnil
+ ldb %r4, [%r0, 0]
+ ldb %r5, [%r1, 0]
+ lsr.f %lp_count, %r3
+ lpne .Lbyte_end
+ ldb_s %r3, [%r0, 1]
+ ldb %r12, [%r1, 1]
+ brne %r4, %r5, .Lbyte_even
+ ldb.a %r4, [%r0, 2]
+ ldb.a %r5, [%r1, 2]
+ brne %r3, %r12, .Lbyte_odd
+.Lbyte_end:
+ bcc .Lbyte_even
+ brne %r4, %r5, .Lbyte_even
+ ldb_s %r3, [%r0, 1]
+ ldb_s %r12, [%r1, 1]
+.Lbyte_odd:
+ j_s.d [%blink]
+ sub %r0, %r3, %r12
+.Lbyte_even:
+ j_s.d [%blink]
+ sub %r0, %r4, %r5
+.Lnil:
+ j_s.d [%blink]
+ mov %r0, 0
--- /dev/null
+/*
+ * Copyright (C) 2004, 2007-2010, 2011-2014 Synopsys, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: GPL-2.0+
+ */
+
+.global memcpy
+.align 4
+memcpy:
+ or %r3, %r0, %r1
+ asl_s %r3, %r3, 30
+ mov_s %r5, %r0
+ brls.d %r2, %r3, .Lcopy_bytewise
+ sub.f %r3, %r2, 1
+ ld_s %r12, [%r1, 0]
+ asr.f %lp_count, %r3, 3
+ bbit0.d %r3, 2, .Lnox4
+ bmsk_s %r2, %r2, 1
+ st.ab %r12, [%r5, 4]
+ ld.a %r12, [%r1, 4]
+.Lnox4:
+ lppnz .Lendloop
+ ld_s %r3, [%r1, 4]
+ st.ab %r12, [%r5, 4]
+ ld.a %r12, [%r1, 8]
+ st.ab %r3, [%r5, 4]
+.Lendloop:
+ breq %r2, 0, .Last_store
+ ld %r3, [%r5, 0]
+#ifdef __LITTLE_ENDIAN__
+ add3 %r2, -1, %r2
+ /* uses long immediate */
+ xor_s %r12, %r12, %r3
+ bmsk %r12, %r12, %r2
+ xor_s %r12, %r12, %r3
+#else /* __BIG_ENDIAN__ */
+ sub3 %r2, 31, %r2
+ /* uses long immediate */
+ xor_s %r3, %r3, %r12
+ bmsk %r3, %r3, %r2
+ xor_s %r12, %r12, %r3
+#endif /* _ENDIAN__ */
+.Last_store:
+ j_s.d [%blink]
+ st %r12, [%r5, 0]
+
+ .balign 4
+.Lcopy_bytewise:
+ jcs [%blink]
+ ldb_s %r12, [%r1, 0]
+ lsr.f %lp_count, %r3
+ bhs_s .Lnox1
+ stb.ab %r12, [%r5, 1]
+ ldb.a %r12, [%r1, 1]
+.Lnox1:
+ lppnz .Lendbloop
+ ldb_s %r3, [%r1, 1]
+ stb.ab %r12, [%r5, 1]
+ ldb.a %r12, [%r1, 2]
+ stb.ab %r3, [%r5, 1]
+.Lendbloop:
+ j_s.d [%blink]
+ stb %r12, [%r5, 0]
--- /dev/null
+/*
+ * Copyright (C) 2004, 2007-2010, 2011-2014 Synopsys, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: GPL-2.0+
+ */
+
+#define SMALL 7 /* Must be at least 6 to deal with alignment/loop issues. */
+
+.global memset
+.align 4
+memset:
+ mov_s %r4, %r0
+ or %r12, %r0, %r2
+ bmsk.f %r12, %r12, 1
+ extb_s %r1, %r1
+ asl %r3, %r1, 8
+ beq.d .Laligned
+ or_s %r1, %r1, %r3
+ brls %r2, SMALL, .Ltiny
+ add %r3, %r2, %r0
+ stb %r1, [%r3, -1]
+ bclr_s %r3, %r3, 0
+ stw %r1, [%r3, -2]
+ bmsk.f %r12, %r0, 1
+ add_s %r2, %r2, %r12
+ sub.ne %r2, %r2, 4
+ stb.ab %r1, [%r4, 1]
+ and %r4, %r4, -2
+ stw.ab %r1, [%r4, 2]
+ and %r4, %r4, -4
+
+ .balign 4
+.Laligned:
+ asl %r3, %r1, 16
+ lsr.f %lp_count, %r2, 2
+ or_s %r1, %r1, %r3
+ lpne .Loop_end
+ st.ab %r1, [%r4, 4]
+.Loop_end:
+ j_s [%blink]
+
+ .balign 4
+.Ltiny:
+ mov.f %lp_count, %r2
+ lpne .Ltiny_end
+ stb.ab %r1, [%r4, 1]
+.Ltiny_end:
+ j_s [%blink]
+
+/*
+ * memzero: @r0 = mem, @r1 = size_t
+ * memset: @r0 = mem, @r1 = char, @r2 = size_t
+ */
+
+.global memzero
+.align 4
+memzero:
+ /* adjust bzero args to memset args */
+ mov %r2, %r1
+ mov %r1, 0
+ /* tail call so need to tinker with blink */
+ b memset
--- /dev/null
+/*
+ * Copyright (C) 2013-2014 Synopsys, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: GPL-2.0+
+ */
+
+#include <common.h>
+#include <elf.h>
+#include <asm/sections.h>
+
+DECLARE_GLOBAL_DATA_PTR;
+
+/*
+ * Base functionality is taken from x86 version with added ARC-specifics
+ */
+int do_elf_reloc_fixups(void)
+{
+ Elf32_Rela *re_src = (Elf32_Rela *)(&__rel_dyn_start);
+ Elf32_Rela *re_end = (Elf32_Rela *)(&__rel_dyn_end);
+
+ Elf32_Addr *offset_ptr_rom, *last_offset = NULL;
+ Elf32_Addr *offset_ptr_ram;
+
+ do {
+ /* Get the location from the relocation entry */
+ offset_ptr_rom = (Elf32_Addr *)re_src->r_offset;
+
+ /* Check that the location of the relocation is in .text */
+ if (offset_ptr_rom >= (Elf32_Addr *)CONFIG_SYS_TEXT_BASE &&
+ offset_ptr_rom > last_offset) {
+ unsigned int val;
+ /* Switch to the in-RAM version */
+ offset_ptr_ram = (Elf32_Addr *)((ulong)offset_ptr_rom +
+ gd->reloc_off);
+
+ /*
+ * Use "memcpy" because target location might be
+ * 16-bit aligned on ARC so we may need to read
+ * byte-by-byte. On attempt to read entire word by
+ * CPU throws an exception
+ */
+ memcpy(&val, offset_ptr_ram, sizeof(int));
+
+ /* If location in ".text" section swap value */
+ if ((unsigned int)offset_ptr_rom <
+ (unsigned int)&__text_end)
+ val = (val << 16) | (val >> 16);
+
+ /* Check that the target points into .text */
+ if (val >= CONFIG_SYS_TEXT_BASE && val <=
+ (unsigned int)&__bss_end) {
+ val += gd->reloc_off;
+ /* If location in ".text" section swap value */
+ if ((unsigned int)offset_ptr_rom <
+ (unsigned int)&__text_end)
+ val = (val << 16) | (val >> 16);
+ memcpy(offset_ptr_ram, &val, sizeof(int));
+ } else {
+ debug(" %p: rom reloc %x, ram %p, value %x, limit %x\n",
+ re_src, re_src->r_offset, offset_ptr_ram,
+ val, (unsigned int)&__bss_end);
+ }
+ } else {
+ debug(" %p: rom reloc %x, last %p\n", re_src,
+ re_src->r_offset, last_offset);
+ }
+ last_offset = offset_ptr_rom;
+
+ } while (++re_src < re_end);
+
+ return 0;
+}
--- /dev/null
+/*
+ * Copyright (C) 2013-2014 Synopsys, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: GPL-2.0+
+ */
+
+/*
+ * For some reason linker sets linker-generated symbols to zero in PIE mode.
+ * A work-around is substitution of linker-generated symbols with
+ * compiler-generated symbols which are properly handled by linker in PAE mode.
+ */
+
+char __bss_start[0] __attribute__((section(".__bss_start")));
+char __bss_end[0] __attribute__((section(".__bss_end")));
+char __image_copy_start[0] __attribute__((section(".__image_copy_start")));
+char __image_copy_end[0] __attribute__((section(".__image_copy_end")));
+char __rel_dyn_start[0] __attribute__((section(".__rel_dyn_start")));
+char __rel_dyn_end[0] __attribute__((section(".__rel_dyn_end")));
+char __text_start[0] __attribute__((section(".__text_start")));
+char __text_end[0] __attribute__((section(".__text_end")));
+char __init_end[0] __attribute__((section(".__init_end")));
--- /dev/null
+/*
+ * Copyright (C) 2004, 2007-2010, 2011-2014 Synopsys, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: GPL-2.0+
+ */
+
+/*
+ * ARC700 has a relatively long pipeline and branch prediction, so we want
+ * to avoid branches that are hard to predict. On the other hand, the
+ * presence of the norm instruction makes it easier to operate on whole
+ * words branch-free.
+ */
+
+.global strchr
+.align 4
+strchr:
+ extb_s %r1, %r1
+ asl %r5, %r1, 8
+ bmsk %r2, %r0, 1
+ or %r5, %r5, %r1
+ mov_s %r3, 0x01010101
+ breq.d %r2, %r0, .Laligned
+ asl %r4, %r5, 16
+ sub_s %r0, %r0, %r2
+ asl %r7, %r2, 3
+ ld_s %r2, [%r0]
+#ifdef __LITTLE_ENDIAN__
+ asl %r7, %r3, %r7
+#else /* __BIG_ENDIAN__ */
+ lsr %r7, %r3, %r7
+#endif /* _ENDIAN__ */
+ or %r5, %r5, %r4
+ ror %r4, %r3
+ sub %r12, %r2, %r7
+ bic_s %r12, %r12, %r2
+ and %r12, %r12, %r4
+ brne.d %r12, 0, .Lfound0_ua
+ xor %r6, %r2, %r5
+ ld.a %r2, [%r0, 4]
+ sub %r12, %r6, %r7
+ bic %r12, %r12, %r6
+#ifdef __LITTLE_ENDIAN__
+ and %r7, %r12, %r4
+ /* For speed, we want this branch to be unaligned. */
+ breq %r7, 0, .Loop
+ /* Likewise this one */
+ b .Lfound_char
+#else /* __BIG_ENDIAN__ */
+ and %r12, %r12, %r4
+ /* For speed, we want this branch to be unaligned. */
+ breq %r12, 0, .Loop
+ lsr_s %r12, %r12, 7
+ bic %r2, %r7, %r6
+ b.d .Lfound_char_b
+ and_s %r2, %r2, %r12
+#endif /* _ENDIAN__ */
+ /* We require this code address to be unaligned for speed... */
+.Laligned:
+ ld_s %r2, [%r0]
+ or %r5, %r5, %r4
+ ror %r4, %r3
+ /* ... so that this code address is aligned, for itself and ... */
+.Loop:
+ sub %r12, %r2, %r3
+ bic_s %r12, %r12, %r2
+ and %r12, %r12, %r4
+ brne.d %r12, 0, .Lfound0
+ xor %r6, %r2, %r5
+ ld.a %r2, [%r0, 4]
+ sub %r12, %r6, %r3
+ bic %r12, %r12, %r6
+ and %r7, %r12, %r4
+ breq %r7, 0, .Loop
+ /*
+ *... so that this branch is unaligned.
+ * Found searched-for character.
+ * r0 has already advanced to next word.
+ */
+#ifdef __LITTLE_ENDIAN__
+ /*
+ * We only need the information about the first matching byte
+ * (i.e. the least significant matching byte) to be exact,
+ * hence there is no problem with carry effects.
+ */
+.Lfound_char:
+ sub %r3, %r7, 1
+ bic %r3, %r3, %r7
+ norm %r2, %r3
+ sub_s %r0, %r0, 1
+ asr_s %r2, %r2, 3
+ j.d [%blink]
+ sub_s %r0, %r0, %r2
+
+ .balign 4
+.Lfound0_ua:
+ mov %r3, %r7
+.Lfound0:
+ sub %r3, %r6, %r3
+ bic %r3, %r3, %r6
+ and %r2, %r3, %r4
+ or_s %r12, %r12, %r2
+ sub_s %r3, %r12, 1
+ bic_s %r3, %r3, %r12
+ norm %r3, %r3
+ add_s %r0, %r0, 3
+ asr_s %r12, %r3, 3
+ asl.f 0, %r2, %r3
+ sub_s %r0, %r0, %r12
+ j_s.d [%blink]
+ mov.pl %r0, 0
+#else /* __BIG_ENDIAN__ */
+.Lfound_char:
+ lsr %r7, %r7, 7
+
+ bic %r2, %r7, %r6
+.Lfound_char_b:
+ norm %r2, %r2
+ sub_s %r0, %r0, 4
+ asr_s %r2, %r2, 3
+ j.d [%blink]
+ add_s %r0, %r0, %r2
+
+.Lfound0_ua:
+ mov_s %r3, %r7
+.Lfound0:
+ asl_s %r2, %r2, 7
+ or %r7, %r6, %r4
+ bic_s %r12, %r12, %r2
+ sub %r2, %r7, %r3
+ or %r2, %r2, %r6
+ bic %r12, %r2, %r12
+ bic.f %r3, %r4, %r12
+ norm %r3, %r3
+
+ add.pl %r3, %r3, 1
+ asr_s %r12, %r3, 3
+ asl.f 0, %r2, %r3
+ add_s %r0, %r0, %r12
+ j_s.d [%blink]
+ mov.mi %r0, 0
+#endif /* _ENDIAN__ */
--- /dev/null
+/*
+ * Copyright (C) 2004, 2007-2010, 2011-2014 Synopsys, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: GPL-2.0+
+ */
+
+/*
+ * This is optimized primarily for the ARC700.
+ * It would be possible to speed up the loops by one cycle / word
+ * respective one cycle / byte by forcing double source 1 alignment, unrolling
+ * by a factor of two, and speculatively loading the second word / byte of
+ * source 1; however, that would increase the overhead for loop setup / finish,
+ * and strcmp might often terminate early.
+ */
+
+.global strcmp
+.align 4
+strcmp:
+ or %r2, %r0, %r1
+ bmsk_s %r2, %r2, 1
+ brne %r2, 0, .Lcharloop
+ mov_s %r12, 0x01010101
+ ror %r5, %r12
+.Lwordloop:
+ ld.ab %r2, [%r0, 4]
+ ld.ab %r3, [%r1, 4]
+ nop_s
+ sub %r4, %r2, %r12
+ bic %r4, %r4, %r2
+ and %r4, %r4, %r5
+ brne %r4, 0, .Lfound0
+ breq %r2 ,%r3, .Lwordloop
+#ifdef __LITTLE_ENDIAN__
+ xor %r0, %r2, %r3 /* mask for difference */
+ sub_s %r1, %r0, 1
+ bic_s %r0, %r0, %r1 /* mask for least significant difference bit */
+ sub %r1, %r5, %r0
+ xor %r0, %r5, %r1 /* mask for least significant difference byte */
+ and_s %r2, %r2, %r0
+ and_s %r3, %r3, %r0
+#endif /* _ENDIAN__ */
+ cmp_s %r2, %r3
+ mov_s %r0, 1
+ j_s.d [%blink]
+ bset.lo %r0, %r0, 31
+
+ .balign 4
+#ifdef __LITTLE_ENDIAN__
+.Lfound0:
+ xor %r0, %r2, %r3 /* mask for difference */
+ or %r0, %r0, %r4 /* or in zero indicator */
+ sub_s %r1, %r0, 1
+ bic_s %r0, %r0, %r1 /* mask for least significant difference bit */
+ sub %r1, %r5, %r0
+ xor %r0, %r5, %r1 /* mask for least significant difference byte */
+ and_s %r2, %r2, %r0
+ and_s %r3, %r3, %r0
+ sub.f %r0, %r2, %r3
+ mov.hi %r0, 1
+ j_s.d [%blink]
+ bset.lo %r0, %r0, 31
+#else /* __BIG_ENDIAN__ */
+ /*
+ * The zero-detection above can mis-detect 0x01 bytes as zeroes
+ * because of carry-propagateion from a lower significant zero byte.
+ * We can compensate for this by checking that bit0 is zero.
+ * This compensation is not necessary in the step where we
+ * get a low estimate for r2, because in any affected bytes
+ * we already have 0x00 or 0x01, which will remain unchanged
+ * when bit 7 is cleared.
+ */
+ .balign 4
+.Lfound0:
+ lsr %r0, %r4, 8
+ lsr_s %r1, %r2
+ bic_s %r2, %r2, %r0 /* get low estimate for r2 and get ... */
+ bic_s %r0, %r0, %r1 /* <this is the adjusted mask for zeros> */
+ or_s %r3, %r3, %r0 /* ... high estimate r3 so that r2 > r3 will */
+ cmp_s %r3, %r2 /* ... be independent of trailing garbage */
+ or_s %r2, %r2, %r0 /* likewise for r3 > r2 */
+ bic_s %r3, %r3, %r0
+ rlc %r0, 0 /* r0 := r2 > r3 ? 1 : 0 */
+ cmp_s %r2, %r3
+ j_s.d [%blink]
+ bset.lo %r0, %r0, 31
+#endif /* _ENDIAN__ */
+
+ .balign 4
+.Lcharloop:
+ ldb.ab %r2,[%r0,1]
+ ldb.ab %r3,[%r1,1]
+ nop_s
+ breq %r2, 0, .Lcmpend
+ breq %r2, %r3, .Lcharloop
+.Lcmpend:
+ j_s.d [%blink]
+ sub %r0, %r2, %r3
--- /dev/null
+/*
+ * Copyright (C) 2004, 2007-2010, 2011-2014 Synopsys, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: GPL-2.0+
+ */
+
+/*
+ * If dst and src are 4 byte aligned, copy 8 bytes at a time.
+ * If the src is 4, but not 8 byte aligned, we first read 4 bytes to get
+ * it 8 byte aligned. Thus, we can do a little read-ahead, without
+ * dereferencing a cache line that we should not touch.
+ * Note that short and long instructions have been scheduled to avoid
+ * branch stalls.
+ * The beq_s to r3z could be made unaligned & long to avoid a stall
+ * there, but it is not likely to be taken often, and it would also be likely
+ * to cost an unaligned mispredict at the next call.
+ */
+
+.global strcpy
+.align 4
+strcpy:
+ or %r2, %r0, %r1
+ bmsk_s %r2, %r2, 1
+ brne.d %r2, 0, charloop
+ mov_s %r10, %r0
+ ld_s %r3, [%r1, 0]
+ mov %r8, 0x01010101
+ bbit0.d %r1, 2, loop_start
+ ror %r12, %r8
+ sub %r2, %r3, %r8
+ bic_s %r2, %r2, %r3
+ tst_s %r2,%r12
+ bne r3z
+ mov_s %r4,%r3
+ .balign 4
+loop:
+ ld.a %r3, [%r1, 4]
+ st.ab %r4, [%r10, 4]
+loop_start:
+ ld.a %r4, [%r1, 4]
+ sub %r2, %r3, %r8
+ bic_s %r2, %r2, %r3
+ tst_s %r2, %r12
+ bne_s r3z
+ st.ab %r3, [%r10, 4]
+ sub %r2, %r4, %r8
+ bic %r2, %r2, %r4
+ tst %r2, %r12
+ beq loop
+ mov_s %r3, %r4
+#ifdef __LITTLE_ENDIAN__
+r3z: bmsk.f %r1, %r3, 7
+ lsr_s %r3, %r3, 8
+#else /* __BIG_ENDIAN__ */
+r3z: lsr.f %r1, %r3, 24
+ asl_s %r3, %r3, 8
+#endif /* _ENDIAN__ */
+ bne.d r3z
+ stb.ab %r1, [%r10, 1]
+ j_s [%blink]
+
+ .balign 4
+charloop:
+ ldb.ab %r3, [%r1, 1]
+ brne.d %r3, 0, charloop
+ stb.ab %r3, [%r10, 1]
+ j [%blink]
--- /dev/null
+/*
+ * Copyright (C) 2004, 2007-2010, 2011-2014 Synopsys, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: GPL-2.0+
+ */
+
+.global strlen
+.align 4
+strlen:
+ or %r3, %r0, 7
+ ld %r2, [%r3, -7]
+ ld.a %r6, [%r3, -3]
+ mov %r4, 0x01010101
+ /* uses long immediate */
+#ifdef __LITTLE_ENDIAN__
+ asl_s %r1, %r0, 3
+ btst_s %r0, 2
+ asl %r7, %r4, %r1
+ ror %r5, %r4
+ sub %r1, %r2, %r7
+ bic_s %r1, %r1, %r2
+ mov.eq %r7, %r4
+ sub %r12, %r6, %r7
+ bic %r12, %r12, %r6
+ or.eq %r12, %r12, %r1
+ and %r12, %r12, %r5
+ brne %r12, 0, .Learly_end
+#else /* __BIG_ENDIAN__ */
+ ror %r5, %r4
+ btst_s %r0, 2
+ mov_s %r1, 31
+ sub3 %r7, %r1, %r0
+ sub %r1, %r2, %r4
+ bic_s %r1, %r1, %r2
+ bmsk %r1, %r1, %r7
+ sub %r12, %r6, %r4
+ bic %r12, %r12, %r6
+ bmsk.ne %r12, %r12, %r7
+ or.eq %r12, %r12, %r1
+ and %r12, %r12, %r5
+ brne %r12, 0, .Learly_end
+#endif /* _ENDIAN__ */
+
+.Loop:
+ ld_s %r2, [%r3, 4]
+ ld.a %r6, [%r3, 8]
+ /* stall for load result */
+ sub %r1, %r2, %r4
+ bic_s %r1, %r1, %r2
+ sub %r12, %r6, %r4
+ bic %r12, %r12, %r6
+ or %r12, %r12, %r1
+ and %r12, %r12, %r5
+ breq %r12, 0, .Loop
+.Lend:
+ and.f %r1, %r1, %r5
+ sub.ne %r3, %r3, 4
+ mov.eq %r1, %r12
+#ifdef __LITTLE_ENDIAN__
+ sub_s %r2, %r1, 1
+ bic_s %r2, %r2, %r1
+ norm %r1, %r2
+ sub_s %r0, %r0, 3
+ lsr_s %r1, %r1, 3
+ sub %r0, %r3, %r0
+ j_s.d [%blink]
+ sub %r0, %r0, %r1
+#else /* __BIG_ENDIAN__ */
+ lsr_s %r1, %r1, 7
+ mov.eq %r2, %r6
+ bic_s %r1, %r1, %r2
+ norm %r1, %r1
+ sub %r0, %r3, %r0
+ lsr_s %r1, %r1, 3
+ j_s.d [%blink]
+ add %r0, %r0, %r1
+#endif /* _ENDIAN */
+.Learly_end:
+ b.d .Lend
+ sub_s.ne %r1, %r1, %r1