Hand optimised asm code for ARC700 pipeline. Originally written/optimized by Joern Rennecke. Signed-off-by: Vineet Gupta <vgupta@synopsys.com> Cc: Joern Rennecke <joern.rennecke@embecosm.com> (branch: tirimbino)
parent
6e35fa2d43
commit
5210d1e688
@ -0,0 +1,40 @@ |
||||
/*
|
||||
* Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com) |
||||
* |
||||
* This program is free software; you can redistribute it and/or modify |
||||
* it under the terms of the GNU General Public License version 2 as |
||||
* published by the Free Software Foundation. |
||||
* |
||||
* vineetg: May 2011 |
||||
* -We had half-optimised memset/memcpy, got better versions of those |
||||
* -Added memcmp, strchr, strcpy, strcmp, strlen |
||||
* |
||||
* Amit Bhor: Codito Technologies 2004 |
||||
*/ |
||||
|
||||
#ifndef _ASM_ARC_STRING_H |
||||
#define _ASM_ARC_STRING_H |
||||
|
||||
#include <linux/types.h> |
||||
|
||||
#ifdef __KERNEL__ |
||||
|
||||
#define __HAVE_ARCH_MEMSET |
||||
#define __HAVE_ARCH_MEMCPY |
||||
#define __HAVE_ARCH_MEMCMP |
||||
#define __HAVE_ARCH_STRCHR |
||||
#define __HAVE_ARCH_STRCPY |
||||
#define __HAVE_ARCH_STRCMP |
||||
#define __HAVE_ARCH_STRLEN |
||||
|
||||
extern void *memset(void *ptr, int, __kernel_size_t); |
||||
extern void *memcpy(void *, const void *, __kernel_size_t); |
||||
extern void memzero(void *ptr, __kernel_size_t n); |
||||
extern int memcmp(const void *, const void *, __kernel_size_t); |
||||
extern char *strchr(const char *s, int c); |
||||
extern char *strcpy(char *dest, const char *src); |
||||
extern int strcmp(const char *cs, const char *ct); |
||||
extern __kernel_size_t strlen(const char *); |
||||
|
||||
#endif /* __KERNEL__ */ |
||||
#endif /* _ASM_ARC_STRING_H */ |
@ -0,0 +1,124 @@ |
||||
/* |
||||
* Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com) |
||||
* |
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as |
||||
* published by the Free Software Foundation. |
||||
*/ |
||||
|
||||
#include <asm/linkage.h> |
||||
|
||||
#ifdef __LITTLE_ENDIAN__ |
||||
#define WORD2 r2 |
||||
#define SHIFT r3 |
||||
#else /* BIG ENDIAN */ |
||||
#define WORD2 r3 |
||||
#define SHIFT r2 |
||||
#endif |
||||
|
||||
ARC_ENTRY memcmp |
||||
or r12,r0,r1 |
||||
asl_s r12,r12,30 |
||||
sub r3,r2,1 |
||||
brls r2,r12,.Lbytewise |
||||
ld r4,[r0,0] |
||||
ld r5,[r1,0] |
||||
lsr.f lp_count,r3,3 |
||||
lpne .Loop_end |
||||
ld_s WORD2,[r0,4] |
||||
ld_s r12,[r1,4] |
||||
brne r4,r5,.Leven |
||||
ld.a r4,[r0,8] |
||||
ld.a r5,[r1,8] |
||||
brne WORD2,r12,.Lodd |
||||
.Loop_end: |
||||
asl_s SHIFT,SHIFT,3 |
||||
bhs_s .Last_cmp |
||||
brne r4,r5,.Leven |
||||
ld r4,[r0,4] |
||||
ld r5,[r1,4] |
||||
#ifdef __LITTLE_ENDIAN__ |
||||
nop_s |
||||
; one more load latency cycle
|
||||
.Last_cmp: |
||||
xor r0,r4,r5 |
||||
bset r0,r0,SHIFT |
||||
sub_s r1,r0,1 |
||||
bic_s r1,r1,r0 |
||||
norm r1,r1 |
||||
b.d .Leven_cmp |
||||
and r1,r1,24 |
||||
.Leven: |
||||
xor r0,r4,r5 |
||||
sub_s r1,r0,1 |
||||
bic_s r1,r1,r0 |
||||
norm r1,r1 |
||||
; slow track insn
|
||||
and r1,r1,24 |
||||
.Leven_cmp: |
||||
asl r2,r4,r1 |
||||
asl r12,r5,r1 |
||||
lsr_s r2,r2,1 |
||||
lsr_s r12,r12,1 |
||||
j_s.d [blink] |
||||
sub r0,r2,r12 |
||||
.balign 4
|
||||
.Lodd: |
||||
xor r0,WORD2,r12 |
||||
sub_s r1,r0,1 |
||||
bic_s r1,r1,r0 |
||||
norm r1,r1 |
||||
; slow track insn
|
||||
and r1,r1,24 |
||||
asl_s r2,r2,r1 |
||||
asl_s r12,r12,r1 |
||||
lsr_s r2,r2,1 |
||||
lsr_s r12,r12,1 |
||||
j_s.d [blink] |
||||
sub r0,r2,r12 |
||||
#else /* BIG ENDIAN */ |
||||
.Last_cmp: |
||||
neg_s SHIFT,SHIFT |
||||
lsr r4,r4,SHIFT |
||||
lsr r5,r5,SHIFT |
||||
; slow track insn
|
||||
.Leven: |
||||
sub.f r0,r4,r5 |
||||
mov.ne r0,1 |
||||
j_s.d [blink] |
||||
bset.cs r0,r0,31 |
||||
.Lodd: |
||||
cmp_s WORD2,r12 |
||||
|
||||
mov_s r0,1 |
||||
j_s.d [blink] |
||||
bset.cs r0,r0,31 |
||||
#endif /* ENDIAN */ |
||||
.balign 4
|
||||
.Lbytewise: |
||||
breq r2,0,.Lnil |
||||
ldb r4,[r0,0] |
||||
ldb r5,[r1,0] |
||||
lsr.f lp_count,r3 |
||||
lpne .Lbyte_end |
||||
ldb_s r3,[r0,1] |
||||
ldb r12,[r1,1] |
||||
brne r4,r5,.Lbyte_even |
||||
ldb.a r4,[r0,2] |
||||
ldb.a r5,[r1,2] |
||||
brne r3,r12,.Lbyte_odd |
||||
.Lbyte_end: |
||||
bcc .Lbyte_even |
||||
brne r4,r5,.Lbyte_even |
||||
ldb_s r3,[r0,1] |
||||
ldb_s r12,[r1,1] |
||||
.Lbyte_odd: |
||||
j_s.d [blink] |
||||
sub r0,r3,r12 |
||||
.Lbyte_even: |
||||
j_s.d [blink] |
||||
sub r0,r4,r5 |
||||
.Lnil: |
||||
j_s.d [blink] |
||||
mov r0,0 |
||||
ARC_EXIT memcmp |
@ -0,0 +1,66 @@ |
||||
/* |
||||
* Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com) |
||||
* |
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as |
||||
* published by the Free Software Foundation. |
||||
*/ |
||||
|
||||
#include <asm/linkage.h> |
||||
|
||||
ARC_ENTRY memcpy |
||||
or r3,r0,r1 |
||||
asl_s r3,r3,30 |
||||
mov_s r5,r0 |
||||
brls.d r2,r3,.Lcopy_bytewise |
||||
sub.f r3,r2,1 |
||||
ld_s r12,[r1,0] |
||||
asr.f lp_count,r3,3 |
||||
bbit0.d r3,2,.Lnox4 |
||||
bmsk_s r2,r2,1 |
||||
st.ab r12,[r5,4] |
||||
ld.a r12,[r1,4] |
||||
.Lnox4: |
||||
lppnz .Lendloop |
||||
ld_s r3,[r1,4] |
||||
st.ab r12,[r5,4] |
||||
ld.a r12,[r1,8] |
||||
st.ab r3,[r5,4] |
||||
.Lendloop: |
||||
breq r2,0,.Last_store |
||||
ld r3,[r5,0] |
||||
#ifdef __LITTLE_ENDIAN__ |
||||
add3 r2,-1,r2 |
||||
; uses long immediate
|
||||
xor_s r12,r12,r3 |
||||
bmsk r12,r12,r2 |
||||
xor_s r12,r12,r3 |
||||
#else /* BIG ENDIAN */ |
||||
sub3 r2,31,r2 |
||||
; uses long immediate
|
||||
xor_s r3,r3,r12 |
||||
bmsk r3,r3,r2 |
||||
xor_s r12,r12,r3 |
||||
#endif /* ENDIAN */ |
||||
.Last_store: |
||||
j_s.d [blink] |
||||
st r12,[r5,0] |
||||
|
||||
.balign 4
|
||||
.Lcopy_bytewise: |
||||
jcs [blink] |
||||
ldb_s r12,[r1,0] |
||||
lsr.f lp_count,r3 |
||||
bhs_s .Lnox1 |
||||
stb.ab r12,[r5,1] |
||||
ldb.a r12,[r1,1] |
||||
.Lnox1: |
||||
lppnz .Lendbloop |
||||
ldb_s r3,[r1,1] |
||||
stb.ab r12,[r5,1] |
||||
ldb.a r12,[r1,2] |
||||
stb.ab r3,[r5,1] |
||||
.Lendbloop: |
||||
j_s.d [blink] |
||||
stb r12,[r5,0] |
||||
ARC_EXIT memcpy |
@ -0,0 +1,59 @@ |
||||
/* |
||||
* Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com) |
||||
* |
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as |
||||
* published by the Free Software Foundation. |
||||
*/ |
||||
|
||||
#include <asm/linkage.h> |
||||
|
||||
#define SMALL 7 /* Must be at least 6 to deal with alignment/loop issues. */ |
||||
|
||||
ARC_ENTRY memset |
||||
mov_s r4,r0 |
||||
or r12,r0,r2 |
||||
bmsk.f r12,r12,1 |
||||
extb_s r1,r1 |
||||
asl r3,r1,8 |
||||
beq.d .Laligned |
||||
or_s r1,r1,r3 |
||||
brls r2,SMALL,.Ltiny |
||||
add r3,r2,r0 |
||||
stb r1,[r3,-1] |
||||
bclr_s r3,r3,0 |
||||
stw r1,[r3,-2] |
||||
bmsk.f r12,r0,1 |
||||
add_s r2,r2,r12 |
||||
sub.ne r2,r2,4 |
||||
stb.ab r1,[r4,1] |
||||
and r4,r4,-2 |
||||
stw.ab r1,[r4,2] |
||||
and r4,r4,-4 |
||||
.Laligned: ; This code address should be aligned for speed.
|
||||
asl r3,r1,16 |
||||
lsr.f lp_count,r2,2 |
||||
or_s r1,r1,r3 |
||||
lpne .Loop_end |
||||
st.ab r1,[r4,4] |
||||
.Loop_end: |
||||
j_s [blink] |
||||
|
||||
.balign 4
|
||||
.Ltiny: |
||||
mov.f lp_count,r2 |
||||
lpne .Ltiny_end |
||||
stb.ab r1,[r4,1] |
||||
.Ltiny_end: |
||||
j_s [blink] |
||||
ARC_EXIT memset |
||||
|
||||
; memzero: @r0 = mem, @r1 = size_t
|
||||
; memset: @r0 = mem, @r1 = char, @r2 = size_t
|
||||
|
||||
ARC_ENTRY memzero |
||||
; adjust bzero args to memset args
|
||||
mov r2, r1 |
||||
mov r1, 0 |
||||
b memset ;tail call so need to tinker with blink
|
||||
ARC_EXIT memzero |
@ -0,0 +1,123 @@ |
||||
/* |
||||
* Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com) |
||||
* |
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as |
||||
* published by the Free Software Foundation. |
||||
*/ |
||||
|
||||
/* ARC700 has a relatively long pipeline and branch prediction, so we want |
||||
to avoid branches that are hard to predict. On the other hand, the |
||||
presence of the norm instruction makes it easier to operate on whole |
||||
words branch-free. */ |
||||
|
||||
#include <asm/linkage.h> |
||||
|
||||
ARC_ENTRY strchr |
||||
extb_s r1,r1 |
||||
asl r5,r1,8 |
||||
bmsk r2,r0,1 |
||||
or r5,r5,r1 |
||||
mov_s r3,0x01010101 |
||||
breq.d r2,r0,.Laligned |
||||
asl r4,r5,16 |
||||
sub_s r0,r0,r2 |
||||
asl r7,r2,3 |
||||
ld_s r2,[r0] |
||||
#ifdef __LITTLE_ENDIAN__ |
||||
asl r7,r3,r7 |
||||
#else |
||||
lsr r7,r3,r7 |
||||
#endif |
||||
or r5,r5,r4 |
||||
ror r4,r3 |
||||
sub r12,r2,r7 |
||||
bic_s r12,r12,r2 |
||||
and r12,r12,r4 |
||||
brne.d r12,0,.Lfound0_ua |
||||
xor r6,r2,r5 |
||||
ld.a r2,[r0,4] |
||||
sub r12,r6,r7 |
||||
bic r12,r12,r6 |
||||
and r7,r12,r4 |
||||
breq r7,0,.Loop ; For speed, we want this branch to be unaligned.
|
||||
b .Lfound_char ; Likewise this one.
|
||||
; /* We require this code address to be unaligned for speed... */
|
||||
.Laligned: |
||||
ld_s r2,[r0] |
||||
or r5,r5,r4 |
||||
ror r4,r3 |
||||
; /* ... so that this code address is aligned, for itself and ... */
|
||||
.Loop: |
||||
sub r12,r2,r3 |
||||
bic_s r12,r12,r2 |
||||
and r12,r12,r4 |
||||
brne.d r12,0,.Lfound0 |
||||
xor r6,r2,r5 |
||||
ld.a r2,[r0,4] |
||||
sub r12,r6,r3 |
||||
bic r12,r12,r6 |
||||
and r7,r12,r4 |
||||
breq r7,0,.Loop /* ... so that this branch is unaligned. */ |
||||
; Found searched-for character. r0 has already advanced to next word.
|
||||
#ifdef __LITTLE_ENDIAN__ |
||||
/* We only need the information about the first matching byte |
||||
(i.e. the least significant matching byte) to be exact, |
||||
hence there is no problem with carry effects. */ |
||||
.Lfound_char: |
||||
sub r3,r7,1 |
||||
bic r3,r3,r7 |
||||
norm r2,r3 |
||||
sub_s r0,r0,1 |
||||
asr_s r2,r2,3 |
||||
j.d [blink] |
||||
sub_s r0,r0,r2 |
||||
|
||||
.balign 4
|
||||
.Lfound0_ua: |
||||
mov r3,r7 |
||||
.Lfound0: |
||||
sub r3,r6,r3 |
||||
bic r3,r3,r6 |
||||
and r2,r3,r4 |
||||
or_s r12,r12,r2 |
||||
sub_s r3,r12,1 |
||||
bic_s r3,r3,r12 |
||||
norm r3,r3 |
||||
add_s r0,r0,3 |
||||
asr_s r12,r3,3 |
||||
asl.f 0,r2,r3 |
||||
sub_s r0,r0,r12 |
||||
j_s.d [blink] |
||||
mov.pl r0,0 |
||||
#else /* BIG ENDIAN */ |
||||
.Lfound_char: |
||||
lsr r7,r7,7 |
||||
|
||||
bic r2,r7,r6 |
||||
norm r2,r2 |
||||
sub_s r0,r0,4 |
||||
asr_s r2,r2,3 |
||||
j.d [blink] |
||||
add_s r0,r0,r2 |
||||
|
||||
.Lfound0_ua: |
||||
mov_s r3,r7 |
||||
.Lfound0: |
||||
asl_s r2,r2,7 |
||||
or r7,r6,r4 |
||||
bic_s r12,r12,r2 |
||||
sub r2,r7,r3 |
||||
or r2,r2,r6 |
||||
bic r12,r2,r12 |
||||
bic.f r3,r4,r12 |
||||
norm r3,r3 |
||||
|
||||
add.pl r3,r3,1 |
||||
asr_s r12,r3,3 |
||||
asl.f 0,r2,r3 |
||||
add_s r0,r0,r12 |
||||
j_s.d [blink] |
||||
mov.mi r0,0 |
||||
#endif /* ENDIAN */ |
||||
ARC_EXIT strchr |
@ -0,0 +1,96 @@ |
||||
/* |
||||
* Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com) |
||||
* |
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as |
||||
* published by the Free Software Foundation. |
||||
*/ |
||||
|
||||
/* This is optimized primarily for the ARC700. |
||||
It would be possible to speed up the loops by one cycle / word |
||||
respective one cycle / byte by forcing double source 1 alignment, unrolling |
||||
by a factor of two, and speculatively loading the second word / byte of |
||||
source 1; however, that would increase the overhead for loop setup / finish,
|
||||
and strcmp might often terminate early. */ |
||||
|
||||
#include <asm/linkage.h> |
||||
|
||||
ARC_ENTRY strcmp |
||||
or r2,r0,r1 |
||||
bmsk_s r2,r2,1 |
||||
brne r2,0,.Lcharloop |
||||
mov_s r12,0x01010101 |
||||
ror r5,r12 |
||||
.Lwordloop: |
||||
ld.ab r2,[r0,4] |
||||
ld.ab r3,[r1,4] |
||||
nop_s |
||||
sub r4,r2,r12 |
||||
bic r4,r4,r2 |
||||
and r4,r4,r5 |
||||
brne r4,0,.Lfound0 |
||||
breq r2,r3,.Lwordloop |
||||
#ifdef __LITTLE_ENDIAN__ |
||||
xor r0,r2,r3 ; mask for difference
|
||||
sub_s r1,r0,1 |
||||
bic_s r0,r0,r1 ; mask for least significant difference bit
|
||||
sub r1,r5,r0 |
||||
xor r0,r5,r1 ; mask for least significant difference byte
|
||||
and_s r2,r2,r0 |
||||
and_s r3,r3,r0 |
||||
#endif /* LITTLE ENDIAN */ |
||||
cmp_s r2,r3 |
||||
mov_s r0,1 |
||||
j_s.d [blink] |
||||
bset.lo r0,r0,31 |
||||
|
||||
.balign 4
|
||||
#ifdef __LITTLE_ENDIAN__ |
||||
.Lfound0: |
||||
xor r0,r2,r3 ; mask for difference
|
||||
or r0,r0,r4 ; or in zero indicator
|
||||
sub_s r1,r0,1 |
||||
bic_s r0,r0,r1 ; mask for least significant difference bit
|
||||
sub r1,r5,r0 |
||||
xor r0,r5,r1 ; mask for least significant difference byte
|
||||
and_s r2,r2,r0 |
||||
and_s r3,r3,r0 |
||||
sub.f r0,r2,r3 |
||||
mov.hi r0,1 |
||||
j_s.d [blink] |
||||
bset.lo r0,r0,31 |
||||
#else /* BIG ENDIAN */ |
||||
/* The zero-detection above can mis-detect 0x01 bytes as zeroes |
||||
because of carry-propagateion from a lower significant zero byte. |
||||
We can compensate for this by checking that bit0 is zero. |
||||
This compensation is not necessary in the step where we |
||||
get a low estimate for r2, because in any affected bytes |
||||
we already have 0x00 or 0x01, which will remain unchanged |
||||
when bit 7 is cleared. */ |
||||
.balign 4
|
||||
.Lfound0: |
||||
lsr r0,r4,8 |
||||
lsr_s r1,r2 |
||||
bic_s r2,r2,r0 ; get low estimate for r2 and get ...
|
||||
bic_s r0,r0,r1 ; <this is the adjusted mask for zeros>
|
||||
or_s r3,r3,r0 ; ... high estimate r3 so that r2 > r3 will ...
|
||||
cmp_s r3,r2 ; ... be independent of trailing garbage
|
||||
or_s r2,r2,r0 ; likewise for r3 > r2
|
||||
bic_s r3,r3,r0 |
||||
rlc r0,0 ; r0 := r2 > r3 ? 1 : 0
|
||||
cmp_s r2,r3 |
||||
j_s.d [blink] |
||||
bset.lo r0,r0,31 |
||||
#endif /* ENDIAN */ |
||||
|
||||
.balign 4
|
||||
.Lcharloop: |
||||
ldb.ab r2,[r0,1] |
||||
ldb.ab r3,[r1,1] |
||||
nop_s |
||||
breq r2,0,.Lcmpend |
||||
breq r2,r3,.Lcharloop |
||||
.Lcmpend: |
||||
j_s.d [blink] |
||||
sub r0,r2,r3 |
||||
ARC_EXIT strcmp |
@ -0,0 +1,70 @@ |
||||
/* |
||||
* Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com) |
||||
* |
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as |
||||
* published by the Free Software Foundation. |
||||
*/ |
||||
|
||||
/* If dst and src are 4 byte aligned, copy 8 bytes at a time. |
||||
If the src is 4, but not 8 byte aligned, we first read 4 bytes to get |
||||
it 8 byte aligned. Thus, we can do a little read-ahead, without |
||||
dereferencing a cache line that we should not touch. |
||||
Note that short and long instructions have been scheduled to avoid |
||||
branch stalls. |
||||
The beq_s to r3z could be made unaligned & long to avoid a stall |
||||
there, but it is not likely to be taken often, and it |
||||
would also be likely to cost an unaligned mispredict at the next call.  */ |
||||
|
||||
#include <asm/linkage.h> |
||||
|
||||
ARC_ENTRY strcpy |
||||
or r2,r0,r1 |
||||
bmsk_s r2,r2,1 |
||||
brne.d r2,0,charloop |
||||
mov_s r10,r0 |
||||
ld_s r3,[r1,0] |
||||
mov r8,0x01010101 |
||||
bbit0.d r1,2,loop_start |
||||
ror r12,r8 |
||||
sub r2,r3,r8 |
||||
bic_s r2,r2,r3 |
||||
tst_s r2,r12 |
||||
bne r3z |
||||
mov_s r4,r3 |
||||
.balign 4
|
||||
loop: |
||||
ld.a r3,[r1,4] |
||||
st.ab r4,[r10,4] |
||||
loop_start: |
||||
ld.a r4,[r1,4] |
||||
sub r2,r3,r8 |
||||
bic_s r2,r2,r3 |
||||
tst_s r2,r12 |
||||
bne_s r3z |
||||
st.ab r3,[r10,4] |
||||
sub r2,r4,r8 |
||||
bic r2,r2,r4 |
||||
tst r2,r12 |
||||
beq loop |
||||
mov_s r3,r4 |
||||
#ifdef __LITTLE_ENDIAN__ |
||||
r3z: bmsk.f r1,r3,7 |
||||
lsr_s r3,r3,8 |
||||
#else |
||||
r3z: lsr.f r1,r3,24 |
||||
asl_s r3,r3,8 |
||||
#endif |
||||
bne.d r3z |
||||
stb.ab r1,[r10,1] |
||||
j_s [blink] |
||||
|
||||
.balign 4
|
||||
charloop: |
||||
ldb.ab r3,[r1,1] |
||||
|
||||
|
||||
brne.d r3,0,charloop |
||||
stb.ab r3,[r10,1] |
||||
j [blink] |
||||
ARC_EXIT strcpy |
@ -0,0 +1,83 @@ |
||||
/* |
||||
* Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com) |
||||
* |
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as |
||||
* published by the Free Software Foundation. |
||||
*/ |
||||
|
||||
#include <asm/linkage.h> |
||||
|
||||
ARC_ENTRY strlen |
||||
or r3,r0,7 |
||||
ld r2,[r3,-7] |
||||
ld.a r6,[r3,-3] |
||||
mov r4,0x01010101 |
||||
; uses long immediate
|
||||
#ifdef __LITTLE_ENDIAN__ |
||||
asl_s r1,r0,3 |
||||
btst_s r0,2 |
||||
asl r7,r4,r1 |
||||
ror r5,r4 |
||||
sub r1,r2,r7 |
||||
bic_s r1,r1,r2 |
||||
mov.eq r7,r4 |
||||
sub r12,r6,r7 |
||||
bic r12,r12,r6 |
||||
or.eq r12,r12,r1 |
||||
and r12,r12,r5 |
||||
brne r12,0,.Learly_end |
||||
#else /* BIG ENDIAN */ |
||||
ror r5,r4 |
||||
btst_s r0,2 |
||||
mov_s r1,31 |
||||
sub3 r7,r1,r0 |
||||
sub r1,r2,r4 |
||||
bic_s r1,r1,r2 |
||||
bmsk r1,r1,r7 |
||||
sub r12,r6,r4 |
||||
bic r12,r12,r6 |
||||
bmsk.ne r12,r12,r7 |
||||
or.eq r12,r12,r1 |
||||
and r12,r12,r5 |
||||
brne r12,0,.Learly_end |
||||
#endif /* ENDIAN */ |
||||
|
||||
.Loop: |
||||
ld_s r2,[r3,4] |
||||
ld.a r6,[r3,8] |
||||
; stall for load result
|
||||
sub r1,r2,r4 |
||||
bic_s r1,r1,r2 |
||||
sub r12,r6,r4 |
||||
bic r12,r12,r6 |
||||
or r12,r12,r1 |
||||
and r12,r12,r5 |
||||
breq r12,0,.Loop |
||||
.Lend: |
||||
and.f r1,r1,r5 |
||||
sub.ne r3,r3,4 |
||||
mov.eq r1,r12 |
||||
#ifdef __LITTLE_ENDIAN__ |
||||
sub_s r2,r1,1 |
||||
bic_s r2,r2,r1 |
||||
norm r1,r2 |
||||
sub_s r0,r0,3 |
||||
lsr_s r1,r1,3 |
||||
sub r0,r3,r0 |
||||
j_s.d [blink] |
||||
sub r0,r0,r1 |
||||
#else /* BIG ENDIAN */ |
||||
lsr_s r1,r1,7 |
||||
mov.eq r2,r6 |
||||
bic_s r1,r1,r2 |
||||
norm r1,r1 |
||||
sub r0,r3,r0 |
||||
lsr_s r1,r1,3 |
||||
j_s.d [blink] |
||||
add r0,r0,r1 |
||||
#endif /* ENDIAN */ |
||||
.Learly_end: |
||||
b.d .Lend |
||||
sub_s.ne r1,r1,r1 |
||||
ARC_EXIT strlen |
Loading…
Reference in new issue