Mirror of https://github.com/ziglang/zig.git
remove memcpy and memmove from bundled libcs
These are provided instead by compiler_rt. Part of #2879
This commit is contained in:
commit e6dc85f1b4 (parent c748eb2416)
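The routines removed below must keep the standard C semantics once compiler_rt provides them. As a reference point only — not how compiler_rt implements them — a minimal byte-wise sketch of that contract (helper names are illustrative):

#include <stddef.h>

/* memcpy assumes the regions do not overlap; memmove must work either way. */
void *ref_memcpy(void *restrict dest, const void *restrict src, size_t n)
{
        unsigned char *d = dest;
        const unsigned char *s = src;
        while (n--) *d++ = *s++;
        return dest;
}

void *ref_memmove(void *dest, const void *src, size_t n)
{
        unsigned char *d = dest;
        const unsigned char *s = src;
        if (d < s)
                while (n--) *d++ = *s++;   /* forward: reads stay ahead of writes */
        else
                while (n--) d[n] = s[n];   /* backward: avoids clobbering unread bytes */
        return dest;
}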
186 lib/libc/musl/src/string/aarch64/memcpy.S (vendored)
@@ -1,186 +0,0 @@
/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2012-2020, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#define dstin x0
#define src x1
#define count x2
#define dst x3
#define srcend x4
#define dstend x5
#define A_l x6
#define A_lw w6
#define A_h x7
#define B_l x8
#define B_lw w8
#define B_h x9
#define C_l x10
#define C_lw w10
#define C_h x11
#define D_l x12
#define D_h x13
#define E_l x14
#define E_h x15
#define F_l x16
#define F_h x17
#define G_l count
#define G_h dst
#define H_l src
#define H_h srcend
#define tmp1 x14

/* This implementation of memcpy uses unaligned accesses and branchless
   sequences to keep the code small, simple and improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies. The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per iteration.
   The destination pointer is 16-byte aligned to minimize unaligned accesses.
   The loop tail is handled by always copying 64 bytes from the end.
*/

.global memcpy
.type memcpy,%function
memcpy:
        add srcend, src, count
        add dstend, dstin, count
        cmp count, 128
        b.hi .Lcopy_long
        cmp count, 32
        b.hi .Lcopy32_128

        /* Small copies: 0..32 bytes. */
        cmp count, 16
        b.lo .Lcopy16
        ldp A_l, A_h, [src]
        ldp D_l, D_h, [srcend, -16]
        stp A_l, A_h, [dstin]
        stp D_l, D_h, [dstend, -16]
        ret

        /* Copy 8-15 bytes. */
.Lcopy16:
        tbz count, 3, .Lcopy8
        ldr A_l, [src]
        ldr A_h, [srcend, -8]
        str A_l, [dstin]
        str A_h, [dstend, -8]
        ret

        .p2align 3
        /* Copy 4-7 bytes. */
.Lcopy8:
        tbz count, 2, .Lcopy4
        ldr A_lw, [src]
        ldr B_lw, [srcend, -4]
        str A_lw, [dstin]
        str B_lw, [dstend, -4]
        ret

        /* Copy 0..3 bytes using a branchless sequence. */
.Lcopy4:
        cbz count, .Lcopy0
        lsr tmp1, count, 1
        ldrb A_lw, [src]
        ldrb C_lw, [srcend, -1]
        ldrb B_lw, [src, tmp1]
        strb A_lw, [dstin]
        strb B_lw, [dstin, tmp1]
        strb C_lw, [dstend, -1]
.Lcopy0:
        ret

        .p2align 4
        /* Medium copies: 33..128 bytes. */
.Lcopy32_128:
        ldp A_l, A_h, [src]
        ldp B_l, B_h, [src, 16]
        ldp C_l, C_h, [srcend, -32]
        ldp D_l, D_h, [srcend, -16]
        cmp count, 64
        b.hi .Lcopy128
        stp A_l, A_h, [dstin]
        stp B_l, B_h, [dstin, 16]
        stp C_l, C_h, [dstend, -32]
        stp D_l, D_h, [dstend, -16]
        ret

        .p2align 4
        /* Copy 65..128 bytes. */
.Lcopy128:
        ldp E_l, E_h, [src, 32]
        ldp F_l, F_h, [src, 48]
        cmp count, 96
        b.ls .Lcopy96
        ldp G_l, G_h, [srcend, -64]
        ldp H_l, H_h, [srcend, -48]
        stp G_l, G_h, [dstend, -64]
        stp H_l, H_h, [dstend, -48]
.Lcopy96:
        stp A_l, A_h, [dstin]
        stp B_l, B_h, [dstin, 16]
        stp E_l, E_h, [dstin, 32]
        stp F_l, F_h, [dstin, 48]
        stp C_l, C_h, [dstend, -32]
        stp D_l, D_h, [dstend, -16]
        ret

        .p2align 4
        /* Copy more than 128 bytes. */
.Lcopy_long:

        /* Copy 16 bytes and then align dst to 16-byte alignment. */

        ldp D_l, D_h, [src]
        and tmp1, dstin, 15
        bic dst, dstin, 15
        sub src, src, tmp1
        add count, count, tmp1 /* Count is now 16 too large. */
        ldp A_l, A_h, [src, 16]
        stp D_l, D_h, [dstin]
        ldp B_l, B_h, [src, 32]
        ldp C_l, C_h, [src, 48]
        ldp D_l, D_h, [src, 64]!
        subs count, count, 128 + 16 /* Test and readjust count. */
        b.ls .Lcopy64_from_end

.Lloop64:
        stp A_l, A_h, [dst, 16]
        ldp A_l, A_h, [src, 16]
        stp B_l, B_h, [dst, 32]
        ldp B_l, B_h, [src, 32]
        stp C_l, C_h, [dst, 48]
        ldp C_l, C_h, [src, 48]
        stp D_l, D_h, [dst, 64]!
        ldp D_l, D_h, [src, 64]!
        subs count, count, 64
        b.hi .Lloop64

        /* Write the last iteration and copy 64 bytes from the end. */
.Lcopy64_from_end:
        ldp E_l, E_h, [srcend, -64]
        stp A_l, A_h, [dst, 16]
        ldp A_l, A_h, [srcend, -48]
        stp B_l, B_h, [dst, 32]
        ldp B_l, B_h, [srcend, -32]
        stp C_l, C_h, [dst, 48]
        ldp C_l, C_h, [srcend, -16]
        stp D_l, D_h, [dst, 64]
        stp E_l, E_h, [dstend, -64]
        stp A_l, A_h, [dstend, -48]
        stp B_l, B_h, [dstend, -32]
        stp C_l, C_h, [dstend, -16]
        ret

        .size memcpy,.-memcpy
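The comment block in this file names the key trick: instead of a length-dependent tail loop, fixed-size loads are taken from both ends of the buffer and the stores are allowed to overlap. A hypothetical C analogue of the 16..32-byte case, assuming non-overlapping src/dst (not taken from the commit):

#include <stdint.h>
#include <string.h>
#include <stddef.h>

/* Copy n bytes, 16 <= n <= 32: read the first and last 16 bytes, then
 * store both; the stores overlap in the middle when n < 32, which is
 * harmless and removes all branching on the exact length. */
static void copy16_32(unsigned char *d, const unsigned char *s, size_t n)
{
        uint64_t head[2], tail[2];
        memcpy(head, s, 16);          /* like: ldp A_l, A_h, [src] */
        memcpy(tail, s + n - 16, 16); /* like: ldp D_l, D_h, [srcend, -16] */
        memcpy(d, head, 16);          /* like: stp A_l, A_h, [dstin] */
        memcpy(d + n - 16, tail, 16); /* like: stp D_l, D_h, [dstend, -16] */
}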
45 lib/libc/musl/src/string/arm/__aeabi_memcpy.s (vendored)
@@ -1,45 +0,0 @@
.syntax unified

.global __aeabi_memcpy8
.global __aeabi_memcpy4
.global __aeabi_memcpy
.global __aeabi_memmove8
.global __aeabi_memmove4
.global __aeabi_memmove

.type __aeabi_memcpy8,%function
.type __aeabi_memcpy4,%function
.type __aeabi_memcpy,%function
.type __aeabi_memmove8,%function
.type __aeabi_memmove4,%function
.type __aeabi_memmove,%function

__aeabi_memmove8:
__aeabi_memmove4:
__aeabi_memmove:
        cmp r0, r1
        bls 3f
        cmp r2, #0
        beq 2f
        adds r0, r0, r2
        adds r2, r1, r2
1:      subs r2, r2, #1
        ldrb r3, [r2]
        subs r0, r0, #1
        strb r3, [r0]
        cmp r1, r2
        bne 1b
2:      bx lr
__aeabi_memcpy8:
__aeabi_memcpy4:
__aeabi_memcpy:
3:      cmp r2, #0
        beq 2f
        adds r2, r1, r2
1:      ldrb r3, [r1]
        adds r1, r1, #1
        strb r3, [r0]
        adds r0, r0, #1
        cmp r1, r2
        bne 1b
2:      bx lr
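A note on these entry points: as I read the ARM EABI, these run-time helpers return void rather than the destination pointer, and the 4/8-suffixed variants may assume 4- or 8-byte-aligned arguments; the file above simply aliases every entry point to the same byte loop, trading speed for size. Prototypes as a caller would see them (my reading of the EABI, not stated in this commit):

#include <stddef.h>

/* ARM EABI memory helpers: void return; the suffixed variants are
 * permitted to assume the stated pointer alignment. */
void __aeabi_memcpy(void *dest, const void *src, size_t n);
void __aeabi_memcpy4(void *dest, const void *src, size_t n);  /* 4-byte aligned */
void __aeabi_memcpy8(void *dest, const void *src, size_t n);  /* 8-byte aligned */
void __aeabi_memmove(void *dest, const void *src, size_t n);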
479 lib/libc/musl/src/string/arm/memcpy.S (vendored)
@@ -1,479 +0,0 @@
/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */


/*
 * Optimized memcpy() for ARM.
 *
 * note that memcpy() always returns the destination pointer,
 * so we have to preserve R0.
 */

/*
 * This file has been modified from the original for use in musl libc.
 * The main changes are: addition of .type memcpy,%function to make the
 * code safely callable from thumb mode, adjusting the return
 * instructions to be compatible with pre-thumb ARM cpus, removal of
 * prefetch code that is not compatible with older cpus and support for
 * building as thumb 2 and big-endian.
 */

.syntax unified

.global memcpy
.type memcpy,%function
memcpy:
        /* The stack must always be 64-bits aligned to be compliant with the
         * ARM ABI. Since we have to save R0, we might as well save R4
         * which we can use for better pipelining of the reads below
         */
        .fnstart
        .save {r0, r4, lr}
        stmfd sp!, {r0, r4, lr}
        /* Making room for r5-r11 which will be spilled later */
        .pad #28
        sub sp, sp, #28

        /* it simplifies things to take care of len<4 early */
        cmp r2, #4
        blo copy_last_3_and_return

        /* compute the offset to align the source
         * offset = (4-(src&3))&3 = -src & 3
         */
        rsb r3, r1, #0
        ands r3, r3, #3
        beq src_aligned

        /* align source to 32 bits. We need to insert 2 instructions between
         * a ldr[b|h] and str[b|h] because byte and half-word instructions
         * stall 2 cycles.
         */
        movs r12, r3, lsl #31
        sub r2, r2, r3 /* we know that r3 <= r2 because r2 >= 4 */
        ldrbmi r3, [r1], #1
        ldrbcs r4, [r1], #1
        ldrbcs r12,[r1], #1
        strbmi r3, [r0], #1
        strbcs r4, [r0], #1
        strbcs r12,[r0], #1

src_aligned:

        /* see if src and dst are aligned together (congruent) */
        eor r12, r0, r1
        tst r12, #3
        bne non_congruent

        /* Use post-incriment mode for stm to spill r5-r11 to reserved stack
         * frame. Don't update sp.
         */
        stmea sp, {r5-r11}

        /* align the destination to a cache-line */
        rsb r3, r0, #0
        ands r3, r3, #0x1C
        beq congruent_aligned32
        cmp r3, r2
        andhi r3, r2, #0x1C

        /* conditionnaly copies 0 to 7 words (length in r3) */
        movs r12, r3, lsl #28
        ldmcs r1!, {r4, r5, r6, r7} /* 16 bytes */
        ldmmi r1!, {r8, r9} /* 8 bytes */
        stmcs r0!, {r4, r5, r6, r7}
        stmmi r0!, {r8, r9}
        tst r3, #0x4
        ldrne r10,[r1], #4 /* 4 bytes */
        strne r10,[r0], #4
        sub r2, r2, r3

congruent_aligned32:
        /*
         * here source is aligned to 32 bytes.
         */

cached_aligned32:
        subs r2, r2, #32
        blo less_than_32_left

        /*
         * We preload a cache-line up to 64 bytes ahead. On the 926, this will
         * stall only until the requested world is fetched, but the linefill
         * continues in the the background.
         * While the linefill is going, we write our previous cache-line
         * into the write-buffer (which should have some free space).
         * When the linefill is done, the writebuffer will
         * start dumping its content into memory
         *
         * While all this is going, we then load a full cache line into
         * 8 registers, this cache line should be in the cache by now
         * (or partly in the cache).
         *
         * This code should work well regardless of the source/dest alignment.
         *
         */

        /* Align the preload register to a cache-line because the cpu does
         * "critical word first" (the first word requested is loaded first).
         */
        @ bic r12, r1, #0x1F
        @ add r12, r12, #64

1:      ldmia r1!, { r4-r11 }
        subs r2, r2, #32

        /*
         * NOTE: if r12 is more than 64 ahead of r1, the following ldrhi
         * for ARM9 preload will not be safely guarded by the preceding subs.
         * When it is safely guarded the only possibility to have SIGSEGV here
         * is because the caller overstates the length.
         */
        @ ldrhi r3, [r12], #32 /* cheap ARM9 preload */
        stmia r0!, { r4-r11 }
        bhs 1b

        add r2, r2, #32

less_than_32_left:
        /*
         * less than 32 bytes left at this point (length in r2)
         */

        /* skip all this if there is nothing to do, which should
         * be a common case (if not executed the code below takes
         * about 16 cycles)
         */
        tst r2, #0x1F
        beq 1f

        /* conditionnaly copies 0 to 31 bytes */
        movs r12, r2, lsl #28
        ldmcs r1!, {r4, r5, r6, r7} /* 16 bytes */
        ldmmi r1!, {r8, r9} /* 8 bytes */
        stmcs r0!, {r4, r5, r6, r7}
        stmmi r0!, {r8, r9}
        movs r12, r2, lsl #30
        ldrcs r3, [r1], #4 /* 4 bytes */
        ldrhmi r4, [r1], #2 /* 2 bytes */
        strcs r3, [r0], #4
        strhmi r4, [r0], #2
        tst r2, #0x1
        ldrbne r3, [r1] /* last byte */
        strbne r3, [r0]

        /* we're done! restore everything and return */
1:      ldmfd sp!, {r5-r11}
        ldmfd sp!, {r0, r4, lr}
        bx lr

        /********************************************************************/

non_congruent:
        /*
         * here source is aligned to 4 bytes
         * but destination is not.
         *
         * in the code below r2 is the number of bytes read
         * (the number of bytes written is always smaller, because we have
         * partial words in the shift queue)
         */
        cmp r2, #4
        blo copy_last_3_and_return

        /* Use post-incriment mode for stm to spill r5-r11 to reserved stack
         * frame. Don't update sp.
         */
        stmea sp, {r5-r11}

        /* compute shifts needed to align src to dest */
        rsb r5, r0, #0
        and r5, r5, #3 /* r5 = # bytes in partial words */
        mov r12, r5, lsl #3 /* r12 = right */
        rsb lr, r12, #32 /* lr = left */

        /* read the first word */
        ldr r3, [r1], #4
        sub r2, r2, #4

        /* write a partial word (0 to 3 bytes), such that destination
         * becomes aligned to 32 bits (r5 = nb of words to copy for alignment)
         */
        movs r5, r5, lsl #31

#if __ARMEB__
        movmi r3, r3, ror #24
        strbmi r3, [r0], #1
        movcs r3, r3, ror #24
        strbcs r3, [r0], #1
        movcs r3, r3, ror #24
        strbcs r3, [r0], #1
#else
        strbmi r3, [r0], #1
        movmi r3, r3, lsr #8
        strbcs r3, [r0], #1
        movcs r3, r3, lsr #8
        strbcs r3, [r0], #1
        movcs r3, r3, lsr #8
#endif

        cmp r2, #4
        blo partial_word_tail

#if __ARMEB__
        mov r3, r3, lsr r12
        mov r3, r3, lsl r12
#endif

        /* Align destination to 32 bytes (cache line boundary) */
1:      tst r0, #0x1c
        beq 2f
        ldr r5, [r1], #4
        sub r2, r2, #4
#if __ARMEB__
        mov r4, r5, lsr lr
        orr r4, r4, r3
        mov r3, r5, lsl r12
#else
        mov r4, r5, lsl lr
        orr r4, r4, r3
        mov r3, r5, lsr r12
#endif
        str r4, [r0], #4
        cmp r2, #4
        bhs 1b
        blo partial_word_tail

        /* copy 32 bytes at a time */
2:      subs r2, r2, #32
        blo less_than_thirtytwo

        /* Use immediate mode for the shifts, because there is an extra cycle
         * for register shifts, which could account for up to 50% of
         * performance hit.
         */

        cmp r12, #24
        beq loop24
        cmp r12, #8
        beq loop8

loop16:
        ldr r12, [r1], #4
1:      mov r4, r12
        ldmia r1!, { r5,r6,r7, r8,r9,r10,r11}
        subs r2, r2, #32
        ldrhs r12, [r1], #4
#if __ARMEB__
        orr r3, r3, r4, lsr #16
        mov r4, r4, lsl #16
        orr r4, r4, r5, lsr #16
        mov r5, r5, lsl #16
        orr r5, r5, r6, lsr #16
        mov r6, r6, lsl #16
        orr r6, r6, r7, lsr #16
        mov r7, r7, lsl #16
        orr r7, r7, r8, lsr #16
        mov r8, r8, lsl #16
        orr r8, r8, r9, lsr #16
        mov r9, r9, lsl #16
        orr r9, r9, r10, lsr #16
        mov r10, r10, lsl #16
        orr r10, r10, r11, lsr #16
        stmia r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
        mov r3, r11, lsl #16
#else
        orr r3, r3, r4, lsl #16
        mov r4, r4, lsr #16
        orr r4, r4, r5, lsl #16
        mov r5, r5, lsr #16
        orr r5, r5, r6, lsl #16
        mov r6, r6, lsr #16
        orr r6, r6, r7, lsl #16
        mov r7, r7, lsr #16
        orr r7, r7, r8, lsl #16
        mov r8, r8, lsr #16
        orr r8, r8, r9, lsl #16
        mov r9, r9, lsr #16
        orr r9, r9, r10, lsl #16
        mov r10, r10, lsr #16
        orr r10, r10, r11, lsl #16
        stmia r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
        mov r3, r11, lsr #16
#endif
        bhs 1b
        b less_than_thirtytwo

loop8:
        ldr r12, [r1], #4
1:      mov r4, r12
        ldmia r1!, { r5,r6,r7, r8,r9,r10,r11}
        subs r2, r2, #32
        ldrhs r12, [r1], #4
#if __ARMEB__
        orr r3, r3, r4, lsr #24
        mov r4, r4, lsl #8
        orr r4, r4, r5, lsr #24
        mov r5, r5, lsl #8
        orr r5, r5, r6, lsr #24
        mov r6, r6, lsl #8
        orr r6, r6, r7, lsr #24
        mov r7, r7, lsl #8
        orr r7, r7, r8, lsr #24
        mov r8, r8, lsl #8
        orr r8, r8, r9, lsr #24
        mov r9, r9, lsl #8
        orr r9, r9, r10, lsr #24
        mov r10, r10, lsl #8
        orr r10, r10, r11, lsr #24
        stmia r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
        mov r3, r11, lsl #8
#else
        orr r3, r3, r4, lsl #24
        mov r4, r4, lsr #8
        orr r4, r4, r5, lsl #24
        mov r5, r5, lsr #8
        orr r5, r5, r6, lsl #24
        mov r6, r6, lsr #8
        orr r6, r6, r7, lsl #24
        mov r7, r7, lsr #8
        orr r7, r7, r8, lsl #24
        mov r8, r8, lsr #8
        orr r8, r8, r9, lsl #24
        mov r9, r9, lsr #8
        orr r9, r9, r10, lsl #24
        mov r10, r10, lsr #8
        orr r10, r10, r11, lsl #24
        stmia r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
        mov r3, r11, lsr #8
#endif
        bhs 1b
        b less_than_thirtytwo

loop24:
        ldr r12, [r1], #4
1:      mov r4, r12
        ldmia r1!, { r5,r6,r7, r8,r9,r10,r11}
        subs r2, r2, #32
        ldrhs r12, [r1], #4
#if __ARMEB__
        orr r3, r3, r4, lsr #8
        mov r4, r4, lsl #24
        orr r4, r4, r5, lsr #8
        mov r5, r5, lsl #24
        orr r5, r5, r6, lsr #8
        mov r6, r6, lsl #24
        orr r6, r6, r7, lsr #8
        mov r7, r7, lsl #24
        orr r7, r7, r8, lsr #8
        mov r8, r8, lsl #24
        orr r8, r8, r9, lsr #8
        mov r9, r9, lsl #24
        orr r9, r9, r10, lsr #8
        mov r10, r10, lsl #24
        orr r10, r10, r11, lsr #8
        stmia r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
        mov r3, r11, lsl #24
#else
        orr r3, r3, r4, lsl #8
        mov r4, r4, lsr #24
        orr r4, r4, r5, lsl #8
        mov r5, r5, lsr #24
        orr r5, r5, r6, lsl #8
        mov r6, r6, lsr #24
        orr r6, r6, r7, lsl #8
        mov r7, r7, lsr #24
        orr r7, r7, r8, lsl #8
        mov r8, r8, lsr #24
        orr r8, r8, r9, lsl #8
        mov r9, r9, lsr #24
        orr r9, r9, r10, lsl #8
        mov r10, r10, lsr #24
        orr r10, r10, r11, lsl #8
        stmia r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
        mov r3, r11, lsr #24
#endif
        bhs 1b

less_than_thirtytwo:
        /* copy the last 0 to 31 bytes of the source */
        rsb r12, lr, #32 /* we corrupted r12, recompute it */
        add r2, r2, #32
        cmp r2, #4
        blo partial_word_tail

1:      ldr r5, [r1], #4
        sub r2, r2, #4
#if __ARMEB__
        mov r4, r5, lsr lr
        orr r4, r4, r3
        mov r3, r5, lsl r12
#else
        mov r4, r5, lsl lr
        orr r4, r4, r3
        mov r3, r5, lsr r12
#endif
        str r4, [r0], #4
        cmp r2, #4
        bhs 1b

partial_word_tail:
        /* we have a partial word in the input buffer */
        movs r5, lr, lsl #(31-3)
#if __ARMEB__
        movmi r3, r3, ror #24
        strbmi r3, [r0], #1
        movcs r3, r3, ror #24
        strbcs r3, [r0], #1
        movcs r3, r3, ror #24
        strbcs r3, [r0], #1
#else
        strbmi r3, [r0], #1
        movmi r3, r3, lsr #8
        strbcs r3, [r0], #1
        movcs r3, r3, lsr #8
        strbcs r3, [r0], #1
#endif

        /* Refill spilled registers from the stack. Don't update sp. */
        ldmfd sp, {r5-r11}

copy_last_3_and_return:
        movs r2, r2, lsl #31 /* copy remaining 0, 1, 2 or 3 bytes */
        ldrbmi r2, [r1], #1
        ldrbcs r3, [r1], #1
        ldrbcs r12,[r1]
        strbmi r2, [r0], #1
        strbcs r3, [r0], #1
        strbcs r12,[r0]

        /* we're done! restore sp and spilled registers and return */
        add sp, sp, #28
        ldmfd sp!, {r0, r4, lr}
        bx lr
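The non_congruent path above is the interesting part: the source is read word-aligned and each destination word is assembled from two consecutive source words using a fixed shift pair (the r12/lr "right"/"left" values). A little-endian C sketch of that shift queue, with the alignment fix-up and tails omitted (hypothetical helper, not from the source):

#include <stdint.h>
#include <stddef.h>

/* right = 8 * (misalignment in bytes), so right is 8, 16 or 24 here; the
 * asm only takes this path when src and dst are not congruent, so the
 * shifts by 0/32 (undefined in C) never occur. */
static void shift_copy_le(uint32_t *d, const uint32_t *s,
                          size_t words, unsigned right)
{
        unsigned left = 32 - right;
        uint32_t w = *s++;              /* prime the queue */
        while (words--) {
                uint32_t x = *s++;
                *d++ = (w >> right) | (x << left);
                w = x;
        }
}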
32 lib/libc/musl/src/string/i386/memcpy.s (vendored)
@@ -1,32 +0,0 @@
.global memcpy
.global __memcpy_fwd
.hidden __memcpy_fwd
.type memcpy,@function
memcpy:
__memcpy_fwd:
        push %esi
        push %edi
        mov 12(%esp),%edi
        mov 16(%esp),%esi
        mov 20(%esp),%ecx
        mov %edi,%eax
        cmp $4,%ecx
        jc 1f
        test $3,%edi
        jz 1f
2:      movsb
        dec %ecx
        test $3,%edi
        jnz 2b
1:      mov %ecx,%edx
        shr $2,%ecx
        rep
        movsl
        and $3,%edx
        jz 1f
2:      movsb
        dec %edx
        jnz 2b
1:      pop %edi
        pop %esi
        ret
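In C terms, the sequence above byte-copies until the destination is 4-byte aligned (the first movsb loop), bulk-copies whole words with rep movsl, then finishes the 0-3 leftover bytes. A rough equivalent, assuming an x86-like target where the possibly unaligned 32-bit source reads are acceptable (the helper name is illustrative):

#include <stdint.h>
#include <stddef.h>

static void *memcpy_words(void *dest, const void *src, size_t n)
{
        unsigned char *d = dest;
        const unsigned char *s = src;
        while (n >= 4 && ((uintptr_t)d & 3)) { *d++ = *s++; n--; }  /* align dst */
        for (; n >= 4; n -= 4, d += 4, s += 4)
                *(uint32_t *)d = *(const uint32_t *)s;  /* the rep movsl part */
        while (n--) *d++ = *s++;                         /* 0-3 byte tail */
        return dest;
}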
22 lib/libc/musl/src/string/i386/memmove.s (vendored)
@@ -1,22 +0,0 @@
.global memmove
.type memmove,@function
memmove:
        mov 4(%esp),%eax
        sub 8(%esp),%eax
        cmp 12(%esp),%eax
.hidden __memcpy_fwd
        jae __memcpy_fwd
        push %esi
        push %edi
        mov 12(%esp),%edi
        mov 16(%esp),%esi
        mov 20(%esp),%ecx
        lea -1(%edi,%ecx),%edi
        lea -1(%esi,%ecx),%esi
        std
        rep movsb
        cld
        lea 1(%edi),%eax
        pop %edi
        pop %esi
        ret
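The three-instruction prologue (sub/cmp/jae) is an unsigned-wraparound test: when dest - src >= n as an unsigned value, a plain forward copy can never overwrite source bytes it has not yet read, so memmove tail-calls __memcpy_fwd; otherwise it copies backward with the direction flag set. The same check in C:

#include <stdint.h>
#include <stddef.h>

/* True when a forward byte-by-byte copy is safe: either dest starts at
 * least n bytes past src, or dest is below src (the unsigned subtraction
 * wraps around, producing a large value >= n). */
static int forward_copy_is_safe(const void *dest, const void *src, size_t n)
{
        return (uintptr_t)dest - (uintptr_t)src >= n;
}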
124 lib/libc/musl/src/string/memcpy.c (vendored)
@@ -1,124 +0,0 @@
#include <string.h>
#include <stdint.h>
#include <endian.h>

void *memcpy(void *restrict dest, const void *restrict src, size_t n)
{
        unsigned char *d = dest;
        const unsigned char *s = src;

#ifdef __GNUC__

#if __BYTE_ORDER == __LITTLE_ENDIAN
#define LS >>
#define RS <<
#else
#define LS <<
#define RS >>
#endif

        typedef uint32_t __attribute__((__may_alias__)) u32;
        uint32_t w, x;

        for (; (uintptr_t)s % 4 && n; n--) *d++ = *s++;

        if ((uintptr_t)d % 4 == 0) {
                for (; n>=16; s+=16, d+=16, n-=16) {
                        *(u32 *)(d+0) = *(u32 *)(s+0);
                        *(u32 *)(d+4) = *(u32 *)(s+4);
                        *(u32 *)(d+8) = *(u32 *)(s+8);
                        *(u32 *)(d+12) = *(u32 *)(s+12);
                }
                if (n&8) {
                        *(u32 *)(d+0) = *(u32 *)(s+0);
                        *(u32 *)(d+4) = *(u32 *)(s+4);
                        d += 8; s += 8;
                }
                if (n&4) {
                        *(u32 *)(d+0) = *(u32 *)(s+0);
                        d += 4; s += 4;
                }
                if (n&2) {
                        *d++ = *s++; *d++ = *s++;
                }
                if (n&1) {
                        *d = *s;
                }
                return dest;
        }

        if (n >= 32) switch ((uintptr_t)d % 4) {
        case 1:
                w = *(u32 *)s;
                *d++ = *s++;
                *d++ = *s++;
                *d++ = *s++;
                n -= 3;
                for (; n>=17; s+=16, d+=16, n-=16) {
                        x = *(u32 *)(s+1);
                        *(u32 *)(d+0) = (w LS 24) | (x RS 8);
                        w = *(u32 *)(s+5);
                        *(u32 *)(d+4) = (x LS 24) | (w RS 8);
                        x = *(u32 *)(s+9);
                        *(u32 *)(d+8) = (w LS 24) | (x RS 8);
                        w = *(u32 *)(s+13);
                        *(u32 *)(d+12) = (x LS 24) | (w RS 8);
                }
                break;
        case 2:
                w = *(u32 *)s;
                *d++ = *s++;
                *d++ = *s++;
                n -= 2;
                for (; n>=18; s+=16, d+=16, n-=16) {
                        x = *(u32 *)(s+2);
                        *(u32 *)(d+0) = (w LS 16) | (x RS 16);
                        w = *(u32 *)(s+6);
                        *(u32 *)(d+4) = (x LS 16) | (w RS 16);
                        x = *(u32 *)(s+10);
                        *(u32 *)(d+8) = (w LS 16) | (x RS 16);
                        w = *(u32 *)(s+14);
                        *(u32 *)(d+12) = (x LS 16) | (w RS 16);
                }
                break;
        case 3:
                w = *(u32 *)s;
                *d++ = *s++;
                n -= 1;
                for (; n>=19; s+=16, d+=16, n-=16) {
                        x = *(u32 *)(s+3);
                        *(u32 *)(d+0) = (w LS 8) | (x RS 24);
                        w = *(u32 *)(s+7);
                        *(u32 *)(d+4) = (x LS 8) | (w RS 24);
                        x = *(u32 *)(s+11);
                        *(u32 *)(d+8) = (w LS 8) | (x RS 24);
                        w = *(u32 *)(s+15);
                        *(u32 *)(d+12) = (x LS 8) | (w RS 24);
                }
                break;
        }
        if (n&16) {
                *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++;
                *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++;
                *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++;
                *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++;
        }
        if (n&8) {
                *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++;
                *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++;
        }
        if (n&4) {
                *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++;
        }
        if (n&2) {
                *d++ = *s++; *d++ = *s++;
        }
        if (n&1) {
                *d = *s;
        }
        return dest;
#endif

        for (; n; n--) *d++ = *s++;
        return dest;
}
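A worked example of the misaligned case above (little-endian, so LS is >> and RS is <<): with d % 4 == 1, three bytes are copied to align d, after which each aligned destination word is stitched together from two word-aligned source reads.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* source bytes in memory: 00 11 22 33 44 55 66 77 (word-aligned) */
        uint32_t w = 0x33221100;             /* *(u32 *)s     = bytes 0..3 */
        uint32_t x = 0x77665544;             /* *(u32 *)(s+4) = bytes 4..7 */
        uint32_t out = (w >> 24) | (x << 8); /* (w LS 24) | (x RS 8) */
        printf("0x%08x\n", out);             /* 0x66554433 = bytes 33 44 55 66 */
        return 0;
}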
42 lib/libc/musl/src/string/memmove.c (vendored)
@@ -1,42 +0,0 @@
#include <string.h>
#include <stdint.h>

#ifdef __GNUC__
typedef __attribute__((__may_alias__)) size_t WT;
#define WS (sizeof(WT))
#endif

void *memmove(void *dest, const void *src, size_t n)
{
        char *d = dest;
        const char *s = src;

        if (d==s) return d;
        if ((uintptr_t)s-(uintptr_t)d-n <= -2*n) return memcpy(d, s, n);

        if (d<s) {
#ifdef __GNUC__
                if ((uintptr_t)s % WS == (uintptr_t)d % WS) {
                        while ((uintptr_t)d % WS) {
                                if (!n--) return dest;
                                *d++ = *s++;
                        }
                        for (; n>=WS; n-=WS, d+=WS, s+=WS) *(WT *)d = *(WT *)s;
                }
#endif
                for (; n; n--) *d++ = *s++;
        } else {
#ifdef __GNUC__
                if ((uintptr_t)s % WS == (uintptr_t)d % WS) {
                        while ((uintptr_t)(d+n) % WS) {
                                if (!n--) return dest;
                                d[n] = s[n];
                        }
                        while (n>=WS) n-=WS, *(WT *)(d+n) = *(WT *)(s+n);
                }
#endif
                while (n) n--, d[n] = s[n];
        }

        return dest;
}
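The test `(uintptr_t)s-(uintptr_t)d-n <= -2*n` deserves unpacking: in W-bit unsigned arithmetic it holds exactly when n <= s-d <= 2^W - n, i.e. the two n-byte regions are at least n bytes apart in both directions, so the buffers cannot overlap and plain memcpy is safe. The same check isolated as a predicate:

#include <stdint.h>
#include <stddef.h>

/* Disjointness check used by memmove above: true iff [d, d+n) and
 * [s, s+n) do not overlap, relying on unsigned wraparound. */
static int regions_disjoint(const void *s, const void *d, size_t n)
{
        uintptr_t diff = (uintptr_t)s - (uintptr_t)d;
        return diff - n <= (uintptr_t)0 - 2 * (uintptr_t)n;
}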
25 lib/libc/musl/src/string/x86_64/memcpy.s (vendored)
@@ -1,25 +0,0 @@
.global memcpy
.global __memcpy_fwd
.hidden __memcpy_fwd
.type memcpy,@function
memcpy:
__memcpy_fwd:
        mov %rdi,%rax
        cmp $8,%rdx
        jc 1f
        test $7,%edi
        jz 1f
2:      movsb
        dec %rdx
        test $7,%edi
        jnz 2b
1:      mov %rdx,%rcx
        shr $3,%rcx
        rep
        movsq
        and $7,%edx
        jz 1f
2:      movsb
        dec %edx
        jnz 2b
1:      ret
16 lib/libc/musl/src/string/x86_64/memmove.s (vendored)
@@ -1,16 +0,0 @@
.global memmove
.type memmove,@function
memmove:
        mov %rdi,%rax
        sub %rsi,%rax
        cmp %rdx,%rax
.hidden __memcpy_fwd
        jae __memcpy_fwd
        mov %rdx,%rcx
        lea -1(%rdi,%rdx),%rdi
        lea -1(%rsi,%rdx),%rsi
        std
        rep movsb
        cld
        lea 1(%rdi),%rax
        ret
128 lib/libc/wasi/libc-top-half/musl/src/string/memcpy.c (vendored)
@@ -1,128 +0,0 @@
#include <string.h>
#include <stdint.h>
#include <endian.h>

void *memcpy(void *restrict dest, const void *restrict src, size_t n)
{
#if defined(__wasm_bulk_memory__)
        if (n > BULK_MEMORY_THRESHOLD)
                return __builtin_memcpy(dest, src, n);
#endif
        unsigned char *d = dest;
        const unsigned char *s = src;

#ifdef __GNUC__

#if __BYTE_ORDER == __LITTLE_ENDIAN
#define LS >>
#define RS <<
#else
#define LS <<
#define RS >>
#endif

        typedef uint32_t __attribute__((__may_alias__)) u32;
        uint32_t w, x;

        for (; (uintptr_t)s % 4 && n; n--) *d++ = *s++;

        if ((uintptr_t)d % 4 == 0) {
                for (; n>=16; s+=16, d+=16, n-=16) {
                        *(u32 *)(d+0) = *(u32 *)(s+0);
                        *(u32 *)(d+4) = *(u32 *)(s+4);
                        *(u32 *)(d+8) = *(u32 *)(s+8);
                        *(u32 *)(d+12) = *(u32 *)(s+12);
                }
                if (n&8) {
                        *(u32 *)(d+0) = *(u32 *)(s+0);
                        *(u32 *)(d+4) = *(u32 *)(s+4);
                        d += 8; s += 8;
                }
                if (n&4) {
                        *(u32 *)(d+0) = *(u32 *)(s+0);
                        d += 4; s += 4;
                }
                if (n&2) {
                        *d++ = *s++; *d++ = *s++;
                }
                if (n&1) {
                        *d = *s;
                }
                return dest;
        }

        if (n >= 32) switch ((uintptr_t)d % 4) {
        case 1:
                w = *(u32 *)s;
                *d++ = *s++;
                *d++ = *s++;
                *d++ = *s++;
                n -= 3;
                for (; n>=17; s+=16, d+=16, n-=16) {
                        x = *(u32 *)(s+1);
                        *(u32 *)(d+0) = (w LS 24) | (x RS 8);
                        w = *(u32 *)(s+5);
                        *(u32 *)(d+4) = (x LS 24) | (w RS 8);
                        x = *(u32 *)(s+9);
                        *(u32 *)(d+8) = (w LS 24) | (x RS 8);
                        w = *(u32 *)(s+13);
                        *(u32 *)(d+12) = (x LS 24) | (w RS 8);
                }
                break;
        case 2:
                w = *(u32 *)s;
                *d++ = *s++;
                *d++ = *s++;
                n -= 2;
                for (; n>=18; s+=16, d+=16, n-=16) {
                        x = *(u32 *)(s+2);
                        *(u32 *)(d+0) = (w LS 16) | (x RS 16);
                        w = *(u32 *)(s+6);
                        *(u32 *)(d+4) = (x LS 16) | (w RS 16);
                        x = *(u32 *)(s+10);
                        *(u32 *)(d+8) = (w LS 16) | (x RS 16);
                        w = *(u32 *)(s+14);
                        *(u32 *)(d+12) = (x LS 16) | (w RS 16);
                }
                break;
        case 3:
                w = *(u32 *)s;
                *d++ = *s++;
                n -= 1;
                for (; n>=19; s+=16, d+=16, n-=16) {
                        x = *(u32 *)(s+3);
                        *(u32 *)(d+0) = (w LS 8) | (x RS 24);
                        w = *(u32 *)(s+7);
                        *(u32 *)(d+4) = (x LS 8) | (w RS 24);
                        x = *(u32 *)(s+11);
                        *(u32 *)(d+8) = (w LS 8) | (x RS 24);
                        w = *(u32 *)(s+15);
                        *(u32 *)(d+12) = (x LS 8) | (w RS 24);
                }
                break;
        }
        if (n&16) {
                *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++;
                *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++;
                *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++;
                *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++;
        }
        if (n&8) {
                *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++;
                *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++;
        }
        if (n&4) {
                *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++;
        }
        if (n&2) {
                *d++ = *s++; *d++ = *s++;
        }
        if (n&1) {
                *d = *s;
        }
        return dest;
#endif

        for (; n; n--) *d++ = *s++;
        return dest;
}
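The only difference from the plain musl version above is the dispatch at the top: when wasm's bulk-memory feature is enabled, __builtin_memcpy compiles down to the memory.copy instruction, and BULK_MEMORY_THRESHOLD (defined by the wasi-libc build, not in this file) decides when that beats the scalar loop. The pattern in isolation, with an illustrative placeholder threshold:

#include <stddef.h>

#ifndef BULK_MEMORY_THRESHOLD
#define BULK_MEMORY_THRESHOLD 32  /* placeholder; the real value comes from the build */
#endif

static void *copy_dispatch(void *dest, const void *src, size_t n)
{
#if defined(__wasm_bulk_memory__)
        if (n > BULK_MEMORY_THRESHOLD)
                return __builtin_memcpy(dest, src, n);  /* lowers to memory.copy */
#endif
        unsigned char *d = dest;
        const unsigned char *s = src;
        while (n--) *d++ = *s++;
        return dest;
}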
46 lib/libc/wasi/libc-top-half/musl/src/string/memmove.c (vendored)
@@ -1,46 +0,0 @@
#include <string.h>
#include <stdint.h>

#ifdef __GNUC__
typedef __attribute__((__may_alias__)) size_t WT;
#define WS (sizeof(WT))
#endif

void *memmove(void *dest, const void *src, size_t n)
{
#if defined(__wasm_bulk_memory__)
        if (n > BULK_MEMORY_THRESHOLD)
                return __builtin_memmove(dest, src, n);
#endif
        char *d = dest;
        const char *s = src;

        if (d==s) return d;
        if ((uintptr_t)s-(uintptr_t)d-n <= -2*n) return memcpy(d, s, n);

        if (d<s) {
#ifdef __GNUC__
                if ((uintptr_t)s % WS == (uintptr_t)d % WS) {
                        while ((uintptr_t)d % WS) {
                                if (!n--) return dest;
                                *d++ = *s++;
                        }
                        for (; n>=WS; n-=WS, d+=WS, s+=WS) *(WT *)d = *(WT *)s;
                }
#endif
                for (; n; n--) *d++ = *s++;
        } else {
#ifdef __GNUC__
                if ((uintptr_t)s % WS == (uintptr_t)d % WS) {
                        while ((uintptr_t)(d+n) % WS) {
                                if (!n--) return dest;
                                d[n] = s[n];
                        }
                        while (n>=WS) n-=WS, *(WT *)(d+n) = *(WT *)(s+n);
                }
#endif
                while (n) n--, d[n] = s[n];
        }

        return dest;
}
@@ -1899,25 +1899,18 @@ const src_files = [_][]const u8{
     "musl/src/stdlib/strtol.c",
     "musl/src/stdlib/wcstod.c",
     "musl/src/stdlib/wcstol.c",
-    "musl/src/string/aarch64/memcpy.S",
     "musl/src/string/aarch64/memset.S",
-    "musl/src/string/arm/__aeabi_memcpy.s",
     "musl/src/string/arm/__aeabi_memset.s",
-    "musl/src/string/arm/memcpy.S",
     "musl/src/string/bcmp.c",
     "musl/src/string/bcopy.c",
     "musl/src/string/bzero.c",
     "musl/src/string/explicit_bzero.c",
-    "musl/src/string/i386/memcpy.s",
-    "musl/src/string/i386/memmove.s",
     "musl/src/string/i386/memset.s",
     "musl/src/string/index.c",
     "musl/src/string/memccpy.c",
     "musl/src/string/memchr.c",
     "musl/src/string/memcmp.c",
-    "musl/src/string/memcpy.c",
     "musl/src/string/memmem.c",
-    "musl/src/string/memmove.c",
     "musl/src/string/mempcpy.c",
     "musl/src/string/memrchr.c",
     "musl/src/string/memset.c",
@@ -1981,8 +1974,6 @@ const src_files = [_][]const u8{
     "musl/src/string/wmemcpy.c",
     "musl/src/string/wmemmove.c",
     "musl/src/string/wmemset.c",
-    "musl/src/string/x86_64/memcpy.s",
-    "musl/src/string/x86_64/memmove.s",
     "musl/src/string/x86_64/memset.s",
     "musl/src/temp/mkdtemp.c",
     "musl/src/temp/mkostemp.c",
@@ -694,9 +694,7 @@ const libc_top_half_src_files = [_][]const u8{
     "wasi/libc-top-half/musl/src/string/memccpy.c",
     "wasi/libc-top-half/musl/src/string/memchr.c",
     "wasi/libc-top-half/musl/src/string/memcmp.c",
-    "wasi/libc-top-half/musl/src/string/memcpy.c",
     "wasi/libc-top-half/musl/src/string/memmem.c",
-    "wasi/libc-top-half/musl/src/string/memmove.c",
     "wasi/libc-top-half/musl/src/string/mempcpy.c",
     "wasi/libc-top-half/musl/src/string/memrchr.c",
     "wasi/libc-top-half/musl/src/string/memset.c",