[444] | 1 | /* |
---|
| 2 | * Copyright (c) 2014 ARM Ltd |
---|
| 3 | * All rights reserved. |
---|
| 4 | * |
---|
| 5 | * Redistribution and use in source and binary forms, with or without |
---|
| 6 | * modification, are permitted provided that the following conditions |
---|
| 7 | * are met: |
---|
| 8 | * 1. Redistributions of source code must retain the above copyright |
---|
| 9 | * notice, this list of conditions and the following disclaimer. |
---|
| 10 | * 2. Redistributions in binary form must reproduce the above copyright |
---|
| 11 | * notice, this list of conditions and the following disclaimer in the |
---|
| 12 | * documentation and/or other materials provided with the distribution. |
---|
| 13 | * 3. The name of the company may not be used to endorse or promote |
---|
| 14 | * products derived from this software without specific prior written |
---|
| 15 | * permission. |
---|
| 16 | * |
---|
| 17 | * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED |
---|
| 18 | * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF |
---|
| 19 | * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. |
---|
| 20 | * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
---|
| 21 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED |
---|
| 22 | * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
---|
| 23 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
---|
| 24 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
---|
| 25 | * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
---|
| 26 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
---|
| 27 | */ |
---|
| 28 | |
---|
| 29 | #include "acle-compat.h" |
---|
| 30 | |
---|
| 31 | /* NOTE: This ifdef MUST match the one in aeabi_memcpy.c. */ |
---|
| 32 | #if defined (__ARM_ARCH_7A__) && defined (__ARM_FEATURE_UNALIGNED) && \ |
---|
| 33 | (defined (__ARM_NEON__) || !defined (__SOFTFP__)) |
---|
| 34 | |
---|
| 35 | .syntax unified |
---|
| 36 | .global __aeabi_memcpy |
---|
| 37 | .type __aeabi_memcpy, %function |
---|
| 38 | __aeabi_memcpy: |
---|
| 39 | /* Copy n bytes from src to dst; on entry r0 = dst, r1 = src, r2 = n. |
---|
| 40 | Assumes that n >= 0, and dst, src are valid pointers. |
---|
| 41 | If there are at least 8 bytes to copy, use LDRD/STRD. |
---|
| 42 | If dst is not word-aligned, first copy byte by byte until dst is aligned. |
---|
| 43 | If src is still not word-aligned after that, copy word by word with LDR/STR. |
---|
| 44 | When less than 8 left, copy a word and then byte by byte. */ |
---|
| 45 | |
---|
| 46 | /* Save registers (r0 holds the return value): |
---|
| 47 | optimized push {r0, r4, r5, lr}. |
---|
| 48 | To try and improve performance, stack layout changed, |
---|
| 49 | i.e., not keeping the stack looking like users expect |
---|
| 50 | (highest numbered register at highest address). */ |
---|
| 51 | push {r0, lr} |
---|
| 52 | strd r4, r5, [sp, #-8]! |
---|
| 53 | |
---|
| 54 | /* Get copying of tiny blocks out of the way first. */ |
---|
| 55 | /* Are there at least 4 bytes to copy? */ |
---|
| 56 | subs r2, r2, #4 /* r2 = n - 4. */ |
---|
| 57 | blt copy_less_than_4 /* If n < 4. */ |
---|
| 58 | |
---|
| 59 | /* Check word alignment. */ |
---|
| 60 | ands ip, r0, #3 /* ip = last 2 bits of dst. */ |
---|
| 61 | bne dst_not_word_aligned /* If dst is not word-aligned. */ |
---|
| 62 | |
---|
| 63 | /* Get here if dst is word-aligned. */ |
---|
| 64 | ands ip, r1, #3 /* ip = last 2 bits of src. */ |
---|
| 65 | bne src_not_word_aligned /* If src is not word-aligned. */ |
---|
| 66 | word_aligned: |
---|
| 67 | /* Get here if source and dst both are word-aligned. |
---|
| 68 | The number of bytes remaining to copy is r2+4. */ |
---|
| 69 | |
---|
| 70 | /* Are there at least 64 bytes to copy? */ |
---|
| 71 | subs r2, r2, #60 |
---|
| 72 | blt copy_less_than_64 /* If r2 + 4 < 64. */ |
---|
| 73 | |
---|
| 74 | /* First, align the destination buffer to 8-bytes, |
---|
| 75 | to make sure double loads and stores don't cross cache line boundary, |
---|
| 76 | as they are then more expensive even if the data is in the cache |
---|
| 77 | (require two load/store issue cycles instead of one). |
---|
| 78 | If only one of the buffers is not 8-bytes aligned, |
---|
| 79 | then it's more important to align dst than src, |
---|
| 80 | because there is more penalty for stores |
---|
| 81 | than loads that cross cacheline boundary. |
---|
| 82 | This check and realignment are only worth doing |
---|
| 83 | if there is a lot to copy. */ |
---|
| 84 | |
---|
| 85 | /* Get here if dst is word aligned, |
---|
| 86 | i.e., the 2 least significant bits are 0. |
---|
| 87 | If dst is not 2w aligned (i.e., the 3rd bit is set in dst), |
---|
| 88 | then copy 1 word (4 bytes). */ |
---|
| 89 | ands r3, r0, #4 |
---|
| 90 | beq two_word_aligned /* If dst already two-word aligned. */ |
---|
| 91 | ldr r3, [r1], #4 |
---|
| 92 | str r3, [r0], #4 |
---|
| 93 | subs r2, r2, #4 |
---|
| 94 | blt copy_less_than_64 /* If less than 64 bytes are left. */ |
---|
| 95 | |
---|
| 96 | two_word_aligned: |
---|
| 97 | /* TODO: Align to cacheline (useful for PLD optimization). */ |
---|
| 98 | |
---|
| 99 | /* Every loop iteration copies 64 bytes. */ |
---|
| 100 | 1: |
---|
| 101 | .irp offset, #0, #8, #16, #24, #32, #40, #48, #56 |
---|
| 102 | ldrd r4, r5, [r1, \offset] |
---|
| 103 | strd r4, r5, [r0, \offset] |
---|
| 104 | .endr |
---|
| 105 | |
---|
| 106 | add r0, r0, #64 |
---|
| 107 | add r1, r1, #64 |
---|
| 108 | subs r2, r2, #64 |
---|
| 109 | bge 1b /* If there is more to copy. */ |
---|
| 110 | |
---|
| 111 | copy_less_than_64: |
---|
| 112 | |
---|
| 113 | /* Get here if less than 64 bytes to copy, -64 <= r2 < 0. |
---|
| 114 | Restore the count if there are more than 7 bytes to copy. */ |
---|
| 115 | adds r2, r2, #56 |
---|
| 116 | blt copy_less_than_8 |
---|
| 117 | |
---|
| 118 | /* Copy 8 bytes at a time. */ |
---|
| 119 | 2: |
---|
| 120 | ldrd r4, r5, [r1], #8 |
---|
| 121 | strd r4, r5, [r0], #8 |
---|
| 122 | subs r2, r2, #8 |
---|
| 123 | bge 2b /* If there is more to copy. */ |
---|
| 124 | |
---|
| 125 | copy_less_than_8: |
---|
| 126 | |
---|
| 127 | /* Get here if less than 8 bytes to copy, -8 <= r2 < 0. |
---|
| 128 | Check if there is more to copy. */ |
---|
| 129 | cmn r2, #8 |
---|
| 130 | beq return /* If r2 + 8 == 0. */ |
---|
| 131 | |
---|
| 132 | /* Restore the count if there are more than 3 bytes to copy. */ |
---|
| 133 | adds r2, r2, #4 |
---|
| 134 | blt copy_less_than_4 |
---|
| 135 | |
---|
| 136 | /* Copy 4 bytes. */ |
---|
| 137 | ldr r3, [r1], #4 |
---|
| 138 | str r3, [r0], #4 |
---|
| 139 | |
---|
| 140 | copy_less_than_4: |
---|
| 141 | /* Get here if less than 4 bytes to copy. When reached by a branch, -4 <= r2 < 0; when falling through from the 4-byte copy above, 0 <= r2 < 4. */ |
---|
| 142 | |
---|
| 143 | /* Restore the count, check if there is more to copy. */ |
---|
| 144 | adds r2, r2, #4 |
---|
| 145 | beq return /* If r2 == 0. */ |
---|
| 146 | |
---|
| 147 | /* Get here with the number of bytes left (0 to 3) in the low 2 bits of r2. */ |
---|
| 148 | /* Logical shift left r2, insert 0s, update flags: Z is clear iff bit 0 of r2 was set, C receives bit 1 of r2. */ |
---|
| 149 | lsls r2, r2, #31 |
---|
| 150 | |
---|
| 151 | /* Copy byte by byte. |
---|
| 152 | Condition ne means the last bit of r2 is set, i.e., 1 or 3 bytes are left. |
---|
| 153 | Condition cs means the second to last bit of r2 is set, |
---|
| 154 | i.e., 2 or 3 bytes are left. */ |
---|
| 155 | itt ne |
---|
| 156 | ldrbne r3, [r1], #1 |
---|
| 157 | strbne r3, [r0], #1 |
---|
| 158 | |
---|
| 159 | itttt cs |
---|
| 160 | ldrbcs r4, [r1], #1 |
---|
| 161 | ldrbcs r5, [r1] |
---|
| 162 | strbcs r4, [r0], #1 |
---|
| 163 | strbcs r5, [r0] |
---|
| 164 | |
---|
| 165 | return: |
---|
| 166 | /* Restore registers: optimized pop {r0, r4, r5, pc} */ |
---|
| 167 | ldrd r4, r5, [sp], #8 |
---|
| 168 | pop {r0, pc} /* This is the only return point of memcpy. */ |
---|
| 169 | |
---|
| 170 | dst_not_word_aligned: |
---|
| 171 | |
---|
| 172 | /* Get here when dst is not aligned and ip has the last 2 bits of dst, |
---|
| 173 | i.e., ip is the offset of dst from word. |
---|
| 174 | The number of bytes that remains to copy is r2 + 4, |
---|
| 175 | i.e., there are at least 4 bytes to copy. |
---|
| 176 | Write a partial word (1 to 3 bytes), such that dst becomes |
---|
| 177 | word-aligned. */ |
---|
| 178 | |
---|
| 179 | /* If dst is at ip bytes offset from a word (with 0 < ip < 4), |
---|
| 180 | then there are (4 - ip) bytes to fill up to align dst to the next |
---|
| 181 | word. */ |
---|
| 182 | rsb ip, ip, #4 /* ip = #4 - ip. */ |
---|
| 183 | cmp ip, #2 /* gt: 3 bytes to copy; ge: 2 or 3 bytes to copy. */ |
---|
| 184 | |
---|
| 185 | /* Copy byte by byte with conditionals. */ |
---|
| 186 | itt gt |
---|
| 187 | ldrbgt r3, [r1], #1 |
---|
| 188 | strbgt r3, [r0], #1 |
---|
| 189 | |
---|
| 190 | itt ge |
---|
| 191 | ldrbge r4, [r1], #1 |
---|
| 192 | strbge r4, [r0], #1 |
---|
| 193 | /* The last byte is always copied. lr is used as scratch; it was saved on the stack at entry. */ |
---|
| 194 | ldrb lr, [r1], #1 |
---|
| 195 | strb lr, [r0], #1 |
---|
| 196 | |
---|
| 197 | /* Update the count. |
---|
| 198 | ip holds the number of bytes we have just copied. */ |
---|
| 199 | subs r2, r2, ip /* r2 = r2 - ip. */ |
---|
| 200 | blt copy_less_than_4 /* If r2 < ip. */ |
---|
| 201 | |
---|
| 202 | /* Get here if there are at least 4 bytes to copy. |
---|
| 203 | Check if src is aligned. If beforehand src and dst were not word |
---|
| 204 | aligned but congruent (same offset), then now they are both |
---|
| 205 | word-aligned, and we can copy the rest efficiently (without |
---|
| 206 | shifting). */ |
---|
| 207 | ands ip, r1, #3 /* ip = last 2 bits of src. */ |
---|
| 208 | beq word_aligned /* If r1 is word-aligned. */ |
---|
| 209 | |
---|
| 210 | src_not_word_aligned: |
---|
| 211 | /* Get here when src is not word-aligned, but dst is word-aligned. |
---|
| 212 | The number of bytes that remains to copy is r2+4. */ |
---|
| 213 | |
---|
| 214 | /* Copy word by word using LDR when alignment can be done in hardware, |
---|
| 215 | i.e., SCTLR.A is clear (alignment checking disabled), so LDR and STR support unaligned access. */ |
---|
| 216 | subs r2, r2, #60 |
---|
| 217 | blt 8f /* If less than 64 bytes to copy. */ |
---|
| 218 | |
---|
| 219 | 7: |
---|
| 220 | /* Copy 64 bytes in every loop iteration. */ |
---|
| 221 | .irp offset, #0, #4, #8, #12, #16, #20, #24, #28, #32, #36, #40, #44, #48, #52, #56, #60 |
---|
| 222 | ldr r3, [r1, \offset] |
---|
| 223 | str r3, [r0, \offset] |
---|
| 224 | .endr |
---|
| 225 | |
---|
| 226 | add r0, r0, #64 |
---|
| 227 | add r1, r1, #64 |
---|
| 228 | subs r2, r2, #64 |
---|
| 229 | bge 7b |
---|
| 230 | |
---|
| 231 | 8: |
---|
| 232 | /* Get here if less than 64 bytes to copy, -64 <= r2 < 0. |
---|
| 233 | Check if there are more than 3 bytes to copy. */ |
---|
| 234 | adds r2, r2, #60 |
---|
| 235 | blt copy_less_than_4 |
---|
| 236 | |
---|
| 237 | 9: |
---|
| 238 | /* Get here if there is less than 64 but at least 4 bytes to copy, |
---|
| 239 | where the number of bytes to copy is r2+4. */ |
---|
| 240 | ldr r3, [r1], #4 |
---|
| 241 | str r3, [r0], #4 |
---|
| 242 | subs r2, r2, #4 |
---|
| 243 | bge 9b |
---|
| 244 | |
---|
| 245 | b copy_less_than_4 |
---|
| 246 | |
---|
| 247 | |
---|
| 248 | .syntax unified |
---|
| 249 | .global __aeabi_memcpy4 |
---|
| 250 | .type __aeabi_memcpy4, %function |
---|
| 251 | __aeabi_memcpy4: |
---|
| 252 | /* Copy n bytes from src to dst; on entry r0 = dst, r1 = src, r2 = n. |
---|
| 253 | Assumes that both of its arguments are 4-byte aligned. */ |
---|
| 254 | push {r0, lr} |
---|
| 255 | strd r4, r5, [sp, #-8]! |
---|
| 256 | |
---|
| 257 | /* Are there at least 4 bytes to copy? */ |
---|
| 258 | subs r2, r2, #4 /* r2 = n - 4. */ |
---|
| 259 | blt copy_less_than_4 /* If n < 4. */ |
---|
| 260 | /* Tail-jump into the shared copy code in __aeabi_memcpy, which expects r2 = n - 4 and returns to our caller by popping the frame saved above. Use b, not bl: control never comes back here, so lr must not be overwritten with a bogus return address. */ |
---|
| 261 | b word_aligned |
---|
| 262 | |
---|
| 263 | .syntax unified |
---|
| 264 | .global __aeabi_memcpy8 |
---|
| 265 | .type __aeabi_memcpy8, %function |
---|
| 266 | __aeabi_memcpy8: |
---|
| 267 | /* Copy n bytes from src to dst; on entry r0 = dst, r1 = src, r2 = n. |
---|
| 268 | Assumes that both of its arguments are 8-byte aligned. */ |
---|
| 269 | push {r0, lr} |
---|
| 270 | strd r4, r5, [sp, #-8]! |
---|
| 271 | |
---|
| 272 | /* Are there at least 4 bytes to copy? */ |
---|
| 273 | subs r2, r2, #4 /* r2 = n - 4. */ |
---|
| 274 | blt copy_less_than_4 /* If n < 4. */ |
---|
| 275 | |
---|
| 276 | /* Are there at least 8 bytes to copy? */ |
---|
| 277 | subs r2, r2, #4 /* r2 = n - 8. */ |
---|
| 278 | blt copy_less_than_8 /* If n < 8. */ |
---|
| 279 | |
---|
| 280 | /* Are there at least 64 bytes to copy? */ |
---|
| 281 | subs r2, r2, #56 /* r2 = n - 64. */ |
---|
| 282 | blt copy_less_than_64 /* If n < 64. */ |
---|
| 283 | /* Tail-jump into the 64-byte LDRD/STRD loop of __aeabi_memcpy, which expects r2 = n - 64 >= 0 and returns to our caller by popping the frame saved above. Use b, not bl: control never comes back here, so lr must not be overwritten with a bogus return address. */ |
---|
| 284 | b two_word_aligned |
---|
| 285 | |
---|
| 286 | #endif |
---|