/* Copyright (c) 2013, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

      * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.

      * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

      * Neither the name of Linaro Limited nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

   This memcpy routine is optimised for Cortex-A15 cores and takes advantage
   of VFP or NEON when built with the appropriate flags.

   Assumptions:

    ARMv6 (ARMv7-a if using Neon)
    ARM state
    Unaligned accesses
    LDRD/STRD support unaligned word accesses

   If compiled with GCC, this file should be enclosed within the following
   pre-processing check:
   if defined (__ARM_ARCH_7A__) && defined (__ARM_FEATURE_UNALIGNED)

 */
| 46 | .syntax unified |
---|
| 47 | /* This implementation requires ARM state. */ |
---|
| 48 | .arm |
---|
| 49 | |
---|
| 50 | #ifdef __ARM_NEON__ |
---|
| 51 | |
---|
| 52 | .fpu neon |
---|
| 53 | .arch armv7-a |
---|
| 54 | # define FRAME_SIZE 4 |
---|
| 55 | # define USE_VFP |
---|
| 56 | # define USE_NEON |
---|
| 57 | |
---|
| 58 | #elif !defined (__SOFTFP__) |
---|
| 59 | |
---|
| 60 | .arch armv6 |
---|
| 61 | .fpu vfpv2 |
---|
| 62 | # define FRAME_SIZE 32 |
---|
| 63 | # define USE_VFP |
---|
| 64 | |
---|
| 65 | #else |
---|
| 66 | .arch armv6 |
---|
| 67 | # define FRAME_SIZE 32 |
---|
| 68 | |
---|
| 69 | #endif |
---|
| 70 | |
---|
| 71 | /* Old versions of GAS incorrectly implement the NEON align semantics. */ |
---|
| 72 | #ifdef BROKEN_ASM_NEON_ALIGN |
---|
| 73 | #define ALIGN(addr, align) addr,:align |
---|
| 74 | #else |
---|
| 75 | #define ALIGN(addr, align) addr:align |
---|
| 76 | #endif |
---|
| 77 | |
---|
| 78 | #define PC_OFFSET 8 /* PC pipeline compensation. */ |
---|
| 79 | #define INSN_SIZE 4 |
---|
| 80 | |
---|
| 81 | /* Call parameters. */ |
---|
| 82 | #define dstin r0 |
---|
| 83 | #define src r1 |
---|
| 84 | #define count r2 |
---|
| 85 | |
---|
| 86 | /* Locals. */ |
---|
| 87 | #define tmp1 r3 |
---|
| 88 | #define dst ip |
---|
| 89 | #define tmp2 r10 |
---|
| 90 | |
---|
| 91 | #ifndef USE_NEON |
---|
| 92 | /* For bulk copies using GP registers. */ |
---|
| 93 | #define A_l r2 /* Call-clobbered. */ |
---|
| 94 | #define A_h r3 /* Call-clobbered. */ |
---|
| 95 | #define B_l r4 |
---|
| 96 | #define B_h r5 |
---|
| 97 | #define C_l r6 |
---|
| 98 | #define C_h r7 |
---|
| 99 | #define D_l r8 |
---|
| 100 | #define D_h r9 |
---|
| 101 | #endif |
---|
| 102 | |
---|
| 103 | /* Number of lines ahead to pre-fetch data. If you change this the code |
---|
| 104 | below will need adjustment to compensate. */ |
---|
| 105 | |
---|
| 106 | #define prefetch_lines 5 |
---|
| 107 | |
---|
| 108 | #ifdef USE_VFP |
---|
| 109 | .macro cpy_line_vfp vreg, base |
---|
| 110 | vstr \vreg, [dst, #\base] |
---|
| 111 | vldr \vreg, [src, #\base] |
---|
| 112 | vstr d0, [dst, #\base + 8] |
---|
| 113 | vldr d0, [src, #\base + 8] |
---|
| 114 | vstr d1, [dst, #\base + 16] |
---|
| 115 | vldr d1, [src, #\base + 16] |
---|
| 116 | vstr d2, [dst, #\base + 24] |
---|
| 117 | vldr d2, [src, #\base + 24] |
---|
| 118 | vstr \vreg, [dst, #\base + 32] |
---|
| 119 | vldr \vreg, [src, #\base + prefetch_lines * 64 - 32] |
---|
| 120 | vstr d0, [dst, #\base + 40] |
---|
| 121 | vldr d0, [src, #\base + 40] |
---|
| 122 | vstr d1, [dst, #\base + 48] |
---|
| 123 | vldr d1, [src, #\base + 48] |
---|
| 124 | vstr d2, [dst, #\base + 56] |
---|
| 125 | vldr d2, [src, #\base + 56] |
---|
| 126 | .endm |
---|
| 127 | |
---|
| 128 | .macro cpy_tail_vfp vreg, base |
---|
| 129 | vstr \vreg, [dst, #\base] |
---|
| 130 | vldr \vreg, [src, #\base] |
---|
| 131 | vstr d0, [dst, #\base + 8] |
---|
| 132 | vldr d0, [src, #\base + 8] |
---|
| 133 | vstr d1, [dst, #\base + 16] |
---|
| 134 | vldr d1, [src, #\base + 16] |
---|
| 135 | vstr d2, [dst, #\base + 24] |
---|
| 136 | vldr d2, [src, #\base + 24] |
---|
| 137 | vstr \vreg, [dst, #\base + 32] |
---|
| 138 | vstr d0, [dst, #\base + 40] |
---|
| 139 | vldr d0, [src, #\base + 40] |
---|
| 140 | vstr d1, [dst, #\base + 48] |
---|
| 141 | vldr d1, [src, #\base + 48] |
---|
| 142 | vstr d2, [dst, #\base + 56] |
---|
| 143 | vldr d2, [src, #\base + 56] |
---|
| 144 | .endm |
---|
| 145 | #endif |
---|
| 146 | |
---|
| 147 | .macro def_fn f p2align=0 |
---|
| 148 | .text |
---|
| 149 | .p2align \p2align |
---|
| 150 | .global \f |
---|
| 151 | .type \f, %function |
---|
| 152 | \f: |
---|
| 153 | .endm |
---|
| 154 | |
---|
| 155 | def_fn memcpy p2align=6 |
---|
| 156 | |
---|
| 157 | mov dst, dstin /* Preserve dstin, we need to return it. */ |
---|
| 158 | cmp count, #64 |
---|
| 159 | bge .Lcpy_not_short |
---|
| 160 | /* Deal with small copies quickly by dropping straight into the |
---|
| 161 | exit block. */ |
---|
| 162 | |
---|
| 163 | .Ltail63unaligned: |
---|
| 164 | #ifdef USE_NEON |
---|
| 165 | and tmp1, count, #0x38 |
---|
| 166 | rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE) |
---|
| 167 | add pc, pc, tmp1 |
---|
| 168 | vld1.8 {d0}, [src]! /* 14 words to go. */ |
---|
| 169 | vst1.8 {d0}, [dst]! |
---|
| 170 | vld1.8 {d0}, [src]! /* 12 words to go. */ |
---|
| 171 | vst1.8 {d0}, [dst]! |
---|
| 172 | vld1.8 {d0}, [src]! /* 10 words to go. */ |
---|
| 173 | vst1.8 {d0}, [dst]! |
---|
| 174 | vld1.8 {d0}, [src]! /* 8 words to go. */ |
---|
| 175 | vst1.8 {d0}, [dst]! |
---|
| 176 | vld1.8 {d0}, [src]! /* 6 words to go. */ |
---|
| 177 | vst1.8 {d0}, [dst]! |
---|
| 178 | vld1.8 {d0}, [src]! /* 4 words to go. */ |
---|
| 179 | vst1.8 {d0}, [dst]! |
---|
| 180 | vld1.8 {d0}, [src]! /* 2 words to go. */ |
---|
| 181 | vst1.8 {d0}, [dst]! |
---|
| 182 | |
---|
| 183 | tst count, #4 |
---|
| 184 | ldrne tmp1, [src], #4 |
---|
| 185 | strne tmp1, [dst], #4 |
---|
| 186 | #else |
---|
| 187 | /* Copy up to 15 full words of data. May not be aligned. */ |
---|
| 188 | /* Cannot use VFP for unaligned data. */ |
---|
| 189 | and tmp1, count, #0x3c |
---|
| 190 | add dst, dst, tmp1 |
---|
| 191 | add src, src, tmp1 |
---|
| 192 | rsb tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2) |
---|
| 193 | /* Jump directly into the sequence below at the correct offset. */ |
---|
| 194 | add pc, pc, tmp1, lsl #1 |
---|
| 195 | |
---|
| 196 | ldr tmp1, [src, #-60] /* 15 words to go. */ |
---|
| 197 | str tmp1, [dst, #-60] |
---|
| 198 | |
---|
| 199 | ldr tmp1, [src, #-56] /* 14 words to go. */ |
---|
| 200 | str tmp1, [dst, #-56] |
---|
| 201 | ldr tmp1, [src, #-52] |
---|
| 202 | str tmp1, [dst, #-52] |
---|
| 203 | |
---|
| 204 | ldr tmp1, [src, #-48] /* 12 words to go. */ |
---|
| 205 | str tmp1, [dst, #-48] |
---|
| 206 | ldr tmp1, [src, #-44] |
---|
| 207 | str tmp1, [dst, #-44] |
---|
| 208 | |
---|
| 209 | ldr tmp1, [src, #-40] /* 10 words to go. */ |
---|
| 210 | str tmp1, [dst, #-40] |
---|
| 211 | ldr tmp1, [src, #-36] |
---|
| 212 | str tmp1, [dst, #-36] |
---|
| 213 | |
---|
| 214 | ldr tmp1, [src, #-32] /* 8 words to go. */ |
---|
| 215 | str tmp1, [dst, #-32] |
---|
| 216 | ldr tmp1, [src, #-28] |
---|
| 217 | str tmp1, [dst, #-28] |
---|
| 218 | |
---|
| 219 | ldr tmp1, [src, #-24] /* 6 words to go. */ |
---|
| 220 | str tmp1, [dst, #-24] |
---|
| 221 | ldr tmp1, [src, #-20] |
---|
| 222 | str tmp1, [dst, #-20] |
---|
| 223 | |
---|
| 224 | ldr tmp1, [src, #-16] /* 4 words to go. */ |
---|
| 225 | str tmp1, [dst, #-16] |
---|
| 226 | ldr tmp1, [src, #-12] |
---|
| 227 | str tmp1, [dst, #-12] |
---|
| 228 | |
---|
| 229 | ldr tmp1, [src, #-8] /* 2 words to go. */ |
---|
| 230 | str tmp1, [dst, #-8] |
---|
| 231 | ldr tmp1, [src, #-4] |
---|
| 232 | str tmp1, [dst, #-4] |
---|
| 233 | #endif |
---|
| 234 | |
---|
| 235 | lsls count, count, #31 |
---|
| 236 | ldrhcs tmp1, [src], #2 |
---|
| 237 | ldrbne src, [src] /* Src is dead, use as a scratch. */ |
---|
| 238 | strhcs tmp1, [dst], #2 |
---|
| 239 | strbne src, [dst] |
---|
| 240 | bx lr |
---|
| 241 | |
---|
| 242 | .Lcpy_not_short: |
---|
| 243 | /* At least 64 bytes to copy, but don't know the alignment yet. */ |
---|
| 244 | str tmp2, [sp, #-FRAME_SIZE]! |
---|
| 245 | and tmp2, src, #7 |
---|
| 246 | and tmp1, dst, #7 |
---|
| 247 | cmp tmp1, tmp2 |
---|
| 248 | bne .Lcpy_notaligned |
---|
| 249 | |
---|
| 250 | #ifdef USE_VFP |
---|
| 251 | /* Magic dust alert! Force VFP on Cortex-A9. Experiments show |
---|
| 252 | that the FP pipeline is much better at streaming loads and |
---|
| 253 | stores. This is outside the critical loop. */ |
---|
| 254 | vmov.f32 s0, s0 |
---|
| 255 | #endif |
---|
| 256 | |
---|
| 257 | /* SRC and DST have the same mutual 32-bit alignment, but we may |
---|
| 258 | still need to pre-copy some bytes to get to natural alignment. |
---|
| 259 | We bring DST into full 64-bit alignment. */ |
---|
| 260 | lsls tmp2, dst, #29 |
---|
| 261 | beq 1f |
---|
| 262 | rsbs tmp2, tmp2, #0 |
---|
| 263 | sub count, count, tmp2, lsr #29 |
---|
| 264 | ldrmi tmp1, [src], #4 |
---|
| 265 | strmi tmp1, [dst], #4 |
---|
| 266 | lsls tmp2, tmp2, #2 |
---|
| 267 | ldrhcs tmp1, [src], #2 |
---|
| 268 | ldrbne tmp2, [src], #1 |
---|
| 269 | strhcs tmp1, [dst], #2 |
---|
| 270 | strbne tmp2, [dst], #1 |
---|
| 271 | |
---|
| 272 | 1: |
---|
| 273 | subs tmp2, count, #64 /* Use tmp2 for count. */ |
---|
| 274 | blt .Ltail63aligned |
---|
| 275 | |
---|
| 276 | cmp tmp2, #512 |
---|
| 277 | bge .Lcpy_body_long |
---|
| 278 | |
---|
| 279 | .Lcpy_body_medium: /* Count in tmp2. */ |
---|
| 280 | #ifdef USE_VFP |
---|
| 281 | 1: |
---|
| 282 | vldr d0, [src, #0] |
---|
| 283 | subs tmp2, tmp2, #64 |
---|
| 284 | vldr d1, [src, #8] |
---|
| 285 | vstr d0, [dst, #0] |
---|
| 286 | vldr d0, [src, #16] |
---|
| 287 | vstr d1, [dst, #8] |
---|
| 288 | vldr d1, [src, #24] |
---|
| 289 | vstr d0, [dst, #16] |
---|
| 290 | vldr d0, [src, #32] |
---|
| 291 | vstr d1, [dst, #24] |
---|
| 292 | vldr d1, [src, #40] |
---|
| 293 | vstr d0, [dst, #32] |
---|
| 294 | vldr d0, [src, #48] |
---|
| 295 | vstr d1, [dst, #40] |
---|
| 296 | vldr d1, [src, #56] |
---|
| 297 | vstr d0, [dst, #48] |
---|
| 298 | add src, src, #64 |
---|
| 299 | vstr d1, [dst, #56] |
---|
| 300 | add dst, dst, #64 |
---|
| 301 | bge 1b |
---|
| 302 | tst tmp2, #0x3f |
---|
| 303 | beq .Ldone |
---|
| 304 | |
---|
| 305 | .Ltail63aligned: /* Count in tmp2. */ |
---|
| 306 | and tmp1, tmp2, #0x38 |
---|
| 307 | add dst, dst, tmp1 |
---|
| 308 | add src, src, tmp1 |
---|
| 309 | rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE) |
---|
| 310 | add pc, pc, tmp1 |
---|
| 311 | |
---|
| 312 | vldr d0, [src, #-56] /* 14 words to go. */ |
---|
| 313 | vstr d0, [dst, #-56] |
---|
| 314 | vldr d0, [src, #-48] /* 12 words to go. */ |
---|
| 315 | vstr d0, [dst, #-48] |
---|
| 316 | vldr d0, [src, #-40] /* 10 words to go. */ |
---|
| 317 | vstr d0, [dst, #-40] |
---|
| 318 | vldr d0, [src, #-32] /* 8 words to go. */ |
---|
| 319 | vstr d0, [dst, #-32] |
---|
| 320 | vldr d0, [src, #-24] /* 6 words to go. */ |
---|
| 321 | vstr d0, [dst, #-24] |
---|
| 322 | vldr d0, [src, #-16] /* 4 words to go. */ |
---|
| 323 | vstr d0, [dst, #-16] |
---|
| 324 | vldr d0, [src, #-8] /* 2 words to go. */ |
---|
| 325 | vstr d0, [dst, #-8] |
---|
| 326 | #else |
---|
| 327 | sub src, src, #8 |
---|
| 328 | sub dst, dst, #8 |
---|
| 329 | 1: |
---|
| 330 | ldrd A_l, A_h, [src, #8] |
---|
| 331 | strd A_l, A_h, [dst, #8] |
---|
| 332 | ldrd A_l, A_h, [src, #16] |
---|
| 333 | strd A_l, A_h, [dst, #16] |
---|
| 334 | ldrd A_l, A_h, [src, #24] |
---|
| 335 | strd A_l, A_h, [dst, #24] |
---|
| 336 | ldrd A_l, A_h, [src, #32] |
---|
| 337 | strd A_l, A_h, [dst, #32] |
---|
| 338 | ldrd A_l, A_h, [src, #40] |
---|
| 339 | strd A_l, A_h, [dst, #40] |
---|
| 340 | ldrd A_l, A_h, [src, #48] |
---|
| 341 | strd A_l, A_h, [dst, #48] |
---|
| 342 | ldrd A_l, A_h, [src, #56] |
---|
| 343 | strd A_l, A_h, [dst, #56] |
---|
| 344 | ldrd A_l, A_h, [src, #64]! |
---|
| 345 | strd A_l, A_h, [dst, #64]! |
---|
| 346 | subs tmp2, tmp2, #64 |
---|
| 347 | bge 1b |
---|
| 348 | tst tmp2, #0x3f |
---|
| 349 | bne 1f |
---|
| 350 | ldr tmp2,[sp], #FRAME_SIZE |
---|
| 351 | bx lr |
---|
| 352 | 1: |
---|
| 353 | add src, src, #8 |
---|
| 354 | add dst, dst, #8 |
---|
| 355 | |
---|
| 356 | .Ltail63aligned: /* Count in tmp2. */ |
---|
| 357 | /* Copy up to 7 d-words of data. Similar to Ltail63unaligned, but |
---|
| 358 | we know that the src and dest are 32-bit aligned so we can use |
---|
| 359 | LDRD/STRD to improve efficiency. */ |
---|
| 360 | /* TMP2 is now negative, but we don't care about that. The bottom |
---|
| 361 | six bits still tell us how many bytes are left to copy. */ |
---|
| 362 | |
---|
| 363 | and tmp1, tmp2, #0x38 |
---|
| 364 | add dst, dst, tmp1 |
---|
| 365 | add src, src, tmp1 |
---|
| 366 | rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE) |
---|
| 367 | add pc, pc, tmp1 |
---|
| 368 | ldrd A_l, A_h, [src, #-56] /* 14 words to go. */ |
---|
| 369 | strd A_l, A_h, [dst, #-56] |
---|
| 370 | ldrd A_l, A_h, [src, #-48] /* 12 words to go. */ |
---|
| 371 | strd A_l, A_h, [dst, #-48] |
---|
| 372 | ldrd A_l, A_h, [src, #-40] /* 10 words to go. */ |
---|
| 373 | strd A_l, A_h, [dst, #-40] |
---|
| 374 | ldrd A_l, A_h, [src, #-32] /* 8 words to go. */ |
---|
| 375 | strd A_l, A_h, [dst, #-32] |
---|
| 376 | ldrd A_l, A_h, [src, #-24] /* 6 words to go. */ |
---|
| 377 | strd A_l, A_h, [dst, #-24] |
---|
| 378 | ldrd A_l, A_h, [src, #-16] /* 4 words to go. */ |
---|
| 379 | strd A_l, A_h, [dst, #-16] |
---|
| 380 | ldrd A_l, A_h, [src, #-8] /* 2 words to go. */ |
---|
| 381 | strd A_l, A_h, [dst, #-8] |
---|
| 382 | |
---|
| 383 | #endif |
---|
| 384 | tst tmp2, #4 |
---|
| 385 | ldrne tmp1, [src], #4 |
---|
| 386 | strne tmp1, [dst], #4 |
---|
| 387 | lsls tmp2, tmp2, #31 /* Count (tmp2) now dead. */ |
---|
| 388 | ldrhcs tmp1, [src], #2 |
---|
| 389 | ldrbne tmp2, [src] |
---|
| 390 | strhcs tmp1, [dst], #2 |
---|
| 391 | strbne tmp2, [dst] |
---|
| 392 | |
---|
| 393 | .Ldone: |
---|
| 394 | ldr tmp2, [sp], #FRAME_SIZE |
---|
| 395 | bx lr |
---|
| 396 | |
---|
| 397 | .Lcpy_body_long: /* Count in tmp2. */ |
---|
| 398 | |
---|
| 399 | /* Long copy. We know that there's at least (prefetch_lines * 64) |
---|
| 400 | bytes to go. */ |
---|
| 401 | #ifdef USE_VFP |
---|
| 402 | /* Don't use PLD. Instead, read some data in advance of the current |
---|
| 403 | copy position into a register. This should act like a PLD |
---|
| 404 | operation but we won't have to repeat the transfer. */ |
---|
| 405 | |
---|
| 406 | vldr d3, [src, #0] |
---|
| 407 | vldr d4, [src, #64] |
---|
| 408 | vldr d5, [src, #128] |
---|
| 409 | vldr d6, [src, #192] |
---|
| 410 | vldr d7, [src, #256] |
---|
| 411 | |
---|
| 412 | vldr d0, [src, #8] |
---|
| 413 | vldr d1, [src, #16] |
---|
| 414 | vldr d2, [src, #24] |
---|
| 415 | add src, src, #32 |
---|
| 416 | |
---|
| 417 | subs tmp2, tmp2, #prefetch_lines * 64 * 2 |
---|
| 418 | blt 2f |
---|
| 419 | 1: |
---|
| 420 | cpy_line_vfp d3, 0 |
---|
| 421 | cpy_line_vfp d4, 64 |
---|
| 422 | cpy_line_vfp d5, 128 |
---|
| 423 | add dst, dst, #3 * 64 |
---|
| 424 | add src, src, #3 * 64 |
---|
| 425 | cpy_line_vfp d6, 0 |
---|
| 426 | cpy_line_vfp d7, 64 |
---|
| 427 | add dst, dst, #2 * 64 |
---|
| 428 | add src, src, #2 * 64 |
---|
| 429 | subs tmp2, tmp2, #prefetch_lines * 64 |
---|
| 430 | bge 1b |
---|
| 431 | |
---|
| 432 | 2: |
---|
| 433 | cpy_tail_vfp d3, 0 |
---|
| 434 | cpy_tail_vfp d4, 64 |
---|
| 435 | cpy_tail_vfp d5, 128 |
---|
| 436 | add src, src, #3 * 64 |
---|
| 437 | add dst, dst, #3 * 64 |
---|
| 438 | cpy_tail_vfp d6, 0 |
---|
| 439 | vstr d7, [dst, #64] |
---|
| 440 | vldr d7, [src, #64] |
---|
| 441 | vstr d0, [dst, #64 + 8] |
---|
| 442 | vldr d0, [src, #64 + 8] |
---|
| 443 | vstr d1, [dst, #64 + 16] |
---|
| 444 | vldr d1, [src, #64 + 16] |
---|
| 445 | vstr d2, [dst, #64 + 24] |
---|
| 446 | vldr d2, [src, #64 + 24] |
---|
| 447 | vstr d7, [dst, #64 + 32] |
---|
| 448 | add src, src, #96 |
---|
| 449 | vstr d0, [dst, #64 + 40] |
---|
| 450 | vstr d1, [dst, #64 + 48] |
---|
| 451 | vstr d2, [dst, #64 + 56] |
---|
| 452 | add dst, dst, #128 |
---|
| 453 | add tmp2, tmp2, #prefetch_lines * 64 |
---|
| 454 | b .Lcpy_body_medium |
---|
| 455 | #else |
---|
| 456 | /* Long copy. Use an SMS style loop to maximize the I/O |
---|
| 457 | bandwidth of the core. We don't have enough spare registers |
---|
| 458 | to synthesise prefetching, so use PLD operations. */ |
---|
| 459 | /* Pre-bias src and dst. */ |
---|
| 460 | sub src, src, #8 |
---|
| 461 | sub dst, dst, #8 |
---|
| 462 | pld [src, #8] |
---|
| 463 | pld [src, #72] |
---|
| 464 | subs tmp2, tmp2, #64 |
---|
| 465 | pld [src, #136] |
---|
| 466 | ldrd A_l, A_h, [src, #8] |
---|
| 467 | strd B_l, B_h, [sp, #8] |
---|
| 468 | ldrd B_l, B_h, [src, #16] |
---|
| 469 | strd C_l, C_h, [sp, #16] |
---|
| 470 | ldrd C_l, C_h, [src, #24] |
---|
| 471 | strd D_l, D_h, [sp, #24] |
---|
| 472 | pld [src, #200] |
---|
| 473 | ldrd D_l, D_h, [src, #32]! |
---|
| 474 | b 1f |
---|
| 475 | .p2align 6 |
---|
| 476 | 2: |
---|
| 477 | pld [src, #232] |
---|
| 478 | strd A_l, A_h, [dst, #40] |
---|
| 479 | ldrd A_l, A_h, [src, #40] |
---|
| 480 | strd B_l, B_h, [dst, #48] |
---|
| 481 | ldrd B_l, B_h, [src, #48] |
---|
| 482 | strd C_l, C_h, [dst, #56] |
---|
| 483 | ldrd C_l, C_h, [src, #56] |
---|
| 484 | strd D_l, D_h, [dst, #64]! |
---|
| 485 | ldrd D_l, D_h, [src, #64]! |
---|
| 486 | subs tmp2, tmp2, #64 |
---|
| 487 | 1: |
---|
| 488 | strd A_l, A_h, [dst, #8] |
---|
| 489 | ldrd A_l, A_h, [src, #8] |
---|
| 490 | strd B_l, B_h, [dst, #16] |
---|
| 491 | ldrd B_l, B_h, [src, #16] |
---|
| 492 | strd C_l, C_h, [dst, #24] |
---|
| 493 | ldrd C_l, C_h, [src, #24] |
---|
| 494 | strd D_l, D_h, [dst, #32] |
---|
| 495 | ldrd D_l, D_h, [src, #32] |
---|
| 496 | bcs 2b |
---|
| 497 | /* Save the remaining bytes and restore the callee-saved regs. */ |
---|
| 498 | strd A_l, A_h, [dst, #40] |
---|
| 499 | add src, src, #40 |
---|
| 500 | strd B_l, B_h, [dst, #48] |
---|
| 501 | ldrd B_l, B_h, [sp, #8] |
---|
| 502 | strd C_l, C_h, [dst, #56] |
---|
| 503 | ldrd C_l, C_h, [sp, #16] |
---|
| 504 | strd D_l, D_h, [dst, #64] |
---|
| 505 | ldrd D_l, D_h, [sp, #24] |
---|
| 506 | add dst, dst, #72 |
---|
| 507 | tst tmp2, #0x3f |
---|
| 508 | bne .Ltail63aligned |
---|
| 509 | ldr tmp2, [sp], #FRAME_SIZE |
---|
| 510 | bx lr |
---|
| 511 | #endif |
---|
| 512 | |
---|
| 513 | .Lcpy_notaligned: |
---|
| 514 | pld [src] |
---|
| 515 | pld [src, #64] |
---|
| 516 | /* There's at least 64 bytes to copy, but there is no mutual |
---|
| 517 | alignment. */ |
---|
| 518 | /* Bring DST to 64-bit alignment. */ |
---|
| 519 | lsls tmp2, dst, #29 |
---|
| 520 | pld [src, #(2 * 64)] |
---|
| 521 | beq 1f |
---|
| 522 | rsbs tmp2, tmp2, #0 |
---|
| 523 | sub count, count, tmp2, lsr #29 |
---|
| 524 | ldrmi tmp1, [src], #4 |
---|
| 525 | strmi tmp1, [dst], #4 |
---|
| 526 | lsls tmp2, tmp2, #2 |
---|
| 527 | ldrbne tmp1, [src], #1 |
---|
| 528 | ldrhcs tmp2, [src], #2 |
---|
| 529 | strbne tmp1, [dst], #1 |
---|
| 530 | strhcs tmp2, [dst], #2 |
---|
| 531 | 1: |
---|
| 532 | pld [src, #(3 * 64)] |
---|
| 533 | subs count, count, #64 |
---|
| 534 | ldrmi tmp2, [sp], #FRAME_SIZE |
---|
| 535 | bmi .Ltail63unaligned |
---|
| 536 | pld [src, #(4 * 64)] |
---|
| 537 | |
---|
| 538 | #ifdef USE_NEON |
---|
| 539 | vld1.8 {d0-d3}, [src]! |
---|
| 540 | vld1.8 {d4-d7}, [src]! |
---|
| 541 | subs count, count, #64 |
---|
| 542 | bmi 2f |
---|
| 543 | 1: |
---|
| 544 | pld [src, #(4 * 64)] |
---|
| 545 | vst1.8 {d0-d3}, [ALIGN (dst, 64)]! |
---|
| 546 | vld1.8 {d0-d3}, [src]! |
---|
| 547 | vst1.8 {d4-d7}, [ALIGN (dst, 64)]! |
---|
| 548 | vld1.8 {d4-d7}, [src]! |
---|
| 549 | subs count, count, #64 |
---|
| 550 | bpl 1b |
---|
| 551 | 2: |
---|
| 552 | vst1.8 {d0-d3}, [ALIGN (dst, 64)]! |
---|
| 553 | vst1.8 {d4-d7}, [ALIGN (dst, 64)]! |
---|
| 554 | ands count, count, #0x3f |
---|
| 555 | #else |
---|
| 556 | /* Use an SMS style loop to maximize the I/O bandwidth. */ |
---|
| 557 | sub src, src, #4 |
---|
| 558 | sub dst, dst, #8 |
---|
| 559 | subs tmp2, count, #64 /* Use tmp2 for count. */ |
---|
| 560 | ldr A_l, [src, #4] |
---|
| 561 | ldr A_h, [src, #8] |
---|
| 562 | strd B_l, B_h, [sp, #8] |
---|
| 563 | ldr B_l, [src, #12] |
---|
| 564 | ldr B_h, [src, #16] |
---|
| 565 | strd C_l, C_h, [sp, #16] |
---|
| 566 | ldr C_l, [src, #20] |
---|
| 567 | ldr C_h, [src, #24] |
---|
| 568 | strd D_l, D_h, [sp, #24] |
---|
| 569 | ldr D_l, [src, #28] |
---|
| 570 | ldr D_h, [src, #32]! |
---|
| 571 | b 1f |
---|
| 572 | .p2align 6 |
---|
| 573 | 2: |
---|
| 574 | pld [src, #(5 * 64) - (32 - 4)] |
---|
| 575 | strd A_l, A_h, [dst, #40] |
---|
| 576 | ldr A_l, [src, #36] |
---|
| 577 | ldr A_h, [src, #40] |
---|
| 578 | strd B_l, B_h, [dst, #48] |
---|
| 579 | ldr B_l, [src, #44] |
---|
| 580 | ldr B_h, [src, #48] |
---|
| 581 | strd C_l, C_h, [dst, #56] |
---|
| 582 | ldr C_l, [src, #52] |
---|
| 583 | ldr C_h, [src, #56] |
---|
| 584 | strd D_l, D_h, [dst, #64]! |
---|
| 585 | ldr D_l, [src, #60] |
---|
| 586 | ldr D_h, [src, #64]! |
---|
| 587 | subs tmp2, tmp2, #64 |
---|
| 588 | 1: |
---|
| 589 | strd A_l, A_h, [dst, #8] |
---|
| 590 | ldr A_l, [src, #4] |
---|
| 591 | ldr A_h, [src, #8] |
---|
| 592 | strd B_l, B_h, [dst, #16] |
---|
| 593 | ldr B_l, [src, #12] |
---|
| 594 | ldr B_h, [src, #16] |
---|
| 595 | strd C_l, C_h, [dst, #24] |
---|
| 596 | ldr C_l, [src, #20] |
---|
| 597 | ldr C_h, [src, #24] |
---|
| 598 | strd D_l, D_h, [dst, #32] |
---|
| 599 | ldr D_l, [src, #28] |
---|
| 600 | ldr D_h, [src, #32] |
---|
| 601 | bcs 2b |
---|
| 602 | |
---|
| 603 | /* Save the remaining bytes and restore the callee-saved regs. */ |
---|
| 604 | strd A_l, A_h, [dst, #40] |
---|
| 605 | add src, src, #36 |
---|
| 606 | strd B_l, B_h, [dst, #48] |
---|
| 607 | ldrd B_l, B_h, [sp, #8] |
---|
| 608 | strd C_l, C_h, [dst, #56] |
---|
| 609 | ldrd C_l, C_h, [sp, #16] |
---|
| 610 | strd D_l, D_h, [dst, #64] |
---|
| 611 | ldrd D_l, D_h, [sp, #24] |
---|
| 612 | add dst, dst, #72 |
---|
| 613 | ands count, tmp2, #0x3f |
---|
| 614 | #endif |
---|
| 615 | ldr tmp2, [sp], #FRAME_SIZE |
---|
| 616 | bne .Ltail63unaligned |
---|
| 617 | bx lr |
---|
| 618 | |
---|
| 619 | .size memcpy, . - memcpy |
---|