1 | /* |
---|
2 | * Copyright (c) 2014 ARM Ltd |
---|
3 | * All rights reserved. |
---|
4 | * |
---|
5 | * Redistribution and use in source and binary forms, with or without |
---|
6 | * modification, are permitted provided that the following conditions |
---|
7 | * are met: |
---|
8 | * 1. Redistributions of source code must retain the above copyright |
---|
9 | * notice, this list of conditions and the following disclaimer. |
---|
10 | * 2. Redistributions in binary form must reproduce the above copyright |
---|
11 | * notice, this list of conditions and the following disclaimer in the |
---|
12 | * documentation and/or other materials provided with the distribution. |
---|
13 | * 3. The name of the company may not be used to endorse or promote |
---|
14 | * products derived from this software without specific prior written |
---|
15 | * permission. |
---|
16 | * |
---|
17 | * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED |
---|
18 | * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF |
---|
19 | * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. |
---|
20 | * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
---|
21 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED |
---|
22 | * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
---|
23 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
---|
24 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
---|
25 | * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
---|
26 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
---|
27 | */ |
---|
28 | |
---|
29 | #include "acle-compat.h" |
---|
30 | |
---|
31 | /* NOTE: This ifdef MUST match the one in aeabi_memcpy.c. */ |
---|
32 | #if defined (__ARM_ARCH_7A__) && defined (__ARM_FEATURE_UNALIGNED) && \ |
---|
33 | (defined (__ARM_NEON__) || !defined (__SOFTFP__)) |
---|
34 | |
---|
/* void __aeabi_memcpy (void *dst, const void *src, size_t n)

   In:    r0 = dst, r1 = src, r2 = n (number of bytes to copy).
   Out:   r0 = dst (the entry value is saved on the stack and restored).
   Uses:  r1, r2, r3, ip and the flags as scratch.  r4, r5 and lr are
	  saved on entry and restored by the single epilogue at `return'.

   Assumes that n >= 0, and dst, src are valid pointers.
   If there is at least 8 bytes to copy, use LDRD/STRD.
   If src and dst are misaligned with different offsets,
   first copy byte by byte until dst is aligned,
   and then copy word by word with (possibly unaligned) LDR/STR.
   When less than 8 left, copy a word and then byte by byte.

   The labels word_aligned, two_word_aligned, copy_less_than_64,
   copy_less_than_8 and copy_less_than_4 are also entered from
   __aeabi_memcpy4 and __aeabi_memcpy8, which build the same stack
   frame before branching here.  */

	.syntax unified
	.global __aeabi_memcpy
	.type   __aeabi_memcpy, %function
__aeabi_memcpy:
	/* Save registers (r0 holds the return value):
	   optimized push {r0, r4, r5, lr}.
	   To try and improve performance, stack layout changed,
	   i.e., not keeping the stack looking like users expect
	   (highest numbered register at highest address).
	   Resulting frame: [sp] = r4, [sp, #4] = r5,
	   [sp, #8] = r0, [sp, #12] = lr.  */
	push    {r0, lr}
	strd    r4, r5, [sp, #-8]!

	/* Get copying of tiny blocks out of the way first.  */
	/* Is there at least 4 bytes to copy?  */
	subs    r2, r2, #4
	blt     copy_less_than_4  /* If n < 4.  */

	/* Check word alignment.  */
	ands    ip, r0, #3  /* ip = last 2 bits of dst.  */
	bne     dst_not_word_aligned  /* If dst is not word-aligned.  */

	/* Get here if dst is word-aligned.  */
	ands    ip, r1, #3  /* ip = last 2 bits of src.  */
	bne     src_not_word_aligned  /* If src is not word-aligned.  */
word_aligned:
	/* Get here if source and dst both are word-aligned.
	   The number of bytes remaining to copy is r2+4.  */

	/* Is there at least 64 bytes to copy?  */
	subs    r2, r2, #60
	blt     copy_less_than_64  /* If r2 + 4 < 64.  */

	/* First, align the destination buffer to 8-bytes,
	   to make sure double loads and stores don't cross cache line boundary,
	   as they are then more expensive even if the data is in the cache
	   (require two load/store issue cycles instead of one).
	   If only one of the buffers is not 8-bytes aligned,
	   then it's more important to align dst than src,
	   because there is more penalty for stores
	   than loads that cross cacheline boundary.
	   This check and realignment are only worth doing
	   if there is a lot to copy.  */

	/* Get here if dst is word aligned,
	   i.e., the 2 least significant bits are 0.
	   If dst is not 2w aligned (i.e., the 3rd bit is set in dst),
	   then copy 1 word (4 bytes).  */
	ands    r3, r0, #4
	beq     two_word_aligned  /* If dst already two-word aligned.  */
	ldr     r3, [r1], #4
	str     r3, [r0], #4
	subs    r2, r2, #4
	blt     copy_less_than_64  /* Fewer than 64 bytes left after the word.  */

two_word_aligned:
	/* Get here with dst two-word aligned and r2 = bytes remaining - 64,
	   r2 >= 0.  */
	/* TODO: Align to cacheline (useful for PLD optimization).  */

	/* Every loop iteration copies 64 bytes.  */
1:
	.irp    offset, #0, #8, #16, #24, #32, #40, #48, #56
	ldrd    r4, r5, [r1, \offset]
	strd    r4, r5, [r0, \offset]
	.endr

	add     r0, r0, #64
	add     r1, r1, #64
	subs    r2, r2, #64
	bge     1b  /* If there is more to copy.  */

copy_less_than_64:

	/* Get here if less than 64 bytes to copy, -64 <= r2 < 0.
	   Restore the count if there is more than 7 bytes to copy.  */
	adds    r2, r2, #56  /* r2 = bytes remaining - 8.  */
	blt     copy_less_than_8

	/* Copy 8 bytes at a time.  */
2:
	ldrd    r4, r5, [r1], #8
	strd    r4, r5, [r0], #8
	subs    r2, r2, #8
	bge     2b  /* If there is more to copy.  */

copy_less_than_8:

	/* Get here if less than 8 bytes to copy, -8 <= r2 < 0.
	   Check if there is more to copy.  */
	cmn     r2, #8
	beq     return  /* If r2 + 8 == 0.  */

	/* Restore the count if there is more than 3 bytes to copy.  */
	adds    r2, r2, #4  /* r2 = bytes remaining - 4.  */
	blt     copy_less_than_4

	/* Copy 4 bytes.  */
	ldr     r3, [r1], #4
	str     r3, [r0], #4

copy_less_than_4:
	/* Get here with fewer than 4 bytes left to copy.
	   On a direct branch, -4 <= r2 < 0 and r2 + 4 is the count.
	   On fall-through from the 4-byte copy above, 0 <= r2 < 4 is
	   already the count; adding 4 does not change its two low bits,
	   which are all that is examined below.  */

	/* Restore the count, check if there is more to copy.  */
	adds    r2, r2, #4
	beq     return  /* If r2 == 0.  */

	/* The two low bits of r2 hold the number of bytes left (0 to 3).  */
	/* Logical shift left r2, insert 0s, update flags:
	   bit 0 moves to bit 31 of the result (so Z is clear iff bit 0
	   was set) and bit 1 is the last bit shifted out (the carry).  */
	lsls    r2, r2, #31

	/* Copy byte by byte.
	   Condition ne means the last bit of the count is 1,
	   i.e., the count is 1 or 3: copy one byte.
	   Condition cs means the second to last bit of the count is set,
	   i.e., the count is 2 or 3: copy two bytes.
	   Byte loads and stores leave the flags untouched, so both
	   conditions are evaluated on the result of the single LSLS.  */
	itt     ne
	ldrbne  r3, [r1], #1
	strbne  r3, [r0], #1

	itttt   cs
	ldrbcs  r4, [r1], #1
	ldrbcs  r5, [r1]
	strbcs  r4, [r0], #1
	strbcs  r5, [r0]

return:
	/* Restore registers: optimized pop {r0, r4, r5, pc}   */
	ldrd    r4, r5, [sp], #8
	pop     {r0, pc}  /* This is the only return point of memcpy.  */

dst_not_word_aligned:

	/* Get here when dst is not aligned and ip has the last 2 bits of dst,
	   i.e., ip is the offset of dst from word.
	   The number of bytes that remains to copy is r2 + 4,
	   i.e., there are at least 4 bytes to copy.
	   Write a partial word (1 to 3 bytes), such that dst becomes
	   word-aligned.  */

	/* If dst is at ip bytes offset from a word (with 0 < ip < 4),
	   then there are (4 - ip) bytes to fill up to align dst to the next
	   word.  */
	rsb     ip, ip, #4  /* ip = #4 - ip.  */
	cmp     ip, #2

	/* Copy byte by byte with conditionals:
	   gt (ip == 3) copies the first byte, ge (ip >= 2) the second,
	   and the last byte is copied unconditionally.  */
	itt     gt
	ldrbgt  r3, [r1], #1
	strbgt  r3, [r0], #1

	itt     ge
	ldrbge  r4, [r1], #1
	strbge  r4, [r0], #1

	/* lr is free to use as scratch: it was saved on entry and the
	   epilogue reloads the return address from the stack.  */
	ldrb    lr, [r1], #1
	strb    lr, [r0], #1

	/* Update the count.
	   ip holds the number of bytes we have just copied.  */
	subs    r2, r2, ip  /* r2 = r2 - ip.  */
	blt     copy_less_than_4  /* If r2 < ip.  */

	/* Get here if there are at least 4 bytes left to copy.
	   Check if src is aligned.  If beforehand src and dst were not word
	   aligned but congruent (same offset), then now they are both
	   word-aligned, and we can copy the rest efficiently (without
	   shifting).  */
	ands    ip, r1, #3  /* ip = last 2 bits of src.  */
	beq     word_aligned  /* If r1 is word-aligned.  */

src_not_word_aligned:
	/* Get here when src is not word-aligned, but dst is word-aligned.
	   The number of bytes that remains to copy is r2+4.  */

	/* Copy word by word using LDR when alignment can be done in hardware,
	   i.e., SCTLR.A is clear, supporting unaligned access in LDR and STR.
	   (This file is only assembled when __ARM_FEATURE_UNALIGNED is
	   defined.)  */
	subs    r2, r2, #60
	blt     8f

7:
	/* Copy 64 bytes in every loop iteration.  */
	.irp    offset, #0, #4, #8, #12, #16, #20, #24, #28, #32, #36, #40, #44, #48, #52, #56, #60
	ldr     r3, [r1, \offset]
	str     r3, [r0, \offset]
	.endr

	add     r0, r0, #64
	add     r1, r1, #64
	subs    r2, r2, #64
	bge     7b

8:
	/* Get here if less than 64 bytes to copy, -64 <= r2 < 0.
	   Check if there is more than 3 bytes to copy.  */
	adds    r2, r2, #60  /* r2 = bytes remaining - 4.  */
	blt     copy_less_than_4

9:
	/* Get here if there is less than 64 but at least 4 bytes to copy,
	   where the number of bytes to copy is r2+4.  */
	ldr     r3, [r1], #4
	str     r3, [r0], #4
	subs    r2, r2, #4
	bge     9b

	b       copy_less_than_4

	.size   __aeabi_memcpy, . - __aeabi_memcpy
246 | |
---|
247 | |
---|
/* void __aeabi_memcpy4 (void *dst, const void *src, size_t n)

   In:    r0 = dst, r1 = src, r2 = n (number of bytes to copy).
   Assumes that both of its arguments are 4-byte aligned.

   Builds the same stack frame as __aeabi_memcpy and then joins its
   word-aligned path; the return is made by the shared epilogue there,
   so control never comes back to this entry point.  */

	.syntax unified
	.global __aeabi_memcpy4
	.type   __aeabi_memcpy4, %function
__aeabi_memcpy4:
	/* Save registers exactly as __aeabi_memcpy does:
	   [sp] = r4, [sp, #4] = r5, [sp, #8] = r0, [sp, #12] = lr.  */
	push    {r0, lr}
	strd    r4, r5, [sp, #-8]!

	/* Is there at least 4 bytes to copy?  */
	subs    r2, r2, #4
	blt     copy_less_than_4  /* If n < 4.  */

	/* Here r2 = n - 4 >= 0, which is what word_aligned expects.
	   Plain branch rather than branch-and-link: this is a jump, not a
	   call, and the return address is already on the stack.  */
	b       word_aligned

	.size   __aeabi_memcpy4, . - __aeabi_memcpy4
262 | |
---|
/* void __aeabi_memcpy8 (void *dst, const void *src, size_t n)

   In:    r0 = dst, r1 = src, r2 = n (number of bytes to copy).
   Assumes that both of its arguments are 8-byte aligned.

   Builds the same stack frame as __aeabi_memcpy and then joins its
   two-word-aligned path; the return is made by the shared epilogue
   there, so control never comes back to this entry point.  */

	.syntax unified
	.global __aeabi_memcpy8
	.type   __aeabi_memcpy8, %function
__aeabi_memcpy8:
	/* Save registers exactly as __aeabi_memcpy does:
	   [sp] = r4, [sp, #4] = r5, [sp, #8] = r0, [sp, #12] = lr.  */
	push    {r0, lr}
	strd    r4, r5, [sp, #-8]!

	/* Is there at least 4 bytes to copy?  */
	subs    r2, r2, #4
	blt     copy_less_than_4  /* If n < 4.  */

	/* Is there at least 8 bytes to copy?  */
	subs    r2, r2, #4  /* r2 = n - 8.  */
	blt     copy_less_than_8  /* If n < 8.  */

	/* Is there at least 64 bytes to copy?  */
	subs    r2, r2, #56  /* r2 = n - 64.  */
	blt     copy_less_than_64  /* If n < 64.  */

	/* Here r2 = n - 64 >= 0, which is what the 64-byte loop expects.
	   Plain branch rather than branch-and-link: this is a jump, not a
	   call, and the return address is already on the stack.  */
	b       two_word_aligned

	.size   __aeabi_memcpy8, . - __aeabi_memcpy8
285 | |
---|
286 | #endif |
---|