1 | /* |
---|
2 | * Copyright (c) 2013 ARM Ltd |
---|
3 | * All rights reserved. |
---|
4 | * |
---|
5 | * Redistribution and use in source and binary forms, with or without |
---|
6 | * modification, are permitted provided that the following conditions |
---|
7 | * are met: |
---|
8 | * 1. Redistributions of source code must retain the above copyright |
---|
9 | * notice, this list of conditions and the following disclaimer. |
---|
10 | * 2. Redistributions in binary form must reproduce the above copyright |
---|
11 | * notice, this list of conditions and the following disclaimer in the |
---|
12 | * documentation and/or other materials provided with the distribution. |
---|
13 | * 3. The name of the company may not be used to endorse or promote |
---|
14 | * products derived from this software without specific prior written |
---|
15 | * permission. |
---|
16 | * |
---|
17 | * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED |
---|
18 | * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF |
---|
19 | * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. |
---|
20 | * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
---|
21 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED |
---|
22 | * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
---|
23 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
---|
24 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
---|
25 | * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
---|
26 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
---|
27 | */ |
---|
28 | |
---|
29 | /* This memcpy routine is optimised for Cortex-M3/M4 cores with/without |
---|
30 | unaligned access. |
---|
31 | |
---|
32 | If compiled with GCC, this file should be enclosed within following |
---|
33 | pre-processing check: |
---|
   #if defined (__ARM_ARCH_7M__) || defined (__ARM_ARCH_7EM__)
---|
35 | |
---|
36 | Prototype: void *memcpy (void *dst, const void *src, size_t count); |
---|
37 | |
---|
38 | The job will be done in 5 steps. |
---|
39 | Step 1: Align src/dest pointers, copy mis-aligned if fail to align both |
---|
40 | Step 2: Repeatedly copy big block size of __OPT_BIG_BLOCK_SIZE |
---|
   Step 3: Repeatedly copy mid block size of __OPT_MID_BLOCK_SIZE
---|
42 | Step 4: Copy word by word |
---|
43 | Step 5: Copy byte-to-byte |
---|
44 | |
---|
45 | Tunable options: |
---|
46 | __OPT_BIG_BLOCK_SIZE: Size of big block in words. Default to 64. |
---|
   __OPT_MID_BLOCK_SIZE: Size of mid block in words. Default to 16.
---|
48 | */ |
---|
49 | #ifndef __OPT_BIG_BLOCK_SIZE |
---|
50 | #define __OPT_BIG_BLOCK_SIZE (4 * 16) |
---|
51 | #endif |
---|
52 | |
---|
53 | #ifndef __OPT_MID_BLOCK_SIZE |
---|
54 | #define __OPT_MID_BLOCK_SIZE (4 * 4) |
---|
55 | #endif |
---|
56 | |
---|
57 | #if __OPT_BIG_BLOCK_SIZE == 16 |
---|
58 | #define BEGIN_UNROLL_BIG_BLOCK \ |
---|
59 | .irp offset, 0,4,8,12 |
---|
60 | #elif __OPT_BIG_BLOCK_SIZE == 32 |
---|
61 | #define BEGIN_UNROLL_BIG_BLOCK \ |
---|
62 | .irp offset, 0,4,8,12,16,20,24,28 |
---|
63 | #elif __OPT_BIG_BLOCK_SIZE == 64 |
---|
64 | #define BEGIN_UNROLL_BIG_BLOCK \ |
---|
65 | .irp offset, 0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60 |
---|
66 | #else |
---|
67 | #error "Illegal __OPT_BIG_BLOCK_SIZE" |
---|
68 | #endif |
---|
69 | |
---|
70 | #if __OPT_MID_BLOCK_SIZE == 8 |
---|
71 | #define BEGIN_UNROLL_MID_BLOCK \ |
---|
72 | .irp offset, 0,4 |
---|
73 | #elif __OPT_MID_BLOCK_SIZE == 16 |
---|
74 | #define BEGIN_UNROLL_MID_BLOCK \ |
---|
75 | .irp offset, 0,4,8,12 |
---|
76 | #else |
---|
77 | #error "Illegal __OPT_MID_BLOCK_SIZE" |
---|
78 | #endif |
---|
79 | |
---|
80 | #define END_UNROLL .endr |
---|
81 | |
---|
82 | .syntax unified |
---|
83 | .text |
---|
84 | .align 2 |
---|
85 | .global memcpy |
---|
86 | .thumb |
---|
87 | .thumb_func |
---|
88 | .type memcpy, %function |
---|
89 | memcpy: |
---|
90 | @ r0: dst |
---|
91 | @ r1: src |
---|
92 | @ r2: len |
---|
93 | #ifdef __ARM_FEATURE_UNALIGNED |
---|
94 | /* In case of UNALIGNED access supported, ip is not used in |
---|
95 | function body. */ |
---|
96 | mov ip, r0 |
---|
97 | #else |
---|
98 | push {r0} |
---|
99 | #endif |
---|
100 | orr r3, r1, r0 |
---|
101 | ands r3, r3, #3 |
---|
102 | bne .Lmisaligned_copy |
---|
103 | |
---|
104 | .Lbig_block: |
---|
105 | subs r2, __OPT_BIG_BLOCK_SIZE |
---|
106 | blo .Lmid_block |
---|
107 | |
---|
108 | /* Kernel loop for big block copy */ |
---|
109 | .align 2 |
---|
110 | .Lbig_block_loop: |
---|
111 | BEGIN_UNROLL_BIG_BLOCK |
---|
112 | #ifdef __ARM_ARCH_7EM__ |
---|
113 | ldr r3, [r1], #4 |
---|
114 | str r3, [r0], #4 |
---|
115 | END_UNROLL |
---|
116 | #else /* __ARM_ARCH_7M__ */ |
---|
117 | ldr r3, [r1, \offset] |
---|
118 | str r3, [r0, \offset] |
---|
119 | END_UNROLL |
---|
120 | adds r0, __OPT_BIG_BLOCK_SIZE |
---|
121 | adds r1, __OPT_BIG_BLOCK_SIZE |
---|
122 | #endif |
---|
123 | subs r2, __OPT_BIG_BLOCK_SIZE |
---|
124 | bhs .Lbig_block_loop |
---|
125 | |
---|
126 | .Lmid_block: |
---|
127 | adds r2, __OPT_BIG_BLOCK_SIZE - __OPT_MID_BLOCK_SIZE |
---|
128 | blo .Lcopy_word_by_word |
---|
129 | |
---|
130 | /* Kernel loop for mid-block copy */ |
---|
131 | .align 2 |
---|
132 | .Lmid_block_loop: |
---|
133 | BEGIN_UNROLL_MID_BLOCK |
---|
134 | #ifdef __ARM_ARCH_7EM__ |
---|
135 | ldr r3, [r1], #4 |
---|
136 | str r3, [r0], #4 |
---|
137 | END_UNROLL |
---|
138 | #else /* __ARM_ARCH_7M__ */ |
---|
139 | ldr r3, [r1, \offset] |
---|
140 | str r3, [r0, \offset] |
---|
141 | END_UNROLL |
---|
142 | adds r0, __OPT_MID_BLOCK_SIZE |
---|
143 | adds r1, __OPT_MID_BLOCK_SIZE |
---|
144 | #endif |
---|
145 | subs r2, __OPT_MID_BLOCK_SIZE |
---|
146 | bhs .Lmid_block_loop |
---|
147 | |
---|
148 | .Lcopy_word_by_word: |
---|
149 | adds r2, __OPT_MID_BLOCK_SIZE - 4 |
---|
150 | blo .Lcopy_less_than_4 |
---|
151 | |
---|
152 | /* Kernel loop for small block copy */ |
---|
153 | .align 2 |
---|
154 | .Lcopy_word_by_word_loop: |
---|
155 | ldr r3, [r1], #4 |
---|
156 | str r3, [r0], #4 |
---|
157 | subs r2, #4 |
---|
158 | bhs .Lcopy_word_by_word_loop |
---|
159 | |
---|
160 | .Lcopy_less_than_4: |
---|
161 | adds r2, #4 |
---|
162 | beq .Ldone |
---|
163 | |
---|
164 | lsls r2, r2, #31 |
---|
165 | itt ne |
---|
166 | ldrbne r3, [r1], #1 |
---|
167 | strbne r3, [r0], #1 |
---|
168 | |
---|
169 | bcc .Ldone |
---|
170 | #ifdef __ARM_FEATURE_UNALIGNED |
---|
171 | ldrh r3, [r1] |
---|
172 | strh r3, [r0] |
---|
173 | #else |
---|
174 | ldrb r3, [r1] |
---|
175 | strb r3, [r0] |
---|
176 | ldrb r3, [r1, #1] |
---|
177 | strb r3, [r0, #1] |
---|
178 | #endif /* __ARM_FEATURE_UNALIGNED */ |
---|
179 | |
---|
180 | .Ldone: |
---|
181 | #ifdef __ARM_FEATURE_UNALIGNED |
---|
182 | mov r0, ip |
---|
183 | #else |
---|
184 | pop {r0} |
---|
185 | #endif |
---|
186 | bx lr |
---|
187 | |
---|
188 | .align 2 |
---|
189 | .Lmisaligned_copy: |
---|
190 | #ifdef __ARM_FEATURE_UNALIGNED |
---|
191 | /* Define label DST_ALIGNED to BIG_BLOCK. It will go to aligned copy |
---|
192 | once destination is adjusted to aligned. */ |
---|
193 | #define Ldst_aligned Lbig_block |
---|
194 | |
---|
195 | /* Copy word by word using LDR when alignment can be done in hardware, |
---|
196 | i.e., SCTLR.A is set, supporting unaligned access in LDR and STR. */ |
---|
197 | |
---|
198 | cmp r2, #8 |
---|
199 | blo .Lbyte_copy |
---|
200 | |
---|
201 | /* if src is aligned, just go to the big block loop. */ |
---|
202 | lsls r3, r1, #30 |
---|
203 | beq .Ldst_aligned |
---|
204 | #else |
---|
205 | /* if len < 12, misalignment adjustment has more overhead than |
---|
206 | just byte-to-byte copy. Also, len must >=8 to guarantee code |
---|
207 | afterward work correctly. */ |
---|
208 | cmp r2, #12 |
---|
209 | blo .Lbyte_copy |
---|
210 | #endif /* __ARM_FEATURE_UNALIGNED */ |
---|
211 | |
---|
212 | /* Align dst only, not trying to align src. That is the because |
---|
213 | handling of aligned src and misaligned dst need more overhead than |
---|
214 | otherwise. By doing this the worst case is when initial src is aligned, |
---|
215 | additional up to 4 byte additional copy will executed, which is |
---|
216 | acceptable. */ |
---|
217 | |
---|
218 | ands r3, r0, #3 |
---|
219 | beq .Ldst_aligned |
---|
220 | |
---|
221 | rsb r3, #4 |
---|
222 | subs r2, r3 |
---|
223 | |
---|
224 | lsls r3, r3, #31 |
---|
225 | itt ne |
---|
226 | ldrbne r3, [r1], #1 |
---|
227 | strbne r3, [r0], #1 |
---|
228 | |
---|
229 | bcc .Ldst_aligned |
---|
230 | |
---|
231 | #ifdef __ARM_FEATURE_UNALIGNED |
---|
232 | ldrh r3, [r1], #2 |
---|
233 | strh r3, [r0], #2 |
---|
234 | b .Ldst_aligned |
---|
235 | #else |
---|
236 | ldrb r3, [r1], #1 |
---|
237 | strb r3, [r0], #1 |
---|
238 | ldrb r3, [r1], #1 |
---|
239 | strb r3, [r0], #1 |
---|
240 | /* Now that dst is aligned */ |
---|
241 | .Ldst_aligned: |
---|
242 | /* if r1 is aligned now, it means r0/r1 has the same misalignment, |
---|
243 | and they are both aligned now. Go aligned copy. */ |
---|
244 | ands r3, r1, #3 |
---|
245 | beq .Lbig_block |
---|
246 | |
---|
247 | /* dst is aligned, but src isn't. Misaligned copy. */ |
---|
248 | |
---|
249 | push {r4, r5} |
---|
250 | subs r2, #4 |
---|
251 | |
---|
252 | /* Backward r1 by misaligned bytes, to make r1 aligned. |
---|
253 | Since we need to restore r1 to unaligned address after the loop, |
---|
254 | we need keep the offset bytes to ip and sub it from r1 afterward. */ |
---|
255 | subs r1, r3 |
---|
256 | rsb ip, r3, #4 |
---|
257 | |
---|
258 | /* Pre-load on word */ |
---|
259 | ldr r4, [r1], #4 |
---|
260 | |
---|
261 | cmp r3, #2 |
---|
262 | beq .Lmisaligned_copy_2_2 |
---|
263 | cmp r3, #3 |
---|
264 | beq .Lmisaligned_copy_3_1 |
---|
265 | |
---|
266 | .macro mis_src_copy shift |
---|
267 | 1: |
---|
268 | #ifdef __ARM_BIG_ENDIAN |
---|
269 | lsls r4, r4, \shift |
---|
270 | #else |
---|
271 | lsrs r4, r4, \shift |
---|
272 | #endif |
---|
273 | ldr r3, [r1], #4 |
---|
274 | #ifdef __ARM_BIG_ENDIAN |
---|
275 | lsrs r5, r3, 32-\shift |
---|
276 | #else |
---|
277 | lsls r5, r3, 32-\shift |
---|
278 | #endif |
---|
279 | orr r4, r4, r5 |
---|
280 | str r4, [r0], #4 |
---|
281 | mov r4, r3 |
---|
282 | subs r2, #4 |
---|
283 | bhs 1b |
---|
284 | .endm |
---|
285 | |
---|
286 | .Lmisaligned_copy_1_3: |
---|
287 | mis_src_copy shift=8 |
---|
288 | b .Lsrc_misaligned_tail |
---|
289 | |
---|
290 | .Lmisaligned_copy_3_1: |
---|
291 | mis_src_copy shift=24 |
---|
292 | b .Lsrc_misaligned_tail |
---|
293 | |
---|
294 | .Lmisaligned_copy_2_2: |
---|
295 | /* For 2_2 misalignment, ldr is still faster than 2 x ldrh. */ |
---|
296 | mis_src_copy shift=16 |
---|
297 | |
---|
298 | .Lsrc_misaligned_tail: |
---|
299 | adds r2, #4 |
---|
300 | subs r1, ip |
---|
301 | pop {r4, r5} |
---|
302 | |
---|
303 | #endif /* __ARM_FEATURE_UNALIGNED */ |
---|
304 | |
---|
305 | .Lbyte_copy: |
---|
306 | subs r2, #4 |
---|
307 | blo .Lcopy_less_than_4 |
---|
308 | |
---|
309 | .Lbyte_copy_loop: |
---|
310 | subs r2, #1 |
---|
311 | ldrb r3, [r1], #1 |
---|
312 | strb r3, [r0], #1 |
---|
313 | bhs .Lbyte_copy_loop |
---|
314 | |
---|
315 | ldrb r3, [r1] |
---|
316 | strb r3, [r0] |
---|
317 | ldrb r3, [r1, #1] |
---|
318 | strb r3, [r0, #1] |
---|
319 | ldrb r3, [r1, #2] |
---|
320 | strb r3, [r0, #2] |
---|
321 | |
---|
322 | #ifdef __ARM_FEATURE_UNALIGNED |
---|
323 | mov r0, ip |
---|
324 | #else |
---|
325 | pop {r0} |
---|
326 | #endif |
---|
327 | bx lr |
---|
328 | |
---|
329 | .size memcpy, .-memcpy |
---|