1 | /* |
---|
2 | * Copyright (c) 2013 ARM Ltd |
---|
3 | * All rights reserved. |
---|
4 | * |
---|
5 | * Redistribution and use in source and binary forms, with or without |
---|
6 | * modification, are permitted provided that the following conditions |
---|
7 | * are met: |
---|
8 | * 1. Redistributions of source code must retain the above copyright |
---|
9 | * notice, this list of conditions and the following disclaimer. |
---|
10 | * 2. Redistributions in binary form must reproduce the above copyright |
---|
11 | * notice, this list of conditions and the following disclaimer in the |
---|
12 | * documentation and/or other materials provided with the distribution. |
---|
13 | * 3. The name of the company may not be used to endorse or promote |
---|
14 | * products derived from this software without specific prior written |
---|
15 | * permission. |
---|
16 | * |
---|
17 | * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED |
---|
18 | * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF |
---|
19 | * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. |
---|
20 | * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
---|
21 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED |
---|
22 | * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
---|
23 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
---|
24 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
---|
25 | * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
---|
26 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
---|
27 | */ |
---|
28 | |
---|
29 | /* This memcpy routine is optimised for Cortex-M3/M4 cores with/without |
---|
30 | unaligned access. |
---|
31 | |
---|
32 | If compiled with GCC, this file should be enclosed within following |
---|
33 | pre-processing check: |
---|
   #if defined (__ARM_ARCH_7M__) || defined (__ARM_ARCH_7EM__)
---|
35 | |
---|
36 | Prototype: void *memcpy (void *dst, const void *src, size_t count); |
---|
37 | |
---|
38 | The job will be done in 5 steps. |
---|
39 | Step 1: Align src/dest pointers, copy mis-aligned if fail to align both |
---|
40 | Step 2: Repeatedly copy big block size of __OPT_BIG_BLOCK_SIZE |
---|
   Step 3: Repeatedly copy mid block size of __OPT_MID_BLOCK_SIZE
---|
42 | Step 4: Copy word by word |
---|
43 | Step 5: Copy byte-to-byte |
---|
44 | |
---|
45 | Tunable options: |
---|
46 | __OPT_BIG_BLOCK_SIZE: Size of big block in words. Default to 64. |
---|
   __OPT_MID_BLOCK_SIZE: Size of mid block in words. Default to 16.
---|
48 | */ |
---|
49 | #ifndef __OPT_BIG_BLOCK_SIZE |
---|
50 | #define __OPT_BIG_BLOCK_SIZE (4 * 16) |
---|
51 | #endif |
---|
52 | |
---|
53 | #ifndef __OPT_MID_BLOCK_SIZE |
---|
54 | #define __OPT_MID_BLOCK_SIZE (4 * 4) |
---|
55 | #endif |
---|
56 | |
---|
57 | #if __OPT_BIG_BLOCK_SIZE == 16 |
---|
58 | #define BEGIN_UNROLL_BIG_BLOCK \ |
---|
59 | .irp offset, 0,4,8,12 |
---|
60 | #elif __OPT_BIG_BLOCK_SIZE == 32 |
---|
61 | #define BEGIN_UNROLL_BIG_BLOCK \ |
---|
62 | .irp offset, 0,4,8,12,16,20,24,28 |
---|
63 | #elif __OPT_BIG_BLOCK_SIZE == 64 |
---|
64 | #define BEGIN_UNROLL_BIG_BLOCK \ |
---|
65 | .irp offset, 0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60 |
---|
66 | #else |
---|
67 | #error "Illegal __OPT_BIG_BLOCK_SIZE" |
---|
68 | #endif |
---|
69 | |
---|
70 | #if __OPT_MID_BLOCK_SIZE == 8 |
---|
71 | #define BEGIN_UNROLL_MID_BLOCK \ |
---|
72 | .irp offset, 0,4 |
---|
73 | #elif __OPT_MID_BLOCK_SIZE == 16 |
---|
74 | #define BEGIN_UNROLL_MID_BLOCK \ |
---|
75 | .irp offset, 0,4,8,12 |
---|
76 | #else |
---|
77 | #error "Illegal __OPT_MID_BLOCK_SIZE" |
---|
78 | #endif |
---|
79 | |
---|
80 | #define END_UNROLL .endr |
---|
81 | |
---|
82 | .syntax unified |
---|
83 | .text |
---|
84 | .align 2 |
---|
85 | .global memcpy |
---|
86 | .thumb |
---|
87 | .thumb_func |
---|
88 | .type memcpy, %function |
---|
89 | memcpy: |
---|
90 | @ r0: dst |
---|
91 | @ r1: src |
---|
92 | @ r2: len |
---|
93 | #ifdef __ARM_FEATURE_UNALIGNED |
---|
94 | /* In case of UNALIGNED access supported, ip is not used in |
---|
95 | function body. */ |
---|
96 | mov ip, r0 |
---|
97 | #else |
---|
98 | push {r0} |
---|
99 | #endif |
---|
100 | orr r3, r1, r0 |
---|
101 | ands r3, r3, #3 |
---|
102 | bne .Lmisaligned_copy |
---|
103 | |
---|
104 | .Lbig_block: |
---|
105 | subs r2, __OPT_BIG_BLOCK_SIZE |
---|
106 | blo .Lmid_block |
---|
107 | |
---|
108 | /* Kernel loop for big block copy */ |
---|
109 | .align 2 |
---|
110 | .Lbig_block_loop: |
---|
111 | BEGIN_UNROLL_BIG_BLOCK |
---|
112 | #ifdef __ARM_ARCH_7EM__ |
---|
113 | ldr r3, [r1], #4 |
---|
114 | str r3, [r0], #4 |
---|
115 | END_UNROLL |
---|
116 | #else /* __ARM_ARCH_7M__ */ |
---|
117 | ldr r3, [r1, \offset] |
---|
118 | str r3, [r0, \offset] |
---|
119 | END_UNROLL |
---|
120 | adds r0, __OPT_BIG_BLOCK_SIZE |
---|
121 | adds r1, __OPT_BIG_BLOCK_SIZE |
---|
122 | #endif |
---|
123 | subs r2, __OPT_BIG_BLOCK_SIZE |
---|
124 | bhs .Lbig_block_loop |
---|
125 | |
---|
126 | .Lmid_block: |
---|
127 | adds r2, __OPT_BIG_BLOCK_SIZE - __OPT_MID_BLOCK_SIZE |
---|
128 | blo .Lcopy_word_by_word |
---|
129 | |
---|
130 | /* Kernel loop for mid-block copy */ |
---|
131 | .align 2 |
---|
132 | .Lmid_block_loop: |
---|
133 | BEGIN_UNROLL_MID_BLOCK |
---|
134 | #ifdef __ARM_ARCH_7EM__ |
---|
135 | ldr r3, [r1], #4 |
---|
136 | str r3, [r0], #4 |
---|
137 | END_UNROLL |
---|
138 | #else /* __ARM_ARCH_7M__ */ |
---|
139 | ldr r3, [r1, \offset] |
---|
140 | str r3, [r0, \offset] |
---|
141 | END_UNROLL |
---|
142 | adds r0, __OPT_MID_BLOCK_SIZE |
---|
143 | adds r1, __OPT_MID_BLOCK_SIZE |
---|
144 | #endif |
---|
145 | subs r2, __OPT_MID_BLOCK_SIZE |
---|
146 | bhs .Lmid_block_loop |
---|
147 | |
---|
148 | .Lcopy_word_by_word: |
---|
149 | adds r2, __OPT_MID_BLOCK_SIZE - 4 |
---|
150 | blo .Lcopy_less_than_4 |
---|
151 | |
---|
152 | /* Kernel loop for small block copy */ |
---|
153 | .align 2 |
---|
154 | .Lcopy_word_by_word_loop: |
---|
155 | ldr r3, [r1], #4 |
---|
156 | str r3, [r0], #4 |
---|
157 | subs r2, #4 |
---|
158 | bhs .Lcopy_word_by_word_loop |
---|
159 | |
---|
160 | .Lcopy_less_than_4: |
---|
161 | adds r2, #4 |
---|
162 | beq .Ldone |
---|
163 | |
---|
164 | lsls r2, r2, #31 |
---|
165 | itt ne |
---|
166 | ldrbne r3, [r1], #1 |
---|
167 | strbne r3, [r0], #1 |
---|
168 | |
---|
169 | bcc .Ldone |
---|
170 | #ifdef __ARM_FEATURE_UNALIGNED |
---|
171 | ldrh r3, [r1] |
---|
172 | strh r3, [r0] |
---|
173 | #else |
---|
174 | ldrb r3, [r1] |
---|
175 | strb r3, [r0] |
---|
176 | ldrb r3, [r1, #1] |
---|
177 | strb r3, [r0, #1] |
---|
178 | #endif /* __ARM_FEATURE_UNALIGNED */ |
---|
179 | |
---|
180 | .Ldone: |
---|
181 | #ifdef __ARM_FEATURE_UNALIGNED |
---|
182 | mov r0, ip |
---|
183 | #else |
---|
184 | pop {r0} |
---|
185 | #endif |
---|
186 | bx lr |
---|
187 | |
---|
188 | .align 2 |
---|
189 | .Lmisaligned_copy: |
---|
190 | #ifdef __ARM_FEATURE_UNALIGNED |
---|
191 | /* Define label DST_ALIGNED to BIG_BLOCK. It will go to aligned copy |
---|
192 | once destination is adjusted to aligned. */ |
---|
193 | #define Ldst_aligned Lbig_block |
---|
194 | |
---|
195 | /* Copy word by word using LDR when alignment can be done in hardware, |
---|
196 | i.e., SCTLR.A is set, supporting unaligned access in LDR and STR. */ |
---|
197 | |
---|
198 | cmp r2, #8 |
---|
199 | blo .Lbyte_copy |
---|
200 | |
---|
201 | /* if src is aligned, just go to the big block loop. */ |
---|
202 | lsls r3, r1, #30 |
---|
203 | beq .Ldst_aligned |
---|
204 | #else |
---|
205 | /* if len < 12, misalignment adjustment has more overhead than |
---|
206 | just byte-to-byte copy. Also, len must >=8 to guarantee code |
---|
207 | afterward work correctly. */ |
---|
208 | cmp r2, #12 |
---|
209 | blo .Lbyte_copy |
---|
210 | #endif /* __ARM_FEATURE_UNALIGNED */ |
---|
211 | |
---|
212 | /* Align dst only, not trying to align src. That is the because |
---|
213 | handling of aligned src and misaligned dst need more overhead than |
---|
214 | otherwise. By doing this the worst case is when initial src is aligned, |
---|
215 | additional up to 4 byte additional copy will executed, which is |
---|
216 | acceptable. */ |
---|
217 | |
---|
218 | ands r3, r0, #3 |
---|
219 | beq .Ldst_aligned |
---|
220 | |
---|
221 | rsb r3, #4 |
---|
222 | subs r2, r3 |
---|
223 | |
---|
224 | lsls r3, r3, #31 |
---|
225 | itt ne |
---|
226 | ldrbne r3, [r1], #1 |
---|
227 | strbne r3, [r0], #1 |
---|
228 | |
---|
229 | bcc .Ldst_aligned |
---|
230 | |
---|
231 | #ifdef __ARM_FEATURE_UNALIGNED |
---|
232 | ldrh r3, [r1], #2 |
---|
233 | strh r3, [r0], #2 |
---|
234 | b .Ldst_aligned |
---|
235 | #else |
---|
236 | ldrb r3, [r1], #1 |
---|
237 | strb r3, [r0], #1 |
---|
238 | ldrb r3, [r1], #1 |
---|
239 | strb r3, [r0], #1 |
---|
240 | /* Now that dst is aligned */ |
---|
241 | .Ldst_aligned: |
---|
242 | /* if r1 is aligned now, it means r0/r1 has the same misalignment, |
---|
243 | and they are both aligned now. Go aligned copy. */ |
---|
244 | ands r3, r1, #3 |
---|
245 | beq .Lbig_block |
---|
246 | |
---|
247 | /* dst is aligned, but src isn't. Misaligned copy. */ |
---|
248 | |
---|
249 | push {r4, r5} |
---|
250 | subs r2, #4 |
---|
251 | |
---|
252 | /* Backward r1 by misaligned bytes, to make r1 aligned. |
---|
253 | Since we need to restore r1 to unaligned address after the loop, |
---|
254 | we need keep the offset bytes to ip and sub it from r1 afterward. */ |
---|
255 | subs r1, r3 |
---|
256 | rsb ip, r3, #4 |
---|
257 | |
---|
258 | /* Pre-load on word */ |
---|
259 | ldr r4, [r1], #4 |
---|
260 | |
---|
261 | cmp r3, #2 |
---|
262 | beq .Lmisaligned_copy_2_2 |
---|
263 | cmp r3, #3 |
---|
264 | beq .Lmisaligned_copy_3_1 |
---|
265 | |
---|
266 | .macro mis_src_copy shift |
---|
267 | 1: |
---|
268 | #ifdef __ARM_BIG_ENDIAN |
---|
269 | lsls r4, r4, \shift |
---|
270 | #else |
---|
271 | lsrs r4, r4, \shift |
---|
272 | #endif |
---|
273 | ldr r3, [r1], #4 |
---|
274 | #ifdef __ARM_BIG_ENDIAN |
---|
275 | lsrs r5, r3, 32-\shift |
---|
276 | #else |
---|
277 | lsls r5, r3, 32-\shift |
---|
278 | #endif |
---|
279 | orr r4, r4, r5 |
---|
280 | str r4, [r0], #4 |
---|
281 | mov r4, r3 |
---|
282 | subs r2, #4 |
---|
283 | bhs 1b |
---|
284 | .endm |
---|
285 | |
---|
286 | .Lmisaligned_copy_1_3: |
---|
287 | mis_src_copy shift=8 |
---|
288 | b .Lsrc_misaligned_tail |
---|
289 | |
---|
290 | .Lmisaligned_copy_3_1: |
---|
291 | mis_src_copy shift=24 |
---|
292 | b .Lsrc_misaligned_tail |
---|
293 | |
---|
294 | .Lmisaligned_copy_2_2: |
---|
295 | /* For 2_2 misalignment, ldr is still faster than 2 x ldrh. */ |
---|
296 | mis_src_copy shift=16 |
---|
297 | |
---|
298 | .Lsrc_misaligned_tail: |
---|
299 | adds r2, #4 |
---|
300 | subs r1, ip |
---|
301 | pop {r4, r5} |
---|
302 | |
---|
303 | #endif /* __ARM_FEATURE_UNALIGNED */ |
---|
304 | |
---|
305 | .Lbyte_copy: |
---|
306 | subs r2, #4 |
---|
307 | blo .Lcopy_less_than_4 |
---|
308 | |
---|
309 | .Lbyte_copy_loop: |
---|
310 | subs r2, #1 |
---|
311 | ldrb r3, [r1], #1 |
---|
312 | strb r3, [r0], #1 |
---|
313 | bhs .Lbyte_copy_loop |
---|
314 | |
---|
315 | ldrb r3, [r1] |
---|
316 | strb r3, [r0] |
---|
317 | ldrb r3, [r1, #1] |
---|
318 | strb r3, [r0, #1] |
---|
319 | ldrb r3, [r1, #2] |
---|
320 | strb r3, [r0, #2] |
---|
321 | |
---|
322 | #ifdef __ARM_FEATURE_UNALIGNED |
---|
323 | mov r0, ip |
---|
324 | #else |
---|
325 | pop {r0} |
---|
326 | #endif |
---|
327 | bx lr |
---|
328 | |
---|
329 | .size memcpy, .-memcpy |
---|