Context Navigation

aeabi_memcpy-armv7a.S

Last change on this file was 444, checked in by satin@…, 6 years ago
add newlib,libalmos-mkh, restructure shared_syscalls.h and mini-libc
File size: 10.1 KB

Line
1	/*
2	* Copyright (c) 2014 ARM Ltd
3	* All rights reserved.
4	*
5	* Redistribution and use in source and binary forms, with or without
6	* modification, are permitted provided that the following conditions
7	* are met:
8	* 1. Redistributions of source code must retain the above copyright
9	* notice, this list of conditions and the following disclaimer.
10	* 2. Redistributions in binary form must reproduce the above copyright
11	* notice, this list of conditions and the following disclaimer in the
12	* documentation and/or other materials provided with the distribution.
13	* 3. The name of the company may not be used to endorse or promote
14	* products derived from this software without specific prior written
15	* permission.
16	*
17	* THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
18	* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
19	* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20	* IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21	* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
22	* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23	* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
24	* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
25	* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26	* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27	*/
28
29	#include "acle-compat.h"
30
31	/* Build this implementation only for ARMv7-A with unaligned access support and either NEON or a non-soft-float ABI. NOTE: This ifdef MUST match the one in aeabi_memcpy.c. */
32	#if defined (__ARM_ARCH_7A__) && defined (__ARM_FEATURE_UNALIGNED) && \
33	(defined (__ARM_NEON__) || !defined (__SOFTFP__))
34
35	.syntax unified
36	.global __aeabi_memcpy
37	.type __aeabi_memcpy, %function
38	__aeabi_memcpy:
39	/* Copy n bytes (r2) from src (r1) to dst (r0).
40	Assumes that n >= 0, and dst, src are valid pointers.
41	If dst is not word-aligned, first copy byte by byte until it is.
42	If src and dst are then both word-aligned, copy using LDRD/STRD
43	(64 and then 8 bytes at a time); otherwise copy word by word using LDR/STR on the unaligned src.
44	When less than 8 left, copy a word and then byte by byte. */
45
46	/* Save registers (r0 holds the return value):
47	optimized push {r0, r4, r5, lr}.
48	To try and improve performance, stack layout changed,
49	i.e., not keeping the stack looking like users expect
50	(highest numbered register at highest address). */
51	push {r0, lr}
52	strd r4, r5, [sp, #-8]!
53
54	/* Get copying of tiny blocks out of the way first. */
55	/* Is there at least 4 bytes to copy? From here on r2 = n - 4. */
56	subs r2, r2, #4
57	blt copy_less_than_4 /* If n < 4. */
58
59	/* Check word alignment. */
60	ands ip, r0, #3 /* ip = last 2 bits of dst. */
61	bne dst_not_word_aligned /* If dst is not word-aligned. */
62
63	/* Get here if dst is word-aligned. */
64	ands ip, r1, #3 /* ip = last 2 bits of src. */
65	bne src_not_word_aligned /* If src is not word-aligned. */
66	word_aligned:
67	/* Get here if source and dst both are word-aligned.
68	The number of bytes remaining to copy is r2+4. */
69
70	/* Is there at least 64 bytes to copy? From here on r2 = (bytes remaining) - 64. */
71	subs r2, r2, #60
72	blt copy_less_than_64 /* If r2 + 4 < 64. */
73
74	/* First, align the destination buffer to 8-bytes,
75	to make sure double loads and stores don't cross cache line boundary,
76	as they are then more expensive even if the data is in the cache
77	(require two load/store issue cycles instead of one).
78	If only one of the buffers is not 8-bytes aligned,
79	then it's more important to align dst than src,
80	because there is more penalty for stores
81	than loads that cross cacheline boundary.
82	This check and realignment are only worth doing
83	if there is a lot to copy. */
84
85	/* Get here if dst is word aligned,
86	i.e., the 2 least significant bits are 0.
87	If dst is not 2w aligned (i.e., the 3rd bit is set in dst),
88	then copy 1 word (4 bytes). */
89	ands r3, r0, #4
90	beq two_word_aligned /* If dst already two-word aligned. */
91	ldr r3, [r1], #4
92	str r3, [r0], #4
93	subs r2, r2, #4
94	blt copy_less_than_64
95
96	two_word_aligned:
97	/* TODO: Align to cacheline (useful for PLD optimization). */
98
99	/* Every loop iteration copies 64 bytes. On entry r2 = (bytes remaining) - 64 >= 0. */
100	1:
101	.irp offset, #0, #8, #16, #24, #32, #40, #48, #56
102	ldrd r4, r5, [r1, \offset]
103	strd r4, r5, [r0, \offset]
104	.endr
105
106	add r0, r0, #64
107	add r1, r1, #64
108	subs r2, r2, #64
109	bge 1b /* If there is more to copy. */
110
111	copy_less_than_64:
112
113	/* Get here if less than 64 bytes to copy, -64 <= r2 < 0.
114	Restore the count if there is more than 7 bytes to copy; afterwards r2 = (bytes remaining) - 8. */
115	adds r2, r2, #56
116	blt copy_less_than_8
117
118	/* Copy 8 bytes at a time. */
119	2:
120	ldrd r4, r5, [r1], #8
121	strd r4, r5, [r0], #8
122	subs r2, r2, #8
123	bge 2b /* If there is more to copy. */
124
125	copy_less_than_8:
126
127	/* Get here if less than 8 bytes to copy, -8 <= r2 < 0.
128	Check if there is more to copy. */
129	cmn r2, #8
130	beq return /* If r2 + 8 == 0. */
131
132	/* Restore the count if there is more than 3 bytes to copy; afterwards r2 = (bytes remaining) - 4. */
133	adds r2, r2, #4
134	blt copy_less_than_4
135
136	/* Copy 4 bytes. r2 is not updated, so on fall-through it equals the bytes still remaining (0 to 3). */
137	ldr r3, [r1], #4
138	str r3, [r0], #4
139
140	copy_less_than_4:
141	/* Get here if less than 4 bytes to copy. When branched to, -4 <= r2 < 0 and r2 + 4 is the byte count; when falling through from the word copy above, r2 is already the byte count (0 to 3). Either way the two low bits of r2 + 4 are the byte count. */
142
143	/* Restore the count, check if there is more to copy. */
144	adds r2, r2, #4
145	beq return /* If r2 == 0. */
146
147	/* Get here with the two low bits of r2 holding the number of bytes left (1 to 3, or 0 on the fall-through path). */
148	/* Logical shift left r2, insert 0s, update flags: Z is clear iff bit 0 of the old r2 was set, C = bit 1 of the old r2. */
149	lsls r2, r2, #31
150
151	/* Copy byte by byte.
152	Condition ne means the last bit of r2 was set, i.e., 1 or 3 bytes left: copy one byte.
153	Condition cs means the second to last bit of r2 was set,
154	i.e., 2 or 3 bytes left: copy two bytes. */
155	itt ne
156	ldrbne r3, [r1], #1
157	strbne r3, [r0], #1
158
159	itttt cs
160	ldrbcs r4, [r1], #1
161	ldrbcs r5, [r1]
162	strbcs r4, [r0], #1
163	strbcs r5, [r0]
164
165	return:
166	/* Restore registers: optimized pop {r0, r4, r5, pc} */
167	ldrd r4, r5, [sp], #8
168	pop {r0, pc} /* This is the only return point of memcpy. */
169
170	dst_not_word_aligned:
171
172	/* Get here when dst is not aligned and ip has the last 2 bits of dst,
173	i.e., ip is the offset of dst from word.
174	The number of bytes that remains to copy is r2 + 4,
175	i.e., there are at least 4 bytes to copy.
176	Write a partial word (1 to 3 bytes), such that dst becomes
177	word-aligned. */
178
179	/* If dst is at ip bytes offset from a word (with 0 < ip < 4),
180	then there are (4 - ip) bytes to fill up to align dst to the next
181	word. */
182	rsb ip, ip, #4 /* ip = #4 - ip. */
183	cmp ip, #2 /* gt: 3 bytes to copy; ge: 2 or 3 bytes; one byte is always copied. */
184
185	/* Copy byte by byte with conditionals. */
186	itt gt
187	ldrbgt r3, [r1], #1
188	strbgt r3, [r0], #1
189
190	itt ge
191	ldrbge r4, [r1], #1
192	strbge r4, [r0], #1
193
194	ldrb lr, [r1], #1 /* lr was saved on entry, so it is used as a scratch register here. */
195	strb lr, [r0], #1
196
197	/* Update the count.
198	ip holds the number of bytes we have just copied. */
199	subs r2, r2, ip /* r2 = r2 - ip. */
200	blt copy_less_than_4 /* If r2 < ip. */
201
202	/* Get here if there are at least 4 bytes to copy.
203	Check if src is aligned. If beforehand src and dst were not word
204	aligned but congruent (same offset), then now they are both
205	word-aligned, and we can copy the rest efficiently (without
206	shifting). */
207	ands ip, r1, #3 /* ip = last 2 bits of src. */
208	beq word_aligned /* If r1 is word-aligned. */
209
210	src_not_word_aligned:
211	/* Get here when src is not word-aligned, but dst is word-aligned.
212	The number of bytes that remains to copy is r2+4. */
213
214	/* Copy word by word using LDR when alignment can be done in hardware,
215	i.e., SCTLR.A is clear (alignment checking disabled), supporting unaligned access in LDR and STR. */
216	subs r2, r2, #60
217	blt 8f
218
219	7:
220	/* Copy 64 bytes in every loop iteration. */
221	.irp offset, #0, #4, #8, #12, #16, #20, #24, #28, #32, #36, #40, #44, #48, #52, #56, #60
222	ldr r3, [r1, \offset]
223	str r3, [r0, \offset]
224	.endr
225
226	add r0, r0, #64
227	add r1, r1, #64
228	subs r2, r2, #64
229	bge 7b
230
231	8:
232	/* Get here if less than 64 bytes to copy, -64 <= r2 < 0.
233	Check if there is more than 3 bytes to copy. */
234	adds r2, r2, #60
235	blt copy_less_than_4
236
237	9:
238	/* Get here if there is less than 64 but at least 4 bytes to copy,
239	where the number of bytes to copy is r2+4. */
240	ldr r3, [r1], #4
241	str r3, [r0], #4
242	subs r2, r2, #4
243	bge 9b
244
245	b copy_less_than_4
246
247
248	.syntax unified
249	.global __aeabi_memcpy4
250	.type __aeabi_memcpy4, %function
251	__aeabi_memcpy4:
252	/* Copy n bytes (r2) from src (r1) to dst (r0). Assumes that both of its arguments are 4-byte aligned. Continues in the shared code of __aeabi_memcpy, which restores r0, r4, r5 and returns at its `return' label, so the prologue here must build the same stack frame. */
253
254	push {r0, lr}
255	strd r4, r5, [sp, #-8]!
256
257	/* Is there at least 4 bytes to copy? From here on r2 = n - 4, as word_aligned and copy_less_than_4 expect. */
258	subs r2, r2, #4
259	blt copy_less_than_4 /* If n < 4. */
260
261	b word_aligned /* Plain branch, not a call: control never comes back here; the shared code returns to the caller through the lr saved above. */
262
263	.syntax unified
264	.global __aeabi_memcpy8
265	.type __aeabi_memcpy8, %function
266	__aeabi_memcpy8:
267	/* Copy n bytes (r2) from src (r1) to dst (r0). Assumes that both of its arguments are 8-byte aligned. Continues in the shared code of __aeabi_memcpy, which restores r0, r4, r5 and returns at its `return' label, so the prologue here must build the same stack frame. */
268
269	push {r0, lr}
270	strd r4, r5, [sp, #-8]!
271
272	/* Is there at least 4 bytes to copy? Here r2 becomes n - 4, as copy_less_than_4 expects. */
273	subs r2, r2, #4
274	blt copy_less_than_4 /* If n < 4. */
275
276	/* Is there at least 8 bytes to copy? Here r2 becomes n - 8, as copy_less_than_8 expects. */
277	subs r2, r2, #4
278	blt copy_less_than_8 /* If n < 8. */
279
280	/* Is there at least 64 bytes to copy? Here r2 becomes n - 64, as copy_less_than_64 and two_word_aligned expect. */
281	subs r2, r2, #56
282	blt copy_less_than_64 /* If n < 64. */
283
284	b two_word_aligned /* Plain branch, not a call: control never comes back here; the shared code returns to the caller through the lr saved above. */
285
286	#endif

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Original Format