Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

memcpy-armv7a.S @ 690

Last change on this file since 690 was 444, checked in by satin@…, 6 years ago
add newlib,libalmos-mkh, restructure shared_syscalls.h and mini-libc
File size: 15.2 KB

Line
1	/* Copyright (c) 2013, Linaro Limited
2	All rights reserved.
3
4	Redistribution and use in source and binary forms, with or without
5	modification, are permitted provided that the following conditions
6	are met:
7
8	* Redistributions of source code must retain the above copyright
9	notice, this list of conditions and the following disclaimer.
10
11	* Redistributions in binary form must reproduce the above copyright
12	notice, this list of conditions and the following disclaimer in the
13	documentation and/or other materials provided with the distribution.
14
15	* Neither the name of Linaro Limited nor the names of its
16	contributors may be used to endorse or promote products derived
17	from this software without specific prior written permission.
18
19	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20	"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21	LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22	A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23	HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24	SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25	LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26	DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27	THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28	(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29	OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31	This memcpy routine is optimised for Cortex-A15 cores and takes advantage
32	of VFP or NEON when built with the appropriate flags.
33
34	Assumptions:
35
36	ARMv6 (ARMv7-a if using Neon)
37	ARM state
38	Unaligned accesses
39	LDRD/STRD support unaligned word accesses
40
41	If compiled with GCC, this file should be enclosed within following
42	pre-processing check:
43	if defined (__ARM_ARCH_7A__) && defined (__ARM_FEATURE_UNALIGNED)
44
45	*/
46	.syntax unified
47	/* This implementation requires ARM state. */
48	.arm
49
50	#ifdef __ARM_NEON__
51	/* NEON build: both the USE_VFP and the USE_NEON code paths are enabled. */
52	.fpu neon
53	.arch armv7-a
54	# define FRAME_SIZE 4
55	# define USE_VFP
56	# define USE_NEON
57	/* FRAME_SIZE 4: tmp2 is the only register spilled in this configuration. */
58	#elif !defined (__SOFTFP__)
59	/* VFP without NEON: aligned copies use VFP, unaligned bulk copies use core registers. */
60	.arch armv6
61	.fpu vfpv2
62	# define FRAME_SIZE 32
63	# define USE_VFP
64	/* FRAME_SIZE 32: tmp2 at [sp], then the B, C and D register pairs at [sp, #8], [sp, #16] and [sp, #24]. */
65	#else
66	.arch armv6
67	# define FRAME_SIZE 32
68	/* Soft-float build: core registers only, same 32-byte frame layout as the VFP-only case. */
69	#endif
70
71	/* Old versions of GAS incorrectly implement the NEON align semantics. ALIGN (reg, bits) builds the address-with-alignment operand that is placed inside the brackets of the vst1 stores; defining BROKEN_ASM_NEON_ALIGN selects the "reg,:bits" spelling instead of "reg:bits". */
72	#ifdef BROKEN_ASM_NEON_ALIGN
73	#define ALIGN(addr, align) addr,:align
74	#else
75	#define ALIGN(addr, align) addr:align
76	#endif
77
78	#define PC_OFFSET 8 /* PC pipeline compensation: in ARM state a read of pc yields the address of the current instruction plus 8. */
79	#define INSN_SIZE 4 /* Size in bytes of one ARM-state instruction; used by the computed jumps. */
80
81	/* Call parameters. */
82	#define dstin r0 /* Destination as passed in; never modified so it can be returned. */
83	#define src r1 /* Source pointer, advanced as the copy proceeds. */
84	#define count r2 /* Number of bytes to copy. */
85
86	/* Locals. */
87	#define tmp1 r3
88	#define dst ip /* Working destination pointer (copy of dstin). */
89	#define tmp2 r10 /* Stored at [sp] by .Lcpy_not_short and reloaded before returning. */
90
91	#ifndef USE_NEON
92	/* For bulk copies using GP registers. */
93	#define A_l r2 /* Call-clobbered. Same register as count. */
94	#define A_h r3 /* Call-clobbered. Same register as tmp1. */
95	#define B_l r4 /* B, C and D pairs are saved to the stack frame before the bulk loops use them. */
96	#define B_h r5
97	#define C_l r6
98	#define C_h r7
99	#define D_l r8
100	#define D_h r9
101	#endif
102
103	/* Number of lines ahead to pre-fetch data. If you change this the code
104	below will need adjustment to compensate. A line is 64 bytes here,
    as in the prefetch_lines * 64 expressions used by the long-copy code. */
105
106	#define prefetch_lines 5
107
108	#ifdef USE_VFP
109	.macro cpy_line_vfp vreg, base /* Store the 64-byte line at dst + base and reload vreg with the first doubleword of the line prefetch_lines further on. */
110	vstr \vreg, [dst, #\base] /* Bytes 0-7 of the line, loaded into vreg earlier. */
111	vldr \vreg, [src, #\base] /* The caller keeps src 32 bytes ahead of dst, so this fetches bytes 32-39 of the line. */
112	vstr d0, [dst, #\base + 8] /* d0-d2 hold bytes 8-31 of the line on entry. */
113	vldr d0, [src, #\base + 8] /* Refill d0-d2 with bytes 40-63 of the line. */
114	vstr d1, [dst, #\base + 16]
115	vldr d1, [src, #\base + 16]
116	vstr d2, [dst, #\base + 24]
117	vldr d2, [src, #\base + 24]
118	vstr \vreg, [dst, #\base + 32] /* Bytes 32-39, fetched at the top of the macro. */
119	vldr \vreg, [src, #\base + prefetch_lines * 64 - 32] /* Read ahead: first doubleword of the line prefetch_lines * 64 bytes beyond this one. */
120	vstr d0, [dst, #\base + 40]
121	vldr d0, [src, #\base + 40] /* d0-d2 now pick up bytes 8-31 of the following line. */
122	vstr d1, [dst, #\base + 48]
123	vldr d1, [src, #\base + 48]
124	vstr d2, [dst, #\base + 56]
125	vldr d2, [src, #\base + 56]
126	.endm
127
128	.macro cpy_tail_vfp vreg, base /* Same line copy as cpy_line_vfp, but vreg is not reloaded from a line further ahead. */
129	vstr \vreg, [dst, #\base] /* Bytes 0-7 of the line, loaded into vreg earlier. */
130	vldr \vreg, [src, #\base] /* The caller keeps src 32 bytes ahead of dst, so this fetches bytes 32-39 of the line. */
131	vstr d0, [dst, #\base + 8] /* d0-d2 hold bytes 8-31 of the line on entry. */
132	vldr d0, [src, #\base + 8] /* Refill d0-d2 with bytes 40-63 of the line. */
133	vstr d1, [dst, #\base + 16]
134	vldr d1, [src, #\base + 16]
135	vstr d2, [dst, #\base + 24]
136	vldr d2, [src, #\base + 24]
137	vstr \vreg, [dst, #\base + 32] /* Bytes 32-39; no read-ahead load follows, unlike cpy_line_vfp. */
138	vstr d0, [dst, #\base + 40]
139	vldr d0, [src, #\base + 40] /* d0-d2 now pick up bytes 8-31 of the following line. */
140	vstr d1, [dst, #\base + 48]
141	vldr d1, [src, #\base + 48]
142	vstr d2, [dst, #\base + 56]
143	vldr d2, [src, #\base + 56]
144	.endm
145	#endif
146
147	.macro def_fn f p2align=0 /* Begin a global function named f in .text; p2align is handed to .p2align and defaults to 0. */
148	.text
149	.p2align \p2align
150	.global \f
151	.type \f, %function
152	\f: /* Entry label; the function body follows the macro invocation. */
153	.endm
154
155	def_fn memcpy p2align=6
156	/* void *memcpy (void *dstin, const void *src, size_t count): r0 = dstin, r1 = src, r2 = count. r0 is never written, so it still holds dstin on return. */
157	mov dst, dstin /* Preserve dstin, we need to return it. */
158	cmp count, #64
159	bhs .Lcpy_not_short /* Unsigned test: count is a size_t, so a value with bit 31 set is a long copy, not a short one. */
160	/* Deal with small copies quickly by dropping straight into the
161	exit block. */
162
163	.Ltail63unaligned: /* Copy (count & 63) bytes with no alignment assumption; .Lcpy_notaligned also finishes here. */
164	#ifdef USE_NEON
165	and tmp1, count, #0x38 /* Bytes to move as whole doublewords (0-56). */
166	rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
167	add pc, pc, tmp1 /* Computed jump: skip one two-instruction vld1/vst1 pair per doubleword that is not needed. */
168	vld1.8 {d0}, [src]! /* 14 words to go. */
169	vst1.8 {d0}, [dst]!
170	vld1.8 {d0}, [src]! /* 12 words to go. */
171	vst1.8 {d0}, [dst]!
172	vld1.8 {d0}, [src]! /* 10 words to go. */
173	vst1.8 {d0}, [dst]!
174	vld1.8 {d0}, [src]! /* 8 words to go. */
175	vst1.8 {d0}, [dst]!
176	vld1.8 {d0}, [src]! /* 6 words to go. */
177	vst1.8 {d0}, [dst]!
178	vld1.8 {d0}, [src]! /* 4 words to go. */
179	vst1.8 {d0}, [dst]!
180	vld1.8 {d0}, [src]! /* 2 words to go. */
181	vst1.8 {d0}, [dst]!
182
183	tst count, #4 /* Bit 2 of count: one more word to copy. */
184	ldrne tmp1, [src], #4
185	strne tmp1, [dst], #4
186	#else
187	/* Copy up to 15 full words of data. May not be aligned. */
188	/* Cannot use VFP for unaligned data. */
189	and tmp1, count, #0x3c /* Bytes to move as whole words (0-60). */
190	add dst, dst, tmp1 /* Advance both pointers past the words; the copies below use negative offsets. */
191	add src, src, tmp1
192	rsb tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
193	/* Jump directly into the sequence below at the correct offset. */
194	add pc, pc, tmp1, lsl #1 /* Doubled because each 4-byte word copied takes two 4-byte instructions. */
195
196	ldr tmp1, [src, #-60] /* 15 words to go. */
197	str tmp1, [dst, #-60]
198
199	ldr tmp1, [src, #-56] /* 14 words to go. */
200	str tmp1, [dst, #-56]
201	ldr tmp1, [src, #-52]
202	str tmp1, [dst, #-52]
203
204	ldr tmp1, [src, #-48] /* 12 words to go. */
205	str tmp1, [dst, #-48]
206	ldr tmp1, [src, #-44]
207	str tmp1, [dst, #-44]
208
209	ldr tmp1, [src, #-40] /* 10 words to go. */
210	str tmp1, [dst, #-40]
211	ldr tmp1, [src, #-36]
212	str tmp1, [dst, #-36]
213
214	ldr tmp1, [src, #-32] /* 8 words to go. */
215	str tmp1, [dst, #-32]
216	ldr tmp1, [src, #-28]
217	str tmp1, [dst, #-28]
218
219	ldr tmp1, [src, #-24] /* 6 words to go. */
220	str tmp1, [dst, #-24]
221	ldr tmp1, [src, #-20]
222	str tmp1, [dst, #-20]
223
224	ldr tmp1, [src, #-16] /* 4 words to go. */
225	str tmp1, [dst, #-16]
226	ldr tmp1, [src, #-12]
227	str tmp1, [dst, #-12]
228
229	ldr tmp1, [src, #-8] /* 2 words to go. */
230	str tmp1, [dst, #-8]
231	ldr tmp1, [src, #-4]
232	str tmp1, [dst, #-4]
233	#endif
234
235	lsls count, count, #31 /* C = bit 1 of count (a halfword remains), NE = bit 0 of count (a byte remains). */
236	ldrhcs tmp1, [src], #2
237	ldrbne src, [src] /* Src is dead, use as a scratch. */
238	strhcs tmp1, [dst], #2
239	strbne src, [dst]
240	bx lr
241
242	.Lcpy_not_short:
243	/* At least 64 bytes to copy, but don't know the alignment yet. */
244	str tmp2, [sp, #-FRAME_SIZE]! /* Open the stack frame; tmp2 is kept at [sp] and popped on every exit path. */
245	and tmp2, src, #7
246	and tmp1, dst, #7
247	cmp tmp1, tmp2
248	bne .Lcpy_notaligned /* Low three address bits differ: src and dst cannot both be doubleword aligned. */
249
250	#ifdef USE_VFP
251	/* Magic dust alert! Force VFP on Cortex-A9. Experiments show
252	that the FP pipeline is much better at streaming loads and
253	stores. This is outside the critical loop. */
254	vmov.f32 s0, s0
255	#endif
256
257	/* SRC and DST agree in their low three address bits, but we may
258	still need to pre-copy some bytes to get to natural alignment.
259	We bring DST into full 64-bit alignment. */
260	lsls tmp2, dst, #29 /* tmp2 = (dst & 7) << 29; Z is set when dst is already aligned. */
261	beq 1f
262	rsbs tmp2, tmp2, #0 /* tmp2 = (bytes needed to reach alignment) << 29; N mirrors the 4s bit. */
263	sub count, count, tmp2, lsr #29 /* Does not set flags, so N from the rsbs is still valid below. */
264	ldrmi tmp1, [src], #4
265	strmi tmp1, [dst], #4
266	lsls tmp2, tmp2, #2 /* C = 2s bit, NE = 1s bit of the bytes still needed. */
267	ldrhcs tmp1, [src], #2
268	ldrbne tmp2, [src], #1
269	strhcs tmp1, [dst], #2
270	strbne tmp2, [dst], #1
271
272	1:
273	subs tmp2, count, #64 /* Use tmp2 for count. */
274	blo .Ltail63aligned /* Unsigned: a borrow means fewer than 64 bytes remain. */
275
276	cmp tmp2, #512
277	bhs .Lcpy_body_long /* Unsigned, matching the size_t nature of the count. */
278
279	.Lcpy_body_medium: /* Count in tmp2. */
280	#ifdef USE_VFP
281	1:
282	vldr d0, [src, #0] /* Each pass copies one 64-byte block, alternating d0 and d1. */
283	subs tmp2, tmp2, #64 /* The flags set here are consumed by the bhs at the bottom of the loop. */
284	vldr d1, [src, #8]
285	vstr d0, [dst, #0]
286	vldr d0, [src, #16]
287	vstr d1, [dst, #8]
288	vldr d1, [src, #24]
289	vstr d0, [dst, #16]
290	vldr d0, [src, #32]
291	vstr d1, [dst, #24]
292	vldr d1, [src, #40]
293	vstr d0, [dst, #32]
294	vldr d0, [src, #48]
295	vstr d1, [dst, #40]
296	vldr d1, [src, #56]
297	vstr d0, [dst, #48]
298	add src, src, #64
299	vstr d1, [dst, #56]
300	add dst, dst, #64
301	bhs 1b /* No borrow from the subs above: at least one more full block remains. */
302	tst tmp2, #0x3f
303	beq .Ldone
304
305	.Ltail63aligned: /* Count in tmp2. */
306	and tmp1, tmp2, #0x38 /* Bytes to move as whole doublewords (0-56). */
307	add dst, dst, tmp1 /* Advance both pointers past them; the copies below use negative offsets. */
308	add src, src, tmp1
309	rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
310	add pc, pc, tmp1 /* Computed jump: skip one vldr/vstr pair per doubleword that is not needed. */
311
312	vldr d0, [src, #-56] /* 14 words to go. */
313	vstr d0, [dst, #-56]
314	vldr d0, [src, #-48] /* 12 words to go. */
315	vstr d0, [dst, #-48]
316	vldr d0, [src, #-40] /* 10 words to go. */
317	vstr d0, [dst, #-40]
318	vldr d0, [src, #-32] /* 8 words to go. */
319	vstr d0, [dst, #-32]
320	vldr d0, [src, #-24] /* 6 words to go. */
321	vstr d0, [dst, #-24]
322	vldr d0, [src, #-16] /* 4 words to go. */
323	vstr d0, [dst, #-16]
324	vldr d0, [src, #-8] /* 2 words to go. */
325	vstr d0, [dst, #-8]
326	#else
327	sub src, src, #8 /* Pre-bias so the last pair in the loop can use writeback by 64. */
328	sub dst, dst, #8
329	1:
330	ldrd A_l, A_h, [src, #8] /* A_l/A_h are r2/r3: count is dead here, its value lives in tmp2. */
331	strd A_l, A_h, [dst, #8]
332	ldrd A_l, A_h, [src, #16]
333	strd A_l, A_h, [dst, #16]
334	ldrd A_l, A_h, [src, #24]
335	strd A_l, A_h, [dst, #24]
336	ldrd A_l, A_h, [src, #32]
337	strd A_l, A_h, [dst, #32]
338	ldrd A_l, A_h, [src, #40]
339	strd A_l, A_h, [dst, #40]
340	ldrd A_l, A_h, [src, #48]
341	strd A_l, A_h, [dst, #48]
342	ldrd A_l, A_h, [src, #56]
343	strd A_l, A_h, [dst, #56]
344	ldrd A_l, A_h, [src, #64]!
345	strd A_l, A_h, [dst, #64]!
346	subs tmp2, tmp2, #64
347	bhs 1b /* Unsigned: loop while no borrow, i.e. another full 64-byte block remains. */
348	tst tmp2, #0x3f
349	bne 1f
350	ldr tmp2,[sp], #FRAME_SIZE
351	bx lr
352	1:
353	add src, src, #8 /* Undo the pre-bias before the tail copy. */
354	add dst, dst, #8
355
356	.Ltail63aligned: /* Count in tmp2. */
357	/* Copy up to 7 d-words of data. Similar to Ltail63unaligned, but
358	we know that the src and dest are 32-bit aligned so we can use
359	LDRD/STRD to improve efficiency. */
360	/* TMP2 is now negative, but we don't care about that. The bottom
361	six bits still tell us how many bytes are left to copy. */
362
363	and tmp1, tmp2, #0x38
364	add dst, dst, tmp1
365	add src, src, tmp1
366	rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
367	add pc, pc, tmp1 /* Computed jump: skip one ldrd/strd pair per doubleword that is not needed. */
368	ldrd A_l, A_h, [src, #-56] /* 14 words to go. */
369	strd A_l, A_h, [dst, #-56]
370	ldrd A_l, A_h, [src, #-48] /* 12 words to go. */
371	strd A_l, A_h, [dst, #-48]
372	ldrd A_l, A_h, [src, #-40] /* 10 words to go. */
373	strd A_l, A_h, [dst, #-40]
374	ldrd A_l, A_h, [src, #-32] /* 8 words to go. */
375	strd A_l, A_h, [dst, #-32]
376	ldrd A_l, A_h, [src, #-24] /* 6 words to go. */
377	strd A_l, A_h, [dst, #-24]
378	ldrd A_l, A_h, [src, #-16] /* 4 words to go. */
379	strd A_l, A_h, [dst, #-16]
380	ldrd A_l, A_h, [src, #-8] /* 2 words to go. */
381	strd A_l, A_h, [dst, #-8]
382
383	#endif
384	tst tmp2, #4 /* Bit 2 of the remaining count: one more word. */
385	ldrne tmp1, [src], #4
386	strne tmp1, [dst], #4
387	lsls tmp2, tmp2, #31 /* Count (tmp2) now dead. */
388	ldrhcs tmp1, [src], #2
389	ldrbne tmp2, [src]
390	strhcs tmp1, [dst], #2
391	strbne tmp2, [dst]
392
393	.Ldone:
394	ldr tmp2, [sp], #FRAME_SIZE /* Restore tmp2 and release the frame. */
395	bx lr
396
397	.Lcpy_body_long: /* Count in tmp2. */
398
399	/* Long copy. We know that there's at least (prefetch_lines * 64)
400	bytes to go. */
401	#ifdef USE_VFP
402	/* Don't use PLD. Instead, read some data in advance of the current
403	copy position into a register. This should act like a PLD
404	operation but we won't have to repeat the transfer. */
405
406	vldr d3, [src, #0] /* d3-d7 each hold the first doubleword of one of the next five 64-byte lines. */
407	vldr d4, [src, #64]
408	vldr d5, [src, #128]
409	vldr d6, [src, #192]
410	vldr d7, [src, #256]
411
412	vldr d0, [src, #8] /* d0-d2 hold bytes 8-31 of the first line. */
413	vldr d1, [src, #16]
414	vldr d2, [src, #24]
415	add src, src, #32 /* From here src runs 32 bytes ahead of dst, as the copy macros expect. */
416
417	subs tmp2, tmp2, #prefetch_lines * 64 * 2
418	blo 2f /* Unsigned: a borrow means there is not enough data for a full pass with read-ahead. */
419	1:
420	cpy_line_vfp d3, 0
421	cpy_line_vfp d4, 64
422	cpy_line_vfp d5, 128
423	add dst, dst, #3 * 64
424	add src, src, #3 * 64
425	cpy_line_vfp d6, 0
426	cpy_line_vfp d7, 64
427	add dst, dst, #2 * 64
428	add src, src, #2 * 64
429	subs tmp2, tmp2, #prefetch_lines * 64
430	bhs 1b /* Unsigned: repeat while no borrow occurred. */
431
432	2:
433	cpy_tail_vfp d3, 0 /* Drain the five lines already started, without reading further ahead. */
434	cpy_tail_vfp d4, 64
435	cpy_tail_vfp d5, 128
436	add src, src, #3 * 64
437	add dst, dst, #3 * 64
438	cpy_tail_vfp d6, 0
439	vstr d7, [dst, #64] /* Last line is open-coded: it stops loading after byte 39 of the line. */
440	vldr d7, [src, #64]
441	vstr d0, [dst, #64 + 8]
442	vldr d0, [src, #64 + 8]
443	vstr d1, [dst, #64 + 16]
444	vldr d1, [src, #64 + 16]
445	vstr d2, [dst, #64 + 24]
446	vldr d2, [src, #64 + 24]
447	vstr d7, [dst, #64 + 32]
448	add src, src, #96 /* 128 for the two lines minus the 32-byte lead: src and dst are level again. */
449	vstr d0, [dst, #64 + 40]
450	vstr d1, [dst, #64 + 48]
451	vstr d2, [dst, #64 + 56]
452	add dst, dst, #128
453	add tmp2, tmp2, #prefetch_lines * 64 /* Give back one of the two prefetch_lines * 64 subtracted up front. */
454	b .Lcpy_body_medium
455	#else
456	/* Long copy. Use an SMS style loop to maximize the I/O
457	bandwidth of the core. We don't have enough spare registers
458	to synthesise prefetching, so use PLD operations. */
459	/* Pre-bias src and dst. */
460	sub src, src, #8
461	sub dst, dst, #8
462	pld [src, #8]
463	pld [src, #72]
464	subs tmp2, tmp2, #64 /* Carry from here is tested by the bcs at the bottom of the loop. */
465	pld [src, #136]
466	ldrd A_l, A_h, [src, #8]
467	strd B_l, B_h, [sp, #8] /* Save the B, C and D pairs in the frame before loading data into them. */
468	ldrd B_l, B_h, [src, #16]
469	strd C_l, C_h, [sp, #16]
470	ldrd C_l, C_h, [src, #24]
471	strd D_l, D_h, [sp, #24]
472	pld [src, #200]
473	ldrd D_l, D_h, [src, #32]!
474	b 1f
475	.p2align 6
476	2:
477	pld [src, #232]
478	strd A_l, A_h, [dst, #40]
479	ldrd A_l, A_h, [src, #40]
480	strd B_l, B_h, [dst, #48]
481	ldrd B_l, B_h, [src, #48]
482	strd C_l, C_h, [dst, #56]
483	ldrd C_l, C_h, [src, #56]
484	strd D_l, D_h, [dst, #64]!
485	ldrd D_l, D_h, [src, #64]!
486	subs tmp2, tmp2, #64
487	1:
488	strd A_l, A_h, [dst, #8]
489	ldrd A_l, A_h, [src, #8]
490	strd B_l, B_h, [dst, #16]
491	ldrd B_l, B_h, [src, #16]
492	strd C_l, C_h, [dst, #24]
493	ldrd C_l, C_h, [src, #24]
494	strd D_l, D_h, [dst, #32]
495	ldrd D_l, D_h, [src, #32]
496	bcs 2b /* Carry set by the most recent subs: another 64-byte block remains. */
497	/* Save the remaining bytes and restore the callee-saved regs. */
498	strd A_l, A_h, [dst, #40]
499	add src, src, #40
500	strd B_l, B_h, [dst, #48]
501	ldrd B_l, B_h, [sp, #8]
502	strd C_l, C_h, [dst, #56]
503	ldrd C_l, C_h, [sp, #16]
504	strd D_l, D_h, [dst, #64]
505	ldrd D_l, D_h, [sp, #24]
506	add dst, dst, #72
507	tst tmp2, #0x3f
508	bne .Ltail63aligned
509	ldr tmp2, [sp], #FRAME_SIZE
510	bx lr
511	#endif
512
513	.Lcpy_notaligned:
514	pld [src]
515	pld [src, #64]
516	/* There's at least 64 bytes to copy, but there is no mutual
517	alignment. */
518	/* Bring DST to 64-bit alignment. */
519	lsls tmp2, dst, #29 /* tmp2 = (dst & 7) << 29; Z is set when dst is already aligned. */
520	pld [src, #(2 * 64)]
521	beq 1f
522	rsbs tmp2, tmp2, #0 /* tmp2 = (bytes needed to reach alignment) << 29; N mirrors the 4s bit. */
523	sub count, count, tmp2, lsr #29
524	ldrmi tmp1, [src], #4
525	strmi tmp1, [dst], #4
526	lsls tmp2, tmp2, #2 /* C = 2s bit, NE = 1s bit of the bytes still needed. */
527	ldrbne tmp1, [src], #1
528	ldrhcs tmp2, [src], #2
529	strbne tmp1, [dst], #1
530	strhcs tmp2, [dst], #2
531	1:
532	pld [src, #(3 * 64)]
533	subs count, count, #64
534	ldrlo tmp2, [sp], #FRAME_SIZE /* Unsigned: on a borrow fewer than 64 bytes remain, so pop the frame... */
535	blo .Ltail63unaligned /* ...and let the short-copy tail finish from the low six bits of count. */
536	pld [src, #(4 * 64)]
537
538	#ifdef USE_NEON
539	vld1.8 {d0-d3}, [src]! /* Two 32-byte loads fill d0-d7 with the first 64-byte block. */
540	vld1.8 {d4-d7}, [src]!
541	subs count, count, #64
542	blo 2f /* Unsigned: no further full block, just store what is loaded. */
543	1:
544	pld [src, #(4 * 64)]
545	vst1.8 {d0-d3}, [ALIGN (dst, 64)]! /* dst was brought to 64-bit alignment above. */
546	vld1.8 {d0-d3}, [src]!
547	vst1.8 {d4-d7}, [ALIGN (dst, 64)]!
548	vld1.8 {d4-d7}, [src]!
549	subs count, count, #64
550	bhs 1b /* Unsigned: loop while no borrow occurred. */
551	2:
552	vst1.8 {d0-d3}, [ALIGN (dst, 64)]!
553	vst1.8 {d4-d7}, [ALIGN (dst, 64)]!
554	ands count, count, #0x3f /* Z tells the bne below whether a tail of 1-63 bytes remains. */
555	#else
556	/* Use an SMS style loop to maximize the I/O bandwidth. */
557	sub src, src, #4 /* Pre-bias: src is read with word loads, dst written with doubleword stores. */
558	sub dst, dst, #8
559	subs tmp2, count, #64 /* Use tmp2 for count. */
560	ldr A_l, [src, #4]
561	ldr A_h, [src, #8]
562	strd B_l, B_h, [sp, #8] /* Save the B, C and D pairs in the frame before loading data into them. */
563	ldr B_l, [src, #12]
564	ldr B_h, [src, #16]
565	strd C_l, C_h, [sp, #16]
566	ldr C_l, [src, #20]
567	ldr C_h, [src, #24]
568	strd D_l, D_h, [sp, #24]
569	ldr D_l, [src, #28]
570	ldr D_h, [src, #32]!
571	b 1f
572	.p2align 6
573	2:
574	pld [src, #(5 * 64) - (32 - 4)]
575	strd A_l, A_h, [dst, #40]
576	ldr A_l, [src, #36]
577	ldr A_h, [src, #40]
578	strd B_l, B_h, [dst, #48]
579	ldr B_l, [src, #44]
580	ldr B_h, [src, #48]
581	strd C_l, C_h, [dst, #56]
582	ldr C_l, [src, #52]
583	ldr C_h, [src, #56]
584	strd D_l, D_h, [dst, #64]!
585	ldr D_l, [src, #60]
586	ldr D_h, [src, #64]!
587	subs tmp2, tmp2, #64
588	1:
589	strd A_l, A_h, [dst, #8]
590	ldr A_l, [src, #4]
591	ldr A_h, [src, #8]
592	strd B_l, B_h, [dst, #16]
593	ldr B_l, [src, #12]
594	ldr B_h, [src, #16]
595	strd C_l, C_h, [dst, #24]
596	ldr C_l, [src, #20]
597	ldr C_h, [src, #24]
598	strd D_l, D_h, [dst, #32]
599	ldr D_l, [src, #28]
600	ldr D_h, [src, #32]
601	bcs 2b /* Carry set by the most recent subs: another 64-byte block remains. */
602
603	/* Save the remaining bytes and restore the callee-saved regs. */
604	strd A_l, A_h, [dst, #40]
605	add src, src, #36
606	strd B_l, B_h, [dst, #48]
607	ldrd B_l, B_h, [sp, #8]
608	strd C_l, C_h, [dst, #56]
609	ldrd C_l, C_h, [sp, #16]
610	strd D_l, D_h, [dst, #64]
611	ldrd D_l, D_h, [sp, #24]
612	add dst, dst, #72
613	ands count, tmp2, #0x3f /* Z tells the bne below whether a tail of 1-63 bytes remains. */
614	#endif
615	ldr tmp2, [sp], #FRAME_SIZE /* Restore tmp2 and release the frame; ldr leaves the flags from ands intact. */
616	bne .Ltail63unaligned
617	bx lr
618
619	.size memcpy, . - memcpy

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Original Format