/* Copyright (c) 2013, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

      * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.

      * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

      * Neither the name of Linaro Limited nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

   This memcpy routine is optimised for Cortex-A15 cores and takes advantage
   of VFP or NEON when built with the appropriate flags.

   Assumptions:

    ARMv6 (ARMv7-a if using Neon)
    ARM state
    Unaligned accesses
    LDRD/STRD support unaligned word accesses

   If compiled with GCC, this file should be enclosed within the following
   pre-processing check:
   if defined (__ARM_ARCH_7A__) && defined (__ARM_FEATURE_UNALIGNED)

 */
| 46 | .syntax unified |
---|
| 47 | /* This implementation requires ARM state. */ |
---|
| 48 | .arm |
---|
| 49 | |
---|
| 50 | #ifdef __ARM_NEON__ |
---|
| 51 | |
---|
| 52 | .fpu neon |
---|
| 53 | .arch armv7-a |
---|
| 54 | # define FRAME_SIZE 4 |
---|
| 55 | # define USE_VFP |
---|
| 56 | # define USE_NEON |
---|
| 57 | |
---|
| 58 | #elif !defined (__SOFTFP__) |
---|
| 59 | |
---|
| 60 | .arch armv6 |
---|
| 61 | .fpu vfpv2 |
---|
| 62 | # define FRAME_SIZE 32 |
---|
| 63 | # define USE_VFP |
---|
| 64 | |
---|
| 65 | #else |
---|
| 66 | .arch armv6 |
---|
| 67 | # define FRAME_SIZE 32 |
---|
| 68 | |
---|
| 69 | #endif |
---|
| 70 | |
---|
| 71 | /* Old versions of GAS incorrectly implement the NEON align semantics. */ |
---|
| 72 | #ifdef BROKEN_ASM_NEON_ALIGN |
---|
| 73 | #define ALIGN(addr, align) addr,:align |
---|
| 74 | #else |
---|
| 75 | #define ALIGN(addr, align) addr:align |
---|
| 76 | #endif |
---|
| 77 | |
---|
| 78 | #define PC_OFFSET 8 /* PC pipeline compensation. */ |
---|
| 79 | #define INSN_SIZE 4 |
---|
| 80 | |
---|
| 81 | /* Call parameters. */ |
---|
| 82 | #define dstin r0 |
---|
| 83 | #define src r1 |
---|
| 84 | #define count r2 |
---|
| 85 | |
---|
| 86 | /* Locals. */ |
---|
| 87 | #define tmp1 r3 |
---|
| 88 | #define dst ip |
---|
| 89 | #define tmp2 r10 |
---|
| 90 | |
---|
| 91 | #ifndef USE_NEON |
---|
| 92 | /* For bulk copies using GP registers. */ |
---|
| 93 | #define A_l r2 /* Call-clobbered. */ |
---|
| 94 | #define A_h r3 /* Call-clobbered. */ |
---|
| 95 | #define B_l r4 |
---|
| 96 | #define B_h r5 |
---|
| 97 | #define C_l r6 |
---|
| 98 | #define C_h r7 |
---|
| 99 | #define D_l r8 |
---|
| 100 | #define D_h r9 |
---|
| 101 | #endif |
---|
| 102 | |
---|
| 103 | /* Number of lines ahead to pre-fetch data. If you change this the code |
---|
| 104 | below will need adjustment to compensate. */ |
---|
| 105 | |
---|
| 106 | #define prefetch_lines 5 |
---|
| 107 | |
---|
| 108 | #ifdef USE_VFP |
---|
| 109 | .macro cpy_line_vfp vreg, base |
---|
| 110 | vstr \vreg, [dst, #\base] |
---|
| 111 | vldr \vreg, [src, #\base] |
---|
| 112 | vstr d0, [dst, #\base + 8] |
---|
| 113 | vldr d0, [src, #\base + 8] |
---|
| 114 | vstr d1, [dst, #\base + 16] |
---|
| 115 | vldr d1, [src, #\base + 16] |
---|
| 116 | vstr d2, [dst, #\base + 24] |
---|
| 117 | vldr d2, [src, #\base + 24] |
---|
| 118 | vstr \vreg, [dst, #\base + 32] |
---|
| 119 | vldr \vreg, [src, #\base + prefetch_lines * 64 - 32] |
---|
| 120 | vstr d0, [dst, #\base + 40] |
---|
| 121 | vldr d0, [src, #\base + 40] |
---|
| 122 | vstr d1, [dst, #\base + 48] |
---|
| 123 | vldr d1, [src, #\base + 48] |
---|
| 124 | vstr d2, [dst, #\base + 56] |
---|
| 125 | vldr d2, [src, #\base + 56] |
---|
| 126 | .endm |
---|
| 127 | |
---|
| 128 | .macro cpy_tail_vfp vreg, base |
---|
| 129 | vstr \vreg, [dst, #\base] |
---|
| 130 | vldr \vreg, [src, #\base] |
---|
| 131 | vstr d0, [dst, #\base + 8] |
---|
| 132 | vldr d0, [src, #\base + 8] |
---|
| 133 | vstr d1, [dst, #\base + 16] |
---|
| 134 | vldr d1, [src, #\base + 16] |
---|
| 135 | vstr d2, [dst, #\base + 24] |
---|
| 136 | vldr d2, [src, #\base + 24] |
---|
| 137 | vstr \vreg, [dst, #\base + 32] |
---|
| 138 | vstr d0, [dst, #\base + 40] |
---|
| 139 | vldr d0, [src, #\base + 40] |
---|
| 140 | vstr d1, [dst, #\base + 48] |
---|
| 141 | vldr d1, [src, #\base + 48] |
---|
| 142 | vstr d2, [dst, #\base + 56] |
---|
| 143 | vldr d2, [src, #\base + 56] |
---|
| 144 | .endm |
---|
| 145 | #endif |
---|
| 146 | |
---|
| 147 | .macro def_fn f p2align=0 |
---|
| 148 | .text |
---|
| 149 | .p2align \p2align |
---|
| 150 | .global \f |
---|
| 151 | .type \f, %function |
---|
| 152 | \f: |
---|
| 153 | .endm |
---|
| 154 | |
---|
| 155 | def_fn memcpy p2align=6 |
---|
| 156 | |
---|
| 157 | mov dst, dstin /* Preserve dstin, we need to return it. */ |
---|
| 158 | cmp count, #64 |
---|
| 159 | bge .Lcpy_not_short |
---|
| 160 | /* Deal with small copies quickly by dropping straight into the |
---|
| 161 | exit block. */ |
---|
| 162 | |
---|
| 163 | .Ltail63unaligned: |
---|
| 164 | #ifdef USE_NEON |
---|
| 165 | and tmp1, count, #0x38 |
---|
| 166 | rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE) |
---|
| 167 | add pc, pc, tmp1 |
---|
| 168 | vld1.8 {d0}, [src]! /* 14 words to go. */ |
---|
| 169 | vst1.8 {d0}, [dst]! |
---|
| 170 | vld1.8 {d0}, [src]! /* 12 words to go. */ |
---|
| 171 | vst1.8 {d0}, [dst]! |
---|
| 172 | vld1.8 {d0}, [src]! /* 10 words to go. */ |
---|
| 173 | vst1.8 {d0}, [dst]! |
---|
| 174 | vld1.8 {d0}, [src]! /* 8 words to go. */ |
---|
| 175 | vst1.8 {d0}, [dst]! |
---|
| 176 | vld1.8 {d0}, [src]! /* 6 words to go. */ |
---|
| 177 | vst1.8 {d0}, [dst]! |
---|
| 178 | vld1.8 {d0}, [src]! /* 4 words to go. */ |
---|
| 179 | vst1.8 {d0}, [dst]! |
---|
| 180 | vld1.8 {d0}, [src]! /* 2 words to go. */ |
---|
| 181 | vst1.8 {d0}, [dst]! |
---|
| 182 | |
---|
| 183 | tst count, #4 |
---|
| 184 | ldrne tmp1, [src], #4 |
---|
| 185 | strne tmp1, [dst], #4 |
---|
| 186 | #else |
---|
| 187 | /* Copy up to 15 full words of data. May not be aligned. */ |
---|
| 188 | /* Cannot use VFP for unaligned data. */ |
---|
| 189 | and tmp1, count, #0x3c |
---|
| 190 | add dst, dst, tmp1 |
---|
| 191 | add src, src, tmp1 |
---|
| 192 | rsb tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2) |
---|
| 193 | /* Jump directly into the sequence below at the correct offset. */ |
---|
| 194 | add pc, pc, tmp1, lsl #1 |
---|
| 195 | |
---|
| 196 | ldr tmp1, [src, #-60] /* 15 words to go. */ |
---|
| 197 | str tmp1, [dst, #-60] |
---|
| 198 | |
---|
| 199 | ldr tmp1, [src, #-56] /* 14 words to go. */ |
---|
| 200 | str tmp1, [dst, #-56] |
---|
| 201 | ldr tmp1, [src, #-52] |
---|
| 202 | str tmp1, [dst, #-52] |
---|
| 203 | |
---|
| 204 | ldr tmp1, [src, #-48] /* 12 words to go. */ |
---|
| 205 | str tmp1, [dst, #-48] |
---|
| 206 | ldr tmp1, [src, #-44] |
---|
| 207 | str tmp1, [dst, #-44] |
---|
| 208 | |
---|
| 209 | ldr tmp1, [src, #-40] /* 10 words to go. */ |
---|
| 210 | str tmp1, [dst, #-40] |
---|
| 211 | ldr tmp1, [src, #-36] |
---|
| 212 | str tmp1, [dst, #-36] |
---|
| 213 | |
---|
| 214 | ldr tmp1, [src, #-32] /* 8 words to go. */ |
---|
| 215 | str tmp1, [dst, #-32] |
---|
| 216 | ldr tmp1, [src, #-28] |
---|
| 217 | str tmp1, [dst, #-28] |
---|
| 218 | |
---|
| 219 | ldr tmp1, [src, #-24] /* 6 words to go. */ |
---|
| 220 | str tmp1, [dst, #-24] |
---|
| 221 | ldr tmp1, [src, #-20] |
---|
| 222 | str tmp1, [dst, #-20] |
---|
| 223 | |
---|
| 224 | ldr tmp1, [src, #-16] /* 4 words to go. */ |
---|
| 225 | str tmp1, [dst, #-16] |
---|
| 226 | ldr tmp1, [src, #-12] |
---|
| 227 | str tmp1, [dst, #-12] |
---|
| 228 | |
---|
| 229 | ldr tmp1, [src, #-8] /* 2 words to go. */ |
---|
| 230 | str tmp1, [dst, #-8] |
---|
| 231 | ldr tmp1, [src, #-4] |
---|
| 232 | str tmp1, [dst, #-4] |
---|
| 233 | #endif |
---|
| 234 | |
---|
| 235 | lsls count, count, #31 |
---|
| 236 | ldrhcs tmp1, [src], #2 |
---|
| 237 | ldrbne src, [src] /* Src is dead, use as a scratch. */ |
---|
| 238 | strhcs tmp1, [dst], #2 |
---|
| 239 | strbne src, [dst] |
---|
| 240 | bx lr |
---|
| 241 | |
---|
| 242 | .Lcpy_not_short: |
---|
| 243 | /* At least 64 bytes to copy, but don't know the alignment yet. */ |
---|
| 244 | str tmp2, [sp, #-FRAME_SIZE]! |
---|
| 245 | and tmp2, src, #7 |
---|
| 246 | and tmp1, dst, #7 |
---|
| 247 | cmp tmp1, tmp2 |
---|
| 248 | bne .Lcpy_notaligned |
---|
| 249 | |
---|
| 250 | #ifdef USE_VFP |
---|
| 251 | /* Magic dust alert! Force VFP on Cortex-A9. Experiments show |
---|
| 252 | that the FP pipeline is much better at streaming loads and |
---|
| 253 | stores. This is outside the critical loop. */ |
---|
| 254 | vmov.f32 s0, s0 |
---|
| 255 | #endif |
---|
| 256 | |
---|
| 257 | /* SRC and DST have the same mutual 32-bit alignment, but we may |
---|
| 258 | still need to pre-copy some bytes to get to natural alignment. |
---|
| 259 | We bring DST into full 64-bit alignment. */ |
---|
| 260 | lsls tmp2, dst, #29 |
---|
| 261 | beq 1f |
---|
| 262 | rsbs tmp2, tmp2, #0 |
---|
| 263 | sub count, count, tmp2, lsr #29 |
---|
| 264 | ldrmi tmp1, [src], #4 |
---|
| 265 | strmi tmp1, [dst], #4 |
---|
| 266 | lsls tmp2, tmp2, #2 |
---|
| 267 | ldrhcs tmp1, [src], #2 |
---|
| 268 | ldrbne tmp2, [src], #1 |
---|
| 269 | strhcs tmp1, [dst], #2 |
---|
| 270 | strbne tmp2, [dst], #1 |
---|
| 271 | |
---|
| 272 | 1: |
---|
| 273 | subs tmp2, count, #64 /* Use tmp2 for count. */ |
---|
| 274 | blt .Ltail63aligned |
---|
| 275 | |
---|
| 276 | cmp tmp2, #512 |
---|
| 277 | bge .Lcpy_body_long |
---|
| 278 | |
---|
| 279 | .Lcpy_body_medium: /* Count in tmp2. */ |
---|
| 280 | #ifdef USE_VFP |
---|
| 281 | 1: |
---|
| 282 | vldr d0, [src, #0] |
---|
| 283 | subs tmp2, tmp2, #64 |
---|
| 284 | vldr d1, [src, #8] |
---|
| 285 | vstr d0, [dst, #0] |
---|
| 286 | vldr d0, [src, #16] |
---|
| 287 | vstr d1, [dst, #8] |
---|
| 288 | vldr d1, [src, #24] |
---|
| 289 | vstr d0, [dst, #16] |
---|
| 290 | vldr d0, [src, #32] |
---|
| 291 | vstr d1, [dst, #24] |
---|
| 292 | vldr d1, [src, #40] |
---|
| 293 | vstr d0, [dst, #32] |
---|
| 294 | vldr d0, [src, #48] |
---|
| 295 | vstr d1, [dst, #40] |
---|
| 296 | vldr d1, [src, #56] |
---|
| 297 | vstr d0, [dst, #48] |
---|
| 298 | add src, src, #64 |
---|
| 299 | vstr d1, [dst, #56] |
---|
| 300 | add dst, dst, #64 |
---|
| 301 | bge 1b |
---|
| 302 | tst tmp2, #0x3f |
---|
| 303 | beq .Ldone |
---|
| 304 | |
---|
| 305 | .Ltail63aligned: /* Count in tmp2. */ |
---|
| 306 | and tmp1, tmp2, #0x38 |
---|
| 307 | add dst, dst, tmp1 |
---|
| 308 | add src, src, tmp1 |
---|
| 309 | rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE) |
---|
| 310 | add pc, pc, tmp1 |
---|
| 311 | |
---|
| 312 | vldr d0, [src, #-56] /* 14 words to go. */ |
---|
| 313 | vstr d0, [dst, #-56] |
---|
| 314 | vldr d0, [src, #-48] /* 12 words to go. */ |
---|
| 315 | vstr d0, [dst, #-48] |
---|
| 316 | vldr d0, [src, #-40] /* 10 words to go. */ |
---|
| 317 | vstr d0, [dst, #-40] |
---|
| 318 | vldr d0, [src, #-32] /* 8 words to go. */ |
---|
| 319 | vstr d0, [dst, #-32] |
---|
| 320 | vldr d0, [src, #-24] /* 6 words to go. */ |
---|
| 321 | vstr d0, [dst, #-24] |
---|
| 322 | vldr d0, [src, #-16] /* 4 words to go. */ |
---|
| 323 | vstr d0, [dst, #-16] |
---|
| 324 | vldr d0, [src, #-8] /* 2 words to go. */ |
---|
| 325 | vstr d0, [dst, #-8] |
---|
| 326 | #else |
---|
| 327 | sub src, src, #8 |
---|
| 328 | sub dst, dst, #8 |
---|
| 329 | 1: |
---|
| 330 | ldrd A_l, A_h, [src, #8] |
---|
| 331 | strd A_l, A_h, [dst, #8] |
---|
| 332 | ldrd A_l, A_h, [src, #16] |
---|
| 333 | strd A_l, A_h, [dst, #16] |
---|
| 334 | ldrd A_l, A_h, [src, #24] |
---|
| 335 | strd A_l, A_h, [dst, #24] |
---|
| 336 | ldrd A_l, A_h, [src, #32] |
---|
| 337 | strd A_l, A_h, [dst, #32] |
---|
| 338 | ldrd A_l, A_h, [src, #40] |
---|
| 339 | strd A_l, A_h, [dst, #40] |
---|
| 340 | ldrd A_l, A_h, [src, #48] |
---|
| 341 | strd A_l, A_h, [dst, #48] |
---|
| 342 | ldrd A_l, A_h, [src, #56] |
---|
| 343 | strd A_l, A_h, [dst, #56] |
---|
| 344 | ldrd A_l, A_h, [src, #64]! |
---|
| 345 | strd A_l, A_h, [dst, #64]! |
---|
| 346 | subs tmp2, tmp2, #64 |
---|
| 347 | bge 1b |
---|
| 348 | tst tmp2, #0x3f |
---|
| 349 | bne 1f |
---|
| 350 | ldr tmp2,[sp], #FRAME_SIZE |
---|
| 351 | bx lr |
---|
| 352 | 1: |
---|
| 353 | add src, src, #8 |
---|
| 354 | add dst, dst, #8 |
---|
| 355 | |
---|
| 356 | .Ltail63aligned: /* Count in tmp2. */ |
---|
| 357 | /* Copy up to 7 d-words of data. Similar to Ltail63unaligned, but |
---|
| 358 | we know that the src and dest are 32-bit aligned so we can use |
---|
| 359 | LDRD/STRD to improve efficiency. */ |
---|
| 360 | /* TMP2 is now negative, but we don't care about that. The bottom |
---|
| 361 | six bits still tell us how many bytes are left to copy. */ |
---|
| 362 | |
---|
| 363 | and tmp1, tmp2, #0x38 |
---|
| 364 | add dst, dst, tmp1 |
---|
| 365 | add src, src, tmp1 |
---|
| 366 | rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE) |
---|
| 367 | add pc, pc, tmp1 |
---|
| 368 | ldrd A_l, A_h, [src, #-56] /* 14 words to go. */ |
---|
| 369 | strd A_l, A_h, [dst, #-56] |
---|
| 370 | ldrd A_l, A_h, [src, #-48] /* 12 words to go. */ |
---|
| 371 | strd A_l, A_h, [dst, #-48] |
---|
| 372 | ldrd A_l, A_h, [src, #-40] /* 10 words to go. */ |
---|
| 373 | strd A_l, A_h, [dst, #-40] |
---|
| 374 | ldrd A_l, A_h, [src, #-32] /* 8 words to go. */ |
---|
| 375 | strd A_l, A_h, [dst, #-32] |
---|
| 376 | ldrd A_l, A_h, [src, #-24] /* 6 words to go. */ |
---|
| 377 | strd A_l, A_h, [dst, #-24] |
---|
| 378 | ldrd A_l, A_h, [src, #-16] /* 4 words to go. */ |
---|
| 379 | strd A_l, A_h, [dst, #-16] |
---|
| 380 | ldrd A_l, A_h, [src, #-8] /* 2 words to go. */ |
---|
| 381 | strd A_l, A_h, [dst, #-8] |
---|
| 382 | |
---|
| 383 | #endif |
---|
| 384 | tst tmp2, #4 |
---|
| 385 | ldrne tmp1, [src], #4 |
---|
| 386 | strne tmp1, [dst], #4 |
---|
| 387 | lsls tmp2, tmp2, #31 /* Count (tmp2) now dead. */ |
---|
| 388 | ldrhcs tmp1, [src], #2 |
---|
| 389 | ldrbne tmp2, [src] |
---|
| 390 | strhcs tmp1, [dst], #2 |
---|
| 391 | strbne tmp2, [dst] |
---|
| 392 | |
---|
| 393 | .Ldone: |
---|
| 394 | ldr tmp2, [sp], #FRAME_SIZE |
---|
| 395 | bx lr |
---|
| 396 | |
---|
| 397 | .Lcpy_body_long: /* Count in tmp2. */ |
---|
| 398 | |
---|
| 399 | /* Long copy. We know that there's at least (prefetch_lines * 64) |
---|
| 400 | bytes to go. */ |
---|
| 401 | #ifdef USE_VFP |
---|
| 402 | /* Don't use PLD. Instead, read some data in advance of the current |
---|
| 403 | copy position into a register. This should act like a PLD |
---|
| 404 | operation but we won't have to repeat the transfer. */ |
---|
| 405 | |
---|
| 406 | vldr d3, [src, #0] |
---|
| 407 | vldr d4, [src, #64] |
---|
| 408 | vldr d5, [src, #128] |
---|
| 409 | vldr d6, [src, #192] |
---|
| 410 | vldr d7, [src, #256] |
---|
| 411 | |
---|
| 412 | vldr d0, [src, #8] |
---|
| 413 | vldr d1, [src, #16] |
---|
| 414 | vldr d2, [src, #24] |
---|
| 415 | add src, src, #32 |
---|
| 416 | |
---|
| 417 | subs tmp2, tmp2, #prefetch_lines * 64 * 2 |
---|
| 418 | blt 2f |
---|
| 419 | 1: |
---|
| 420 | cpy_line_vfp d3, 0 |
---|
| 421 | cpy_line_vfp d4, 64 |
---|
| 422 | cpy_line_vfp d5, 128 |
---|
| 423 | add dst, dst, #3 * 64 |
---|
| 424 | add src, src, #3 * 64 |
---|
| 425 | cpy_line_vfp d6, 0 |
---|
| 426 | cpy_line_vfp d7, 64 |
---|
| 427 | add dst, dst, #2 * 64 |
---|
| 428 | add src, src, #2 * 64 |
---|
| 429 | subs tmp2, tmp2, #prefetch_lines * 64 |
---|
| 430 | bge 1b |
---|
| 431 | |
---|
| 432 | 2: |
---|
| 433 | cpy_tail_vfp d3, 0 |
---|
| 434 | cpy_tail_vfp d4, 64 |
---|
| 435 | cpy_tail_vfp d5, 128 |
---|
| 436 | add src, src, #3 * 64 |
---|
| 437 | add dst, dst, #3 * 64 |
---|
| 438 | cpy_tail_vfp d6, 0 |
---|
| 439 | vstr d7, [dst, #64] |
---|
| 440 | vldr d7, [src, #64] |
---|
| 441 | vstr d0, [dst, #64 + 8] |
---|
| 442 | vldr d0, [src, #64 + 8] |
---|
| 443 | vstr d1, [dst, #64 + 16] |
---|
| 444 | vldr d1, [src, #64 + 16] |
---|
| 445 | vstr d2, [dst, #64 + 24] |
---|
| 446 | vldr d2, [src, #64 + 24] |
---|
| 447 | vstr d7, [dst, #64 + 32] |
---|
| 448 | add src, src, #96 |
---|
| 449 | vstr d0, [dst, #64 + 40] |
---|
| 450 | vstr d1, [dst, #64 + 48] |
---|
| 451 | vstr d2, [dst, #64 + 56] |
---|
| 452 | add dst, dst, #128 |
---|
| 453 | add tmp2, tmp2, #prefetch_lines * 64 |
---|
| 454 | b .Lcpy_body_medium |
---|
| 455 | #else |
---|
| 456 | /* Long copy. Use an SMS style loop to maximize the I/O |
---|
| 457 | bandwidth of the core. We don't have enough spare registers |
---|
| 458 | to synthesise prefetching, so use PLD operations. */ |
---|
| 459 | /* Pre-bias src and dst. */ |
---|
| 460 | sub src, src, #8 |
---|
| 461 | sub dst, dst, #8 |
---|
| 462 | pld [src, #8] |
---|
| 463 | pld [src, #72] |
---|
| 464 | subs tmp2, tmp2, #64 |
---|
| 465 | pld [src, #136] |
---|
| 466 | ldrd A_l, A_h, [src, #8] |
---|
| 467 | strd B_l, B_h, [sp, #8] |
---|
| 468 | ldrd B_l, B_h, [src, #16] |
---|
| 469 | strd C_l, C_h, [sp, #16] |
---|
| 470 | ldrd C_l, C_h, [src, #24] |
---|
| 471 | strd D_l, D_h, [sp, #24] |
---|
| 472 | pld [src, #200] |
---|
| 473 | ldrd D_l, D_h, [src, #32]! |
---|
| 474 | b 1f |
---|
| 475 | .p2align 6 |
---|
| 476 | 2: |
---|
| 477 | pld [src, #232] |
---|
| 478 | strd A_l, A_h, [dst, #40] |
---|
| 479 | ldrd A_l, A_h, [src, #40] |
---|
| 480 | strd B_l, B_h, [dst, #48] |
---|
| 481 | ldrd B_l, B_h, [src, #48] |
---|
| 482 | strd C_l, C_h, [dst, #56] |
---|
| 483 | ldrd C_l, C_h, [src, #56] |
---|
| 484 | strd D_l, D_h, [dst, #64]! |
---|
| 485 | ldrd D_l, D_h, [src, #64]! |
---|
| 486 | subs tmp2, tmp2, #64 |
---|
| 487 | 1: |
---|
| 488 | strd A_l, A_h, [dst, #8] |
---|
| 489 | ldrd A_l, A_h, [src, #8] |
---|
| 490 | strd B_l, B_h, [dst, #16] |
---|
| 491 | ldrd B_l, B_h, [src, #16] |
---|
| 492 | strd C_l, C_h, [dst, #24] |
---|
| 493 | ldrd C_l, C_h, [src, #24] |
---|
| 494 | strd D_l, D_h, [dst, #32] |
---|
| 495 | ldrd D_l, D_h, [src, #32] |
---|
| 496 | bcs 2b |
---|
| 497 | /* Save the remaining bytes and restore the callee-saved regs. */ |
---|
| 498 | strd A_l, A_h, [dst, #40] |
---|
| 499 | add src, src, #40 |
---|
| 500 | strd B_l, B_h, [dst, #48] |
---|
| 501 | ldrd B_l, B_h, [sp, #8] |
---|
| 502 | strd C_l, C_h, [dst, #56] |
---|
| 503 | ldrd C_l, C_h, [sp, #16] |
---|
| 504 | strd D_l, D_h, [dst, #64] |
---|
| 505 | ldrd D_l, D_h, [sp, #24] |
---|
| 506 | add dst, dst, #72 |
---|
| 507 | tst tmp2, #0x3f |
---|
| 508 | bne .Ltail63aligned |
---|
| 509 | ldr tmp2, [sp], #FRAME_SIZE |
---|
| 510 | bx lr |
---|
| 511 | #endif |
---|
| 512 | |
---|
| 513 | .Lcpy_notaligned: |
---|
| 514 | pld [src] |
---|
| 515 | pld [src, #64] |
---|
| 516 | /* There's at least 64 bytes to copy, but there is no mutual |
---|
| 517 | alignment. */ |
---|
| 518 | /* Bring DST to 64-bit alignment. */ |
---|
| 519 | lsls tmp2, dst, #29 |
---|
| 520 | pld [src, #(2 * 64)] |
---|
| 521 | beq 1f |
---|
| 522 | rsbs tmp2, tmp2, #0 |
---|
| 523 | sub count, count, tmp2, lsr #29 |
---|
| 524 | ldrmi tmp1, [src], #4 |
---|
| 525 | strmi tmp1, [dst], #4 |
---|
| 526 | lsls tmp2, tmp2, #2 |
---|
| 527 | ldrbne tmp1, [src], #1 |
---|
| 528 | ldrhcs tmp2, [src], #2 |
---|
| 529 | strbne tmp1, [dst], #1 |
---|
| 530 | strhcs tmp2, [dst], #2 |
---|
| 531 | 1: |
---|
| 532 | pld [src, #(3 * 64)] |
---|
| 533 | subs count, count, #64 |
---|
| 534 | ldrmi tmp2, [sp], #FRAME_SIZE |
---|
| 535 | bmi .Ltail63unaligned |
---|
| 536 | pld [src, #(4 * 64)] |
---|
| 537 | |
---|
| 538 | #ifdef USE_NEON |
---|
| 539 | vld1.8 {d0-d3}, [src]! |
---|
| 540 | vld1.8 {d4-d7}, [src]! |
---|
| 541 | subs count, count, #64 |
---|
| 542 | bmi 2f |
---|
| 543 | 1: |
---|
| 544 | pld [src, #(4 * 64)] |
---|
| 545 | vst1.8 {d0-d3}, [ALIGN (dst, 64)]! |
---|
| 546 | vld1.8 {d0-d3}, [src]! |
---|
| 547 | vst1.8 {d4-d7}, [ALIGN (dst, 64)]! |
---|
| 548 | vld1.8 {d4-d7}, [src]! |
---|
| 549 | subs count, count, #64 |
---|
| 550 | bpl 1b |
---|
| 551 | 2: |
---|
| 552 | vst1.8 {d0-d3}, [ALIGN (dst, 64)]! |
---|
| 553 | vst1.8 {d4-d7}, [ALIGN (dst, 64)]! |
---|
| 554 | ands count, count, #0x3f |
---|
| 555 | #else |
---|
| 556 | /* Use an SMS style loop to maximize the I/O bandwidth. */ |
---|
| 557 | sub src, src, #4 |
---|
| 558 | sub dst, dst, #8 |
---|
| 559 | subs tmp2, count, #64 /* Use tmp2 for count. */ |
---|
| 560 | ldr A_l, [src, #4] |
---|
| 561 | ldr A_h, [src, #8] |
---|
| 562 | strd B_l, B_h, [sp, #8] |
---|
| 563 | ldr B_l, [src, #12] |
---|
| 564 | ldr B_h, [src, #16] |
---|
| 565 | strd C_l, C_h, [sp, #16] |
---|
| 566 | ldr C_l, [src, #20] |
---|
| 567 | ldr C_h, [src, #24] |
---|
| 568 | strd D_l, D_h, [sp, #24] |
---|
| 569 | ldr D_l, [src, #28] |
---|
| 570 | ldr D_h, [src, #32]! |
---|
| 571 | b 1f |
---|
| 572 | .p2align 6 |
---|
| 573 | 2: |
---|
| 574 | pld [src, #(5 * 64) - (32 - 4)] |
---|
| 575 | strd A_l, A_h, [dst, #40] |
---|
| 576 | ldr A_l, [src, #36] |
---|
| 577 | ldr A_h, [src, #40] |
---|
| 578 | strd B_l, B_h, [dst, #48] |
---|
| 579 | ldr B_l, [src, #44] |
---|
| 580 | ldr B_h, [src, #48] |
---|
| 581 | strd C_l, C_h, [dst, #56] |
---|
| 582 | ldr C_l, [src, #52] |
---|
| 583 | ldr C_h, [src, #56] |
---|
| 584 | strd D_l, D_h, [dst, #64]! |
---|
| 585 | ldr D_l, [src, #60] |
---|
| 586 | ldr D_h, [src, #64]! |
---|
| 587 | subs tmp2, tmp2, #64 |
---|
| 588 | 1: |
---|
| 589 | strd A_l, A_h, [dst, #8] |
---|
| 590 | ldr A_l, [src, #4] |
---|
| 591 | ldr A_h, [src, #8] |
---|
| 592 | strd B_l, B_h, [dst, #16] |
---|
| 593 | ldr B_l, [src, #12] |
---|
| 594 | ldr B_h, [src, #16] |
---|
| 595 | strd C_l, C_h, [dst, #24] |
---|
| 596 | ldr C_l, [src, #20] |
---|
| 597 | ldr C_h, [src, #24] |
---|
| 598 | strd D_l, D_h, [dst, #32] |
---|
| 599 | ldr D_l, [src, #28] |
---|
| 600 | ldr D_h, [src, #32] |
---|
| 601 | bcs 2b |
---|
| 602 | |
---|
| 603 | /* Save the remaining bytes and restore the callee-saved regs. */ |
---|
| 604 | strd A_l, A_h, [dst, #40] |
---|
| 605 | add src, src, #36 |
---|
| 606 | strd B_l, B_h, [dst, #48] |
---|
| 607 | ldrd B_l, B_h, [sp, #8] |
---|
| 608 | strd C_l, C_h, [dst, #56] |
---|
| 609 | ldrd C_l, C_h, [sp, #16] |
---|
| 610 | strd D_l, D_h, [dst, #64] |
---|
| 611 | ldrd D_l, D_h, [sp, #24] |
---|
| 612 | add dst, dst, #72 |
---|
| 613 | ands count, tmp2, #0x3f |
---|
| 614 | #endif |
---|
| 615 | ldr tmp2, [sp], #FRAME_SIZE |
---|
| 616 | bne .Ltail63unaligned |
---|
| 617 | bx lr |
---|
| 618 | |
---|
| 619 | .size memcpy, . - memcpy |
---|