/* Copyright 2003 SuperH Ltd. */

#include "asm.h"

#ifdef __SH5__
#if __SHMEDIA__

#ifdef __LITTLE_ENDIAN__
#define ZPAD_MASK(src, dst) addi src, -1, dst
#else
#define ZPAD_MASK(src, dst) \
        byterev src, dst; addi dst, -1, dst; byterev dst, dst
#endif
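
/* A note on ZPAD_MASK, with a little-endian C sketch (illustrative only;
   zero_pad and has_zero are hypothetical names, and has_zero stands for the
   per-byte compare that mcmpeq.b against r63 performs): given a word in which
   every byte that is zero in the data is marked 0xff, subtracting 1 yields a
   mask whose AND with the data keeps the bytes before the first zero byte and
   clears everything from that byte onward.  The byterev pair in the
   big-endian variant applies the same idea to reversed byte order.

        #include <stdint.h>

        static uint64_t zero_pad (uint64_t data, uint64_t has_zero)
        {
          uint64_t mask = has_zero - 1; // 0xff in each byte below the first zero
          return data & mask;           // zero-fill from the first zero byte on
        }
*/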


/* We assume that the destination is not in the first 16 bytes of memory.
   A typical linker script will put the text section first, and as
   this code is longer than 16 bytes, you would have to go out of your way
   to put data there.  */
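
/* For orientation, the semantics implemented below, as a plain C sketch
   (illustrative only, not part of the build): copy at most n bytes of src
   and zero-fill the remainder of the n-byte destination.

        #include <stddef.h>

        char *strncpy (char *dst, const char *src, size_t n)
        {
          size_t i;

          for (i = 0; i < n && src[i] != '\0'; i++)
            dst[i] = src[i];
          for (; i < n; i++)
            dst[i] = '\0';
          return dst;
        }
*/
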
ENTRY(strncpy)
        pt L_small, tr2
        ldlo.q r3, 0, r0
        shlli r3, 3, r19
        mcmpeq.b r0, r63, r1
        SHHI r1, r19, r7
        add r2, r4, r20
        addi r20, -8, r5
        /* If the size is greater than 8, we know we can read beyond the first
           (possibly partial) quadword, and write out a full first and last
           (possibly unaligned and/or overlapping) quadword.  */
        bge/u r2, r5, tr2 // L_small
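
        /* The full first/last quadword stores mentioned in the comment above
           rely on a common trick, sketched here in C (illustrative only;
           memcpy stands in for the unaligned ldlo.q/ldhi.q and stlo.q/sthi.q
           pairs): when the size is greater than 8, a full 8-byte head and a
           full 8-byte tail can be stored even if they are unaligned and
           overlap each other.

                #include <stdint.h>
                #include <string.h>

                static void store_head_and_tail (char *dst, uint64_t head,
                                                 uint64_t tail, size_t n)
                {
                  memcpy (dst, &head, 8);          // first quad, maybe unaligned
                  memcpy (dst + n - 8, &tail, 8);  // last quad, may overlap head
                }
        */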
        pt L_found0, tr0
        addi r2, 8, r22
        bnei/u r7, 0, tr0 // L_found0
        ori r3, -8, r38
        pt L_end_early, tr1
        sub r2, r38, r22
        stlo.q r2, 0, r0
        sthi.q r2, 7, r0
        sub r3, r2, r6
        ldx.q r22, r6, r0
        /* Before each iteration, check that we can store in full the next
           quad we are about to fetch.  */
        addi r5, -8, r36
        bgtu/u r22, r36, tr1 // L_end_early
        pt L_scan0, tr1
L_scan0:
        addi r22, 8, r22
        mcmpeq.b r0, r63, r1
        stlo.q r22, -8, r0
        bnei/u r1, 0, tr0 // L_found0
        sthi.q r22, -1, r0
        ldx.q r22, r6, r0
        bgeu/l r36, r22, tr1 // L_scan0
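
        /* The scan loop above, sketched in C (illustrative only; the names
           are hypothetical, memcpy stands in for the unaligned load/store
           pairs, and has_zero_byte for mcmpeq.b against r63):

                #include <stddef.h>
                #include <stdint.h>
                #include <string.h>

                // Nonzero iff some byte of q is zero (classic bit trick).
                static int has_zero_byte (uint64_t q)
                {
                  return ((q - 0x0101010101010101ULL) & ~q
                          & 0x8080808080808080ULL) != 0;
                }

                // Copy 8 bytes at a time while no terminator has been seen
                // and a full quad still fits in the destination buffer.
                static size_t copy_quads (char *dst, const char *src, size_t n)
                {
                  size_t i = 0;
                  uint64_t q;

                  while (i + 8 <= n)
                    {
                      memcpy (&q, src + i, 8);
                      if (has_zero_byte (q))
                        break;          // caller stores this quad, zero-padded
                      memcpy (dst + i, &q, 8);
                      i += 8;
                    }
                  return i;
                }
        */
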
L_end:
        // At the end we might re-read a few bytes when we fetch the last quad.
        // The quad was already loaded before the branch here, so even after a
        // branch mispredict the load is ready now.
        mcmpeq.b r0, r63, r1
        addi r22, 8, r22
        bnei/u r1, 0, tr0 // L_found0
        add r3, r4, r7
        ldlo.q r7, -8, r1
        ldhi.q r7, -1, r7
        ptabs r18, tr0
        stlo.q r22, -8, r0
        or r1, r7, r1
        mcmpeq.b r1, r63, r7
        sthi.q r22, -1, r0
        ZPAD_MASK (r7, r7)
        and r1, r7, r1 // mask out non-zero bytes after first zero byte
        stlo.q r20, -8, r1
        sthi.q r20, -1, r1
        blink tr0, r63

L_end_early:
        /* Check if we can store the current quad in full.  */
        pt L_end, tr1
        add r3, r4, r7
        bgtu/u r5, r22, tr1 // L_end // Not really unlikely, but the gap is short.
        /* If not, that means we can just proceed to process the last quad.
           Two pipeline stalls are unavoidable, as we don't have enough ILP.  */
        ldlo.q r7, -8, r1
        ldhi.q r7, -1, r7
        ptabs r18, tr0
        or r1, r7, r1
        mcmpeq.b r1, r63, r7
        ZPAD_MASK (r7, r7)
        and r1, r7, r1 // mask out non-zero bytes after first zero byte
        stlo.q r20, -8, r1
        sthi.q r20, -1, r1
        blink tr0, r63

L_found0:
        // r0: string to store, not yet zero-padding normalized.
        // r1: result of mcmpeq.b r0, r63, r1.
        // r22: store address plus 8, i.e. the address where zero padding
        //      beyond the string in r0 goes.
        // r20: store end address.
        // r5: store end address minus 8.
        pt L_write0_multiquad, tr0
        ZPAD_MASK (r1, r1)
        and r0, r1, r0 // mask out non-zero bytes after first zero byte
        stlo.q r22, -8, r0
        sthi.q r22, -1, r0
        andi r22, -8, r1 // Check if the zeros to write fit in one quad word.
        bgtu/l r5, r1, tr0 // L_write0_multiquad
        ptabs r18, tr1
        sub r20, r22, r1
        shlli r1, 2, r1 // Do the shift in two steps so that the 64 bit case
        SHLO r0, r1, r0 // is handled correctly.
        SHLO r0, r1, r0
        sthi.q r20, -1, r0
        blink tr1, r63
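
        /* A minimal C sketch of the two-step shift above (illustrative only;
           the assumption here is that a single shift by 64 bits would leave
           the word unchanged because the hardware takes shift counts modulo
           64, while two shifts of 4 * n bits each stay well defined for the
           full range 0 <= n <= 8; SHLO abstracts the actual shift direction):

                #include <stdint.h>

                static uint64_t shift_by_bytes (uint64_t x, unsigned n)
                {
                  unsigned s = 4 * n;
                  return (x << s) << s; // == x << (8 * n), valid even for n == 8
                }
        */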

L_write0_multiquad:
        pt L_write0_loop, tr0
        ptabs r18, tr1
        stlo.q r22, 0, r63
        sthi.q r20, -1, r63
        addi r1, 8, r1
        bgeu/l r5, r1, tr0 // L_write0_loop
        blink tr1, r63

L_write0_loop:
        st.q r1, 0, r63
        addi r1, 8, r1
        bgeu/l r5, r1, tr0 // L_write0_loop
        blink tr1, r63

L_small:
        // r0: string to store, not yet zero-padding normalized.
        // r1: result of mcmpeq.b r0, r63, r1.
        // r7: nonzero indicates a relevant zero byte was found in r0.
        // r2: store address.
        // r3: read address.
        // r4: size, at most 8.
        // r20: store end address.
        // r5: store end address minus 8.
        pt L_nohi, tr0
        pt L_small_storelong, tr1
        ptabs r18, tr2
        sub r63, r4, r23
        bnei/u r7, 0, tr0 // L_nohi
        ori r3, -8, r7
        bge/l r23, r7, tr0 // L_nohi
        ldhi.q r3, 7, r1
        or r0, r1, r0
        mcmpeq.b r0, r63, r1
L_nohi:
        ZPAD_MASK (r1, r1)
        and r0, r1, r0
        movi 4, r19
        bge/u r4, r19, tr1 // L_small_storelong

        pt L_small_end, tr0
#ifndef __LITTLE_ENDIAN__
        byterev r0, r0
#endif
        beqi/u r4, 0, tr0 // L_small_end
        st.b r2, 0, r0
        beqi/u r4, 1, tr0 // L_small_end
        shlri r0, 8, r0
        st.b r2, 1, r0
        beqi/u r4, 2, tr0 // L_small_end
        shlri r0, 8, r0
        st.b r2, 2, r0
L_small_end:
        blink tr2, r63

L_small_storelong:
        shlli r23, 3, r7
        SHHI r0, r7, r1
#ifdef __LITTLE_ENDIAN__
        shlri r1, 32, r1
#else
        shlri r0, 32, r0
#endif
        stlo.l r2, 0, r0
        sthi.l r2, 3, r0
        stlo.l r20, -4, r1
        sthi.l r20, -1, r1
        blink tr2, r63

#else /* SHcompact */

/* This code is optimized for size.  Instruction selection is SH5 specific.
   SH4 should use a different version.  */
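
/* The loop below, as a C sketch (illustrative only): once the NUL has been
   copied, the load is skipped and the zero byte left in r1 is stored again,
   which produces the required zero padding without a second loop.

        #include <stddef.h>

        char *strncpy (char *dst, const char *src, size_t n)
        {
          char c = 1;                 // anything nonzero before the first load
          size_t i;

          for (i = 0; i < n; i++)
            {
              if (c != '\0')
                c = *src++;           // stop reading once the NUL was copied
              dst[i] = c;             // re-storing the NUL is the padding
            }
          return dst;
        }
*/
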
ENTRY(strncpy)
        mov #0, r6
        cmp/eq r4, r6
        bt return               /* return immediately if n == 0 */
        mov r2, r5
        add #-1, r5             /* r5 = dst - 1 */
        add r5, r4              /* r4 = address of the last byte to store */
loop:
        bt/s found0             /* T set: previous byte was the NUL, skip load */
        add #1, r5              /* delay slot: advance the store pointer */
        mov.b @r3+, r1
found0:
        cmp/eq r5, r4
        mov.b r1, @r5           /* re-stores the NUL when zero padding */
        bf/s loop
        cmp/eq r1, r6           /* delay slot: T = (stored byte was the NUL) */
return:
        rts
        nop

#endif /* SHcompact */
#endif /* __SH5__ */