1 | /* Copyright 2003 SuperH Ltd. */ |
---|
2 | |
---|
3 | #include "asm.h" |
---|
4 | |
---|
5 | #ifdef __SH5__ |
---|
6 | #if __SHMEDIA__ |
---|
7 | |
---|
/* ZPAD_MASK (src, dst): at every use in this file src is an mcmpeq.b
   result, and dst receives the value that the caller then ANDs with the
   loaded data to "mask out non-zero bytes after first zero byte".
   Little endian: dst = src - 1.
   Big endian: the same subtraction, bracketed by byterev so that it is
   carried out on the byte-reversed value and reversed back afterwards.  */
8 | #ifdef __LITTLE_ENDIAN__ |
---|
9 | #define ZPAD_MASK(src, dst) addi src, -1, dst |
---|
10 | #else |
---|
11 | #define ZPAD_MASK(src, dst) \ |
---|
12 | byterev src, dst; addi dst, -1, dst; byterev dst, dst |
---|
13 | #endif |
---|
14 | |
---|
15 | |
---|
16 | /* We assume that the destination is not in the first 16 bytes of memory. |
---|
17 | A typical linker script will put the text section first, and as |
---|
18 | this code is longer than 16 bytes, you have to go out of your way |
---|
19 | to put data there. */ |
---|
/* strncpy, SHmedia version.
   Register use, as established by the code below:
     r2  - store (destination) address; never written by this routine.
     r3  - read (source) address.
     r4  - size.
     r20 - store end address (r2 + r4).
     r5  - store end address minus 8.
     r22 - running store address in the quadword paths.
     r6  - r3 - r2, so that "ldx.q r22, r6, r0" reads the source bytes
           that belong at store address r22.
     r0  - quadword of string data currently being processed.
     r1  - mcmpeq.b result for r0 (used to find zero bytes).
     r18 - address every path finally branches to via ptabs + blink,
           i.e. the return address.
   Sizes of 8 or less (r2 >= r5) are handled at L_small.  Larger sizes are
   copied a quadword at a time; as soon as a quadword containing a zero
   byte is seen, control goes to L_found0, which stores that quadword with
   everything after the zero byte masked out and zero fills the rest of
   the destination.
   SHHI and SHLO are not defined in this file; presumably they are
   endian-dependent shift macros from asm.h -- confirm there.  */
20 | ENTRY(strncpy) |
---|
21 | pt L_small, tr2 |
---|
/* Load the first (possibly partial) source quadword and test it for zero
   bytes.  r7 ends up non-zero when a relevant zero byte was found.  */
22 | ldlo.q r3, 0, r0 |
---|
23 | shlli r3, 3, r19 |
---|
24 | mcmpeq.b r0, r63, r1 |
---|
25 | SHHI r1, r19, r7 |
---|
/* r20 = store end address, r5 = store end address minus 8.  */
26 | add r2, r4, r20 |
---|
27 | addi r20, -8, r5 |
---|
28 | /* If the size is greater than 8, we know we can read beyond the first |
---|
29 | (possibly partial) quadword, and write out a full first and last |
---|
30 | (possibly unaligned and/or overlapping) quadword. */ |
---|
31 | bge/u r2, r5, tr2 // L_small |
---|
32 | pt L_found0, tr0 |
---|
/* r22 = store address plus 8, which is what L_found0 expects.  */
33 | addi r2, 8, r22 |
---|
34 | bnei/u r7, 0, tr0 // L_found0 |
---|
/* r38 = -(8 - (r3 & 7)), so r22 = r2 + (8 - (r3 & 7)): the store address
   whose source counterpart (r22 + r6) is the next 8-byte aligned source
   address.  The first quadword is written out in full at r2 meanwhile.  */
35 | ori r3, -8, r38 |
---|
36 | pt L_end_early, tr1 |
---|
37 | sub r2, r38, r22 |
---|
38 | stlo.q r2, 0, r0 |
---|
39 | sthi.q r2, 7, r0 |
---|
40 | sub r3, r2, r6 |
---|
41 | ldx.q r22, r6, r0 |
---|
42 | /* Before each iteration, check that we can store in full the next quad we |
---|
43 | are about to fetch. */ |
---|
/* r36 = r5 - 8 = store end address minus 16; it is the loop bound.  */
44 | addi r5, -8, r36 |
---|
45 | bgtu/u r22, r36, tr1 // L_end_early |
---|
46 | pt L_scan0, tr1 |
---|
/* Main loop: advance r22, store the quadword held in r0 at r22 - 8, fetch
   the next source quadword, and leave for L_found0 as soon as r0 is found
   to contain a zero byte.  */
47 | L_scan0: |
---|
48 | addi r22, 8, r22 |
---|
49 | mcmpeq.b r0, r63, r1 |
---|
50 | stlo.q r22, -8, r0 |
---|
51 | bnei/u r1, 0, tr0 // L_found0 |
---|
52 | sthi.q r22, -1, r0 |
---|
53 | ldx.q r22, r6, r0 |
---|
54 | bgeu/l r36, r22, tr1 // L_scan0 |
---|
55 | L_end: |
---|
56 | // At end; we might re-read a few bytes when we fetch the last quad. |
---|
57 | // branch mispredict, so load is ready now. |
---|
58 | mcmpeq.b r0, r63, r1 |
---|
59 | addi r22, 8, r22 |
---|
60 | bnei/u r1, 0, tr0 // L_found0 |
---|
/* No zero byte in r0: store it at r22 - 8, and build the last quadword
   from the final 8 source bytes (r7 = r3 + r4 is the source end address).
   That quadword is masked after its first zero byte and stored over the
   last 8 destination bytes (r20 - 8 .. r20 - 1).  */
61 | add r3, r4, r7 |
---|
62 | ldlo.q r7, -8, r1 |
---|
63 | ldhi.q r7, -1, r7 |
---|
64 | ptabs r18, tr0 |
---|
65 | stlo.q r22, -8, r0 |
---|
66 | or r1, r7, r1 |
---|
67 | mcmpeq.b r1, r63, r7 |
---|
68 | sthi.q r22, -1, r0 |
---|
69 | ZPAD_MASK (r7, r7) |
---|
70 | and r1, r7, r1 // mask out non-zero bytes after first zero byte |
---|
71 | stlo.q r20, -8, r1 |
---|
72 | sthi.q r20, -1, r1 |
---|
73 | blink tr0, r63 |
---|
74 | |
---|
75 | L_end_early: |
---|
76 | /* Check if we can store the current quad in full. */ |
---|
77 | pt L_end, tr1 |
---|
78 | add r3, r4, r7 |
---|
79 | bgtu/u r5, r22, tr1 // L_end // Not really unlikely, but gap is short. |
---|
80 | /* If not, that means we can just proceed to process the last quad. |
---|
81 | Two pipeline stalls are unavoidable, as we don't have enough ILP. */ |
---|
/* Same last-quadword sequence as at L_end, without the store of r0.  */
82 | ldlo.q r7, -8, r1 |
---|
83 | ldhi.q r7, -1, r7 |
---|
84 | ptabs r18, tr0 |
---|
85 | or r1, r7, r1 |
---|
86 | mcmpeq.b r1, r63, r7 |
---|
87 | ZPAD_MASK (r7, r7) |
---|
88 | and r1, r7, r1 // mask out non-zero bytes after first zero byte |
---|
89 | stlo.q r20, -8, r1 |
---|
90 | sthi.q r20, -1, r1 |
---|
91 | blink tr0, r63 |
---|
92 | |
---|
93 | L_found0: |
---|
94 | // r0: string to store, not yet zero-padding normalized. |
---|
95 | // r1: result of mcmpeq.b r0, r63, r1. |
---|
96 | // r22: store address plus 8. I.e. address where zero padding beyond the |
---|
97 | // string in r0 goes. |
---|
98 | // r20: store end address. |
---|
99 | // r5: store end address minus 8. |
---|
100 | pt L_write0_multiquad, tr0 |
---|
101 | ZPAD_MASK (r1, r1) |
---|
102 | and r0, r1, r0 // mask out non-zero bytes after first zero byte |
---|
103 | stlo.q r22, -8, r0 |
---|
104 | sthi.q r22, -1, r0 |
---|
105 | andi r22, -8, r1 // Check if zeros to write fit in one quad word. |
---|
106 | bgtu/l r5, r1, tr0 // L_write0_multiquad |
---|
/* Single quadword case: r1 = 4 * (r20 - r22), so the two SHLO steps
   together shift r0 by 8 bits for every byte that is still to be written;
   the result is stored so that it ends at r20 - 1.  */
107 | ptabs r18, tr1 |
---|
108 | sub r20, r22, r1 |
---|
109 | shlli r1, 2, r1 // Do shift in two steps so that 64 bit case is |
---|
110 | SHLO r0, r1, r0 // handled correctly. |
---|
111 | SHLO r0, r1, r0 |
---|
112 | sthi.q r20, -1, r0 |
---|
113 | blink tr1, r63 |
---|
114 | |
---|
/* Zero fill with r63: one partial store starting at r22 (head), one
   partial store ending at r20 - 1 (tail), and then, while r5 >= r1, full
   quadword stores at r1, which steps by 8 from (r22 & -8) + 8.  */
115 | L_write0_multiquad: |
---|
116 | pt L_write0_loop, tr0 |
---|
117 | ptabs r18, tr1 |
---|
118 | stlo.q r22, 0, r63 |
---|
119 | sthi.q r20, -1, r63 |
---|
120 | addi r1, 8, r1 |
---|
121 | bgeu/l r5, r1, tr0 // L_write0_loop |
---|
122 | blink tr1, r63 |
---|
123 | |
---|
124 | L_write0_loop: |
---|
125 | st.q r1, 0 ,r63 |
---|
126 | addi r1, 8, r1 |
---|
127 | bgeu/l r5, r1, tr0 // L_write0_loop |
---|
128 | blink tr1, r63 |
---|
129 | |
---|
130 | L_small: |
---|
131 | // r0: string to store, not yet zero-padding normalized. |
---|
132 | // r1: result of mcmpeq.b r0, r63, r1. |
---|
133 | // r7: nonzero indicates relevant zero found in r0. |
---|
134 | // r2: store address. |
---|
135 | // r3: read address. |
---|
136 | // r4: size, max 8 |
---|
137 | // r20: store end address. |
---|
138 | // r5: store end address minus 8. |
---|
139 | pt L_nohi, tr0 |
---|
140 | pt L_small_storelong, tr1 |
---|
141 | ptabs r18, tr2 |
---|
/* r23 = -r4 (r63 - r4).  */
142 | sub r63, r4, r23 |
---|
143 | bnei/u r7, 0, tr0 // L_nohi |
---|
/* r7 = -(8 - (r3 & 7)).  If r23 >= r7, i.e. r4 <= 8 - (r3 & 7), every
   byte wanted is already in r0 and the ldhi.q is skipped; otherwise the
   high part is loaded, merged into r0, and r1 is recomputed.  */
144 | ori r3, -8, r7 |
---|
145 | bge/l r23, r7, tr0 // L_nohi |
---|
146 | ldhi.q r3, 7, r1 |
---|
147 | or r0, r1, r0 |
---|
148 | mcmpeq.b r0, r63, r1 |
---|
149 | L_nohi: |
---|
150 | ZPAD_MASK (r1, r1) |
---|
151 | and r0, r1, r0 |
---|
/* Sizes of at least 4 are stored as two longwords at L_small_storelong;
   sizes 0 to 3 are stored byte by byte below.  */
152 | movi 4, r19 |
---|
153 | bge/u r4, r19, tr1 // L_small_storelong |
---|
154 | |
---|
155 | pt L_small_end, tr0 |
---|
156 | #ifndef __LITTLE_ENDIAN__ |
---|
157 | byterev r0, r0 |
---|
158 | #endif |
---|
/* Store up to three bytes from the low end of r0, shifting r0 right by
   one byte between stores and stopping once r4 bytes have been written.  */
159 | beqi/u r4, 0, tr0 // L_small_end |
---|
160 | st.b r2, 0, r0 |
---|
161 | beqi/u r4, 1, tr0 // L_small_end |
---|
162 | shlri r0, 8, r0 |
---|
163 | st.b r2, 1, r0 |
---|
164 | beqi/u r4, 2, tr0 // L_small_end |
---|
165 | shlri r0, 8, r0 |
---|
166 | st.b r2, 2, r0 |
---|
167 | L_small_end: |
---|
168 | blink tr2, r63 |
---|
169 | |
---|
/* Size 4 to 8: r0 is stored at r2 .. r2 + 3 and r1 at r20 - 4 .. r20 - 1,
   so the two longword stores may overlap.  r7 = 8 * r23 = -8 * r4 is the
   shift count used to derive r1 from r0.  */
170 | L_small_storelong: |
---|
171 | shlli r23, 3, r7 |
---|
172 | SHHI r0, r7, r1 |
---|
173 | #ifdef __LITTLE_ENDIAN__ |
---|
174 | shlri r1, 32, r1 |
---|
175 | #else |
---|
176 | shlri r0, 32, r0 |
---|
177 | #endif |
---|
178 | stlo.l r2, 0, r0 |
---|
179 | sthi.l r2, 3, r0 |
---|
180 | stlo.l r20, -4, r1 |
---|
181 | sthi.l r20, -1, r1 |
---|
182 | blink tr2, r63 |
---|
183 | |
---|
184 | #else /* SHcompact */ |
---|
185 | |
---|
186 | /* This code is optimized for size. Instruction selection is SH5 specific. |
---|
187 | SH4 should use a different version. */ |
---|
/* strncpy, SHcompact version (byte-at-a-time loop).
   Register use, as established by the code below:
     r2 - destination; only read, never written by this routine.
     r3 - source pointer, post-incremented by each load.
     r4 - size on entry; then the address of the last byte to store.
     r5 - store pointer.
     r6 - constant zero.
     r1 - byte being copied.
   Once a zero byte has been copied the load is skipped on every later
   iteration, so r1 stays zero and the rest of the destination is zero
   filled.  */
188 | ENTRY(strncpy) |
---|
/* r6 = 0; return at once when the size in r4 is zero.  */
189 | mov #0, r6 |
---|
190 | cmp/eq r4, r6 |
---|
191 | bt return |
---|
/* r5 = r2 - 1 (it is incremented before each store);
   r4 = r2 - 1 + size, the address of the last byte to store.  */
192 | mov r2, r5 |
---|
193 | add #-1, r5 |
---|
194 | add r5, r4 |
---|
195 | loop: |
---|
/* T is set here only if the previously copied byte was zero (first entry:
   T is clear because "bt return" fell through).  In that case the load is
   skipped and r1 keeps its zero value.  The add sits in the delay slot of
   bt/s, so r5 advances on both paths.  */
196 | bt/s found0 |
---|
197 | add #1, r5 |
---|
198 | mov.b @r3+, r1 |
---|
199 | found0: |
---|
/* T = (r5 is the last byte to store).  Store the byte and loop while T is
   clear; the cmp/eq after bf/s sets T = (r1 == 0) for the test at the top
   of the loop.  */
200 | cmp/eq r5,r4 |
---|
201 | mov.b r1, @r5 |
---|
202 | bf/s loop |
---|
203 | cmp/eq r1, r6 |
---|
204 | return: |
---|
205 | rts |
---|
206 | nop |
---|
207 | |
---|
208 | #endif /* SHcompact */ |
---|
209 | #endif /* __SH5__ */ |
---|