/* Copyright 2003 SuperH Ltd. */

#include "asm.h"

#ifdef __SH5__
#if __SHMEDIA__

#ifdef __LITTLE_ENDIAN__
#define ZPAD_MASK(src, dst) addi src, -1, dst
#else
#define ZPAD_MASK(src, dst) \
        byterev src, dst; addi dst, -1, dst; byterev dst, dst
#endif
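
/* A note on ZPAD_MASK, with a little-endian C sketch (illustrative only;
   zero_pad and has_zero are hypothetical names, and has_zero stands for the
   per-byte compare that mcmpeq.b against r63 performs): given a word in which
   every byte that is zero in the data is marked 0xff, subtracting 1 yields a
   mask whose AND with the data keeps the bytes before the first zero byte and
   clears everything from that byte onward.  The byterev pair in the
   big-endian variant applies the same idea to reversed byte order.

        #include <stdint.h>

        static uint64_t zero_pad (uint64_t data, uint64_t has_zero)
        {
          uint64_t mask = has_zero - 1; // 0xff in each byte below the first zero
          return data & mask;           // zero-fill from the first zero byte on
        }
*/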


/* We assume that the destination is not in the first 16 bytes of memory.
   A typical linker script will put the text section first, and as
   this code is longer than 16 bytes, you would have to go out of your way
   to put data there.  */
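
/* For orientation, the semantics implemented below, as a plain C sketch
   (illustrative only, not part of the build): copy at most n bytes of src
   and zero-fill the remainder of the n-byte destination.

        #include <stddef.h>

        char *strncpy (char *dst, const char *src, size_t n)
        {
          size_t i;

          for (i = 0; i < n && src[i] != '\0'; i++)
            dst[i] = src[i];
          for (; i < n; i++)
            dst[i] = '\0';
          return dst;
        }
*/
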
ENTRY(strncpy)
        pt L_small, tr2
        ldlo.q r3, 0, r0
        shlli r3, 3, r19
        mcmpeq.b r0, r63, r1
        SHHI r1, r19, r7
        add r2, r4, r20
        addi r20, -8, r5
        /* If the size is greater than 8, we know we can read beyond the first
           (possibly partial) quadword, and write out a full first and last
           (possibly unaligned and/or overlapping) quadword.  */
        bge/u r2, r5, tr2 // L_small
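
        /* The full first/last quadword stores mentioned in the comment above
           rely on a common trick, sketched here in C (illustrative only;
           memcpy stands in for the unaligned ldlo.q/ldhi.q and stlo.q/sthi.q
           pairs): when the size is greater than 8, a full 8-byte head and a
           full 8-byte tail can be stored even if they are unaligned and
           overlap each other.

                #include <stdint.h>
                #include <string.h>

                static void store_head_and_tail (char *dst, uint64_t head,
                                                 uint64_t tail, size_t n)
                {
                  memcpy (dst, &head, 8);          // first quad, maybe unaligned
                  memcpy (dst + n - 8, &tail, 8);  // last quad, may overlap head
                }
        */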
        pt L_found0, tr0
        addi r2, 8, r22
        bnei/u r7, 0, tr0 // L_found0
        ori r3, -8, r38
        pt L_end_early, tr1
        sub r2, r38, r22
        stlo.q r2, 0, r0
        sthi.q r2, 7, r0
        sub r3, r2, r6
        ldx.q r22, r6, r0
        /* Before each iteration, check that we can store in full the next
           quad we are about to fetch.  */
        addi r5, -8, r36
        bgtu/u r22, r36, tr1 // L_end_early
        pt L_scan0, tr1
L_scan0:
        addi r22, 8, r22
        mcmpeq.b r0, r63, r1
        stlo.q r22, -8, r0
        bnei/u r1, 0, tr0 // L_found0
        sthi.q r22, -1, r0
        ldx.q r22, r6, r0
        bgeu/l r36, r22, tr1 // L_scan0
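
        /* The scan loop above, sketched in C (illustrative only; the names
           are hypothetical, memcpy stands in for the unaligned load/store
           pairs, and has_zero_byte for mcmpeq.b against r63):

                #include <stddef.h>
                #include <stdint.h>
                #include <string.h>

                // Nonzero iff some byte of q is zero (classic bit trick).
                static int has_zero_byte (uint64_t q)
                {
                  return ((q - 0x0101010101010101ULL) & ~q
                          & 0x8080808080808080ULL) != 0;
                }

                // Copy 8 bytes at a time while no terminator has been seen
                // and a full quad still fits in the destination buffer.
                static size_t copy_quads (char *dst, const char *src, size_t n)
                {
                  size_t i = 0;
                  uint64_t q;

                  while (i + 8 <= n)
                    {
                      memcpy (&q, src + i, 8);
                      if (has_zero_byte (q))
                        break;          // caller stores this quad, zero-padded
                      memcpy (dst + i, &q, 8);
                      i += 8;
                    }
                  return i;
                }
        */
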
L_end:
        // At the end we might re-read a few bytes when we fetch the last quad.
        // The quad was already loaded before the branch here, so even after a
        // branch mispredict the load is ready now.
        mcmpeq.b r0, r63, r1
        addi r22, 8, r22
        bnei/u r1, 0, tr0 // L_found0
        add r3, r4, r7
        ldlo.q r7, -8, r1
        ldhi.q r7, -1, r7
        ptabs r18, tr0
        stlo.q r22, -8, r0
        or r1, r7, r1
        mcmpeq.b r1, r63, r7
        sthi.q r22, -1, r0
        ZPAD_MASK (r7, r7)
        and r1, r7, r1 // mask out non-zero bytes after first zero byte
        stlo.q r20, -8, r1
        sthi.q r20, -1, r1
        blink tr0, r63

L_end_early:
        /* Check if we can store the current quad in full.  */
        pt L_end, tr1
        add r3, r4, r7
        bgtu/u r5, r22, tr1 // L_end // Not really unlikely, but the gap is short.
        /* If not, that means we can just proceed to process the last quad.
           Two pipeline stalls are unavoidable, as we don't have enough ILP.  */
        ldlo.q r7, -8, r1
        ldhi.q r7, -1, r7
        ptabs r18, tr0
        or r1, r7, r1
        mcmpeq.b r1, r63, r7
        ZPAD_MASK (r7, r7)
        and r1, r7, r1 // mask out non-zero bytes after first zero byte
        stlo.q r20, -8, r1
        sthi.q r20, -1, r1
        blink tr0, r63

L_found0:
        // r0: string to store, not yet zero-padding normalized.
        // r1: result of mcmpeq.b r0, r63, r1.
        // r22: store address plus 8, i.e. the address where zero padding
        //      beyond the string in r0 goes.
        // r20: store end address.
        // r5: store end address minus 8.
        pt L_write0_multiquad, tr0
        ZPAD_MASK (r1, r1)
        and r0, r1, r0 // mask out non-zero bytes after first zero byte
        stlo.q r22, -8, r0
        sthi.q r22, -1, r0
        andi r22, -8, r1 // Check if the zeros to write fit in one quad word.
        bgtu/l r5, r1, tr0 // L_write0_multiquad
        ptabs r18, tr1
        sub r20, r22, r1
        shlli r1, 2, r1 // Do the shift in two steps so that the 64 bit case
        SHLO r0, r1, r0 // is handled correctly.
        SHLO r0, r1, r0
        sthi.q r20, -1, r0
        blink tr1, r63
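
        /* A minimal C sketch of the two-step shift above (illustrative only;
           the assumption here is that a single shift by 64 bits would leave
           the word unchanged because the hardware takes shift counts modulo
           64, while two shifts of 4 * n bits each stay well defined for the
           full range 0 <= n <= 8; SHLO abstracts the actual shift direction):

                #include <stdint.h>

                static uint64_t shift_by_bytes (uint64_t x, unsigned n)
                {
                  unsigned s = 4 * n;
                  return (x << s) << s; // == x << (8 * n), valid even for n == 8
                }
        */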

L_write0_multiquad:
        pt L_write0_loop, tr0
        ptabs r18, tr1
        stlo.q r22, 0, r63
        sthi.q r20, -1, r63
        addi r1, 8, r1
        bgeu/l r5, r1, tr0 // L_write0_loop
        blink tr1, r63

L_write0_loop:
        st.q r1, 0, r63
        addi r1, 8, r1
        bgeu/l r5, r1, tr0 // L_write0_loop
        blink tr1, r63

L_small:
        // r0: string to store, not yet zero-padding normalized.
        // r1: result of mcmpeq.b r0, r63, r1.
        // r7: nonzero indicates a relevant zero byte was found in r0.
        // r2: store address.
        // r3: read address.
        // r4: size, at most 8.
        // r20: store end address.
        // r5: store end address minus 8.
        pt L_nohi, tr0
        pt L_small_storelong, tr1
        ptabs r18, tr2
        sub r63, r4, r23
        bnei/u r7, 0, tr0 // L_nohi
        ori r3, -8, r7
        bge/l r23, r7, tr0 // L_nohi
        ldhi.q r3, 7, r1
        or r0, r1, r0
        mcmpeq.b r0, r63, r1
L_nohi:
        ZPAD_MASK (r1, r1)
        and r0, r1, r0
        movi 4, r19
        bge/u r4, r19, tr1 // L_small_storelong

        pt L_small_end, tr0
#ifndef __LITTLE_ENDIAN__
        byterev r0, r0
#endif
        beqi/u r4, 0, tr0 // L_small_end
        st.b r2, 0, r0
        beqi/u r4, 1, tr0 // L_small_end
        shlri r0, 8, r0
        st.b r2, 1, r0
        beqi/u r4, 2, tr0 // L_small_end
        shlri r0, 8, r0
        st.b r2, 2, r0
L_small_end:
        blink tr2, r63

L_small_storelong:
        shlli r23, 3, r7
        SHHI r0, r7, r1
#ifdef __LITTLE_ENDIAN__
        shlri r1, 32, r1
#else
        shlri r0, 32, r0
#endif
        stlo.l r2, 0, r0
        sthi.l r2, 3, r0
        stlo.l r20, -4, r1
        sthi.l r20, -1, r1
        blink tr2, r63

#else /* SHcompact */

/* This code is optimized for size.  Instruction selection is SH5 specific.
   SH4 should use a different version.  */
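
/* The loop below, as a C sketch (illustrative only): once the NUL has been
   copied, the load is skipped and the zero byte left in r1 is stored again,
   which produces the required zero padding without a second loop.

        #include <stddef.h>

        char *strncpy (char *dst, const char *src, size_t n)
        {
          char c = 1;                 // anything nonzero before the first load
          size_t i;

          for (i = 0; i < n; i++)
            {
              if (c != '\0')
                c = *src++;           // stop reading once the NUL was copied
              dst[i] = c;             // re-storing the NUL is the padding
            }
          return dst;
        }
*/
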
ENTRY(strncpy)
        mov #0, r6
        cmp/eq r4, r6
        bt return               /* return immediately if n == 0 */
        mov r2, r5
        add #-1, r5             /* r5 = dst - 1 */
        add r5, r4              /* r4 = address of the last byte to store */
loop:
        bt/s found0             /* T set: previous byte was the NUL, skip load */
        add #1, r5              /* delay slot: advance the store pointer */
        mov.b @r3+, r1
found0:
        cmp/eq r5, r4
        mov.b r1, @r5           /* re-stores the NUL when zero padding */
        bf/s loop
        cmp/eq r1, r6           /* delay slot: T = (stored byte was the NUL) */
return:
        rts
        nop

#endif /* SHcompact */
#endif /* __SH5__ */