#ifndef X86_BITS_STRING_H
#define X86_BITS_STRING_H

/*
 * Copyright (C) 2007 Michael Brown <mbrown@fensystems.co.uk>.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA.
 *
 * You can also choose to distribute this program under the terms of
 * the Unmodified Binary Distribution Licence (as given in the file
 * COPYING.UBDL), provided that you have satisfied its requirements.
 */

FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );

/** @file
 *
 * Optimised string operations
 *
 */
extern void * __memcpy ( void *dest, const void *src, size_t len );
extern void * __memcpy_reverse ( void *dest, const void *src, size_t len );

/**
 * Copy memory area (where length is a compile-time constant)
 *
 * @v dest		Destination address
 * @v src		Source address
 * @v len		Length of data
 * @ret dest		Destination address
 */
static inline __attribute__ (( always_inline )) void *
__constant_memcpy ( void *dest, const void *src, size_t len ) {
	/* The __may_alias__ unions let the fixed-size cases below be
	 * written as ordinary loads and stores without violating
	 * strict-aliasing rules.
	 */
	union {
		uint32_t u32[2];
		uint16_t u16[4];
		uint8_t  u8[8];
	} __attribute__ (( __may_alias__ )) *dest_u = dest;
	const union {
		uint32_t u32[2];
		uint16_t u16[4];
		uint8_t  u8[8];
	} __attribute__ (( __may_alias__ )) *src_u = src;
	const void *esi;
	void *edi;

	switch ( len ) {
	case 0 : /* 0 bytes */
		return dest;
	/*
	 * Single-register moves; these are always better than a
	 * string operation.  We can clobber an arbitrary two
	 * registers (data, source, dest can re-use source register)
	 * instead of being restricted to esi and edi.  There's also a
	 * much greater potential for optimising with nearby code.
	 *
	 */
	case 1 : /* 4 bytes */
		dest_u->u8[0]  = src_u->u8[0];
		return dest;
	case 2 : /* 6 bytes */
		dest_u->u16[0] = src_u->u16[0];
		return dest;
	case 4 : /* 4 bytes */
		dest_u->u32[0] = src_u->u32[0];
		return dest;
	/*
	 * Double-register moves; these are probably still a win.
	 *
	 */
	case 3 : /* 12 bytes */
		dest_u->u16[0] = src_u->u16[0];
		dest_u->u8[2]  = src_u->u8[2];
		return dest;
	case 5 : /* 10 bytes */
		dest_u->u32[0] = src_u->u32[0];
		dest_u->u8[4]  = src_u->u8[4];
		return dest;
	case 6 : /* 12 bytes */
		dest_u->u32[0] = src_u->u32[0];
		dest_u->u16[2] = src_u->u16[2];
		return dest;
	case 8 : /* 10 bytes */
		dest_u->u32[0] = src_u->u32[0];
		dest_u->u32[1] = src_u->u32[1];
		return dest;
	}

	/* Even if we have to load up esi and edi ready for a string
	 * operation, we can sometimes save space by using multiple
	 * single-byte "movs" operations instead of loading up ecx and
	 * using "rep movsb".
	 *
	 * "load ecx, rep movsb" is 7 bytes, plus an average of 1 byte
	 * to allow for saving/restoring ecx 50% of the time.
	 *
	 * "movsl" and "movsb" are 1 byte each, "movsw" is two bytes.
	 * (In 16-bit mode, "movsl" is 2 bytes and "movsw" is 1 byte,
	 * but "movsl" moves twice as much data, so it balances out).
	 *
	 * The cutoff point therefore occurs around 26 bytes; the byte
	 * requirements for each method are:
	 *
	 * len		   16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
	 * #bytes (ecx)	    8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8
	 * #bytes (no ecx)  4  5  6  7  5  6  7  8  6  7  8  9  7  8  9 10
	 */
	if ( len >= 26 )
		return __memcpy ( dest, src, len );

	/* len is a compile-time constant below 26, so each guarded
	 * "movs" below collapses to either nothing or exactly one
	 * instruction: ( len / 4 ) of the "movsl"s survive, plus at
	 * most one "movsw" and one "movsb" for the remainder.
	 */
	esi = src;
	edi = dest;

	if ( len >= ( 6 * 4 ) )
		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( len >= ( 5 * 4 ) )
		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( len >= ( 4 * 4 ) )
		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( len >= ( 3 * 4 ) )
		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( len >= ( 2 * 4 ) )
		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( len >= ( 1 * 4 ) )
		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( ( len % 4 ) >= 2 )
		__asm__ __volatile__ ( "movsw" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( ( len % 2 ) >= 1 )
		__asm__ __volatile__ ( "movsb" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );

	return dest;
}
/**
 * Copy memory area
 *
 * @v dest		Destination address
 * @v src		Source address
 * @v len		Length of data
 * @ret dest		Destination address
 */
static inline __attribute__ (( always_inline )) void *
memcpy ( void *dest, const void *src, size_t len ) {
	/* Use the size-optimised inline expansion only when the
	 * length is a compile-time constant; otherwise fall through
	 * to the out-of-line implementation.
	 */
	if ( __builtin_constant_p ( len ) ) {
		return __constant_memcpy ( dest, src, len );
	}
	return __memcpy ( dest, src, len );
}
extern void * __memmove ( void *dest, const void *src, size_t len );

/**
 * Copy (possibly overlapping) memory area
 *
 * @v dest		Destination address
 * @v src		Source address
 * @v len		Length of data
 * @ret dest		Destination address
 */
static inline __attribute__ (( always_inline )) void *
memmove ( void *dest, const void *src, size_t len ) {
	ssize_t offset = ( dest - src );

	if ( __builtin_constant_p ( offset ) ) {
		/* Direction is known at compile time: copying
		 * downwards (or in place) is safe as a forward copy;
		 * copying upwards must run in reverse to cope with
		 * overlap.
		 */
		if ( offset <= 0 )
			return memcpy ( dest, src, len );
		else
			return __memcpy_reverse ( dest, src, len );
	}
	return __memmove ( dest, src, len );
}
/**
 * Fill memory region
 *
 * @v dest		Destination address
 * @v fill		Fill pattern
 * @v len		Length of region
 * @ret dest		Destination address
 */
static inline __attribute__ (( always_inline )) void *
__memset ( void *dest, int fill, size_t len ) {
	/* "rep stosb" consumes edi and ecx; capture the final values
	 * in dummy outputs so the compiler knows they are clobbered.
	 * Only the low byte (al) of the fill pattern is stored.
	 */
	void *discard_D;
	size_t discard_c;

	__asm__ __volatile__ ( "rep stosb"
			       : "=&D" ( discard_D ), "=&c" ( discard_c )
			       : "0" ( dest ), "1" ( len ), "a" ( fill )
			       : "memory" );
	return dest;
}
/**
 * Fill memory region with zero (where length is a compile-time constant)
 *
 * @v dest		Destination address
 * @v len		Length of region
 * @ret dest		Destination address
 */
static inline __attribute__ (( always_inline )) void *
__constant_memset_zero ( void *dest, size_t len ) {
	/* The __may_alias__ union lets the fixed-size cases below be
	 * written as ordinary stores without violating
	 * strict-aliasing rules.
	 */
	union {
		uint32_t u32[2];
		uint16_t u16[4];
		uint8_t  u8[8];
	} __attribute__ (( __may_alias__ )) *dest_u = dest;
	void *edi;
	uint32_t eax;

	switch ( len ) {
	case 0 : /* 0 bytes */
		return dest;

	/* Single-register moves.  Almost certainly better than a
	 * string operation.  We can avoid clobbering any registers,
	 * we can reuse a zero that happens to already be in a
	 * register, and we can optimise away the code entirely if the
	 * memset() is used to clear a region which then gets
	 * immediately overwritten.
	 */
	case 1 : /* 3 bytes */
		dest_u->u8[0] = 0;
		return dest;
	case 2: /* 5 bytes */
		dest_u->u16[0] = 0;
		return dest;
	case 4: /* 6 bytes */
		dest_u->u32[0] = 0;
		return dest;

	/* Double-register moves.  Very probably better than a string
	 * operation.
	 */
	case 3 : /* 9 bytes */
		dest_u->u16[0] = 0;
		dest_u->u8[2] = 0;
		return dest;
	case 5 : /* 10 bytes */
		dest_u->u32[0] = 0;
		dest_u->u8[4] = 0;
		return dest;
	case 6 : /* 12 bytes */
		dest_u->u32[0] = 0;
		dest_u->u16[2] = 0;
		return dest;
	case 8 : /* 13 bytes */
		dest_u->u32[0] = 0;
		dest_u->u32[1] = 0;
		return dest;
	}

	/* As with memcpy(), we can potentially save space by using
	 * multiple single-byte "stos" instructions instead of loading
	 * up ecx and using "rep stosb".
	 *
	 * "load ecx, rep movsb" is 7 bytes, plus an average of 1 byte
	 * to allow for saving/restoring ecx 50% of the time.
	 *
	 * "stosl" and "stosb" are 1 byte each, "stosw" is two bytes.
	 *
	 * The calculations are therefore the same as for memcpy(),
	 * giving a cutoff point of around 26 bytes.
	 */
	if ( len >= 26 )
		return __memset ( dest, 0, len );

	/* len is a compile-time constant below 26, so each guarded
	 * "stos" below collapses to either nothing or exactly one
	 * instruction: ( len / 4 ) of the "stosl"s survive, plus at
	 * most one "stosw" and one "stosb" for the remainder.
	 */
	edi = dest;
	eax = 0;

	if ( len >= ( 6 * 4 ) )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( len >= ( 5 * 4 ) )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( len >= ( 4 * 4 ) )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( len >= ( 3 * 4 ) )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( len >= ( 2 * 4 ) )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( len >= ( 1 * 4 ) )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( ( len % 4 ) >= 2 )
		__asm__ __volatile__ ( "stosw" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( ( len % 2 ) >= 1 )
		__asm__ __volatile__ ( "stosb" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );

	return dest;
}
/**
 * Fill memory region
 *
 * @v dest		Destination address
 * @v fill		Fill pattern
 * @v len		Length of region
 * @ret dest		Destination address
 */
static inline __attribute__ (( always_inline )) void *
memset ( void *dest, int fill, size_t len ) {
	/* Use the size-optimised zero-fill expansion only when both
	 * the fill value (which must be zero) and the length are
	 * compile-time constants.
	 */
	if ( __builtin_constant_p ( fill ) && ( fill == 0 ) &&
	     __builtin_constant_p ( len ) ) {
		return __constant_memset_zero ( dest, len );
	}
	return __memset ( dest, fill, len );
}
#endif /* X86_BITS_STRING_H */