+/**
+ * Fill memory region with zero (where length is a compile-time constant)
+ *
+ * @v dest Destination address
+ * @v len Length
+ * @ret dest Destination address
+ */
+static inline __attribute__ (( always_inline )) void *
+__constant_memset_zero ( void *dest, size_t len ) {
+ union {
+ uint32_t u32[2];
+ uint16_t u16[4];
+ uint8_t u8[8];
+ } __attribute__ (( __may_alias__ )) *dest_u = dest;
+ void *edi;
+ uint32_t eax;
+
+ switch ( len ) {
+ case 0 : /* 0 bytes */
+ return dest;
+
+ /* Single-register moves. Almost certainly better than a
+ * string operation. We can avoid clobbering any registers,
+ * we can reuse a zero that happens to already be in a
+ * register, and we can optimise away the code entirely if the
+ * memset() is used to clear a region which then gets
+ * immediately overwritten.
+ */
+ case 1 : /* 3 bytes */
+ dest_u->u8[0] = 0;
+ return dest;
+ case 2: /* 5 bytes */
+ dest_u->u16[0] = 0;
+ return dest;
+ case 4: /* 6 bytes */
+ dest_u->u32[0] = 0;
+ return dest;
+
+ /* Double-register moves. Very probably better than a string
+ * operation.
+ */
+ case 3 : /* 9 bytes */
+ dest_u->u16[0] = 0;
+ dest_u->u8[2] = 0;
+ return dest;
+ case 5 : /* 10 bytes */
+ dest_u->u32[0] = 0;
+ dest_u->u8[4] = 0;
+ return dest;
+ case 6 : /* 12 bytes */
+ dest_u->u32[0] = 0;
+ dest_u->u16[2] = 0;
+ return dest;
+ case 8 : /* 13 bytes */
+ dest_u->u32[0] = 0;
+ dest_u->u32[1] = 0;
+ return dest;
+ }
+
+ /* As with memcpy(), we can potentially save space by using
+ * multiple single-byte "stos" instructions instead of loading
+ * up ecx and using "rep stosb".
+ *
+ * "load ecx, rep stosb" is 7 bytes, plus an average of 1 byte
+ * to allow for saving/restoring ecx 50% of the time.
+ *
+ * "stosl" and "stosb" are 1 byte each, "stosw" is two bytes.
+ *
+ * The calculations are therefore the same as for memcpy(),
+ * giving a cutoff point of around 26 bytes.
+ */