/* spill_to_mask-1.c */
/* { dg-do compile } */
/* { dg-options "-O2 -march=skylake-avx512 -DDTYPE32" } */
/* Short aliases for the unsigned integer types used throughout this file.
   The names suggest bit widths; the actual widths are whatever the target
   ABI gives the underlying standard types.  */
typedef unsigned char u8;
typedef unsigned short u16;
typedef unsigned int u32;
typedef unsigned long long u64;
/* Pick the element type DTYPE and the matching byte-swap helper according
   to which DTYPE<width> macro is defined (the test options pass -DDTYPE32).
   Every preprocessing directive has to sit on its own line; fused onto one
   line the #ifdef is never terminated.  */
#ifdef DTYPE32
typedef u32 DTYPE;
#define byteswap byteswapu32
#endif

#ifdef DTYPE16
typedef u16 DTYPE;
#define byteswap byteswapu16
#endif

/* The 8- and 64-bit variants use the type their byteswap helper takes and
   returns, rather than u16.  */
#ifdef DTYPE8
typedef u8 DTYPE;
#define byteswap byteswapu8
#endif

#ifdef DTYPE64
typedef u64 DTYPE;
#define byteswap byteswapu64
#endif
/* Rotate X right by N bits, treating X as a 32-bit unsigned value.
   N must be in 1..31: N == 0 would make the left shift count 32, which is
   undefined for a 32-bit operand.  X and N are each expanded more than
   once, so pass only side-effect-free expressions.  */
#define R(x, n) (((x) >> (n)) | ((x) << (32 - (n))))

/* XOR of three right-rotations of X.  The rotation counts (2, 13, 22) and
   (6, 11, 25) are the ones SHA-256 uses for its two big-sigma functions.  */
#define S0(x) (R(x, 2) ^ R(x, 13) ^ R(x, 22))
#define S1(x) (R(x, 6) ^ R(x, 11) ^ R(x, 25))

/* One mixing round over the eight working variables A..H with message
   word X and round constant K.  Updates H and D in place.

   The expansion assigns to variables named tmp1 and tmp2, which must be
   declared in the calling scope.  Arguments are expanded several times, so
   they must be free of side effects; D and H must be modifiable lvalues.
   Wrapped in do/while (0) so that `TT (...);' is exactly one statement.  */
#define TT(a, b, c, d, e, f, g, h, x, K) \
  do \
    { \
      tmp1 = (h) + S1(e) + ((g) ^ ((e) & ((f) ^ (g)))) + (K) + (x); \
      tmp2 = S0(a) + (((a) & (b)) | ((c) & ((a) | (b)))); \
      (h) = tmp1 + tmp2; \
      (d) += tmp1; \
    } \
  while (0)
/* Reverse the byte order of the 32-bit value X in two steps: first swap
   the two 16-bit halves, then swap the bytes inside each half.  */
static inline u32
byteswapu32 (u32 x)
{
  const u32 halves_swapped = (x & 0x0000FFFF) << 16 | (x & 0xFFFF0000) >> 16;

  return (halves_swapped & 0x00FF00FF) << 8
	 | (halves_swapped & 0xFF00FF00) >> 8;
}
/* Exchange the low and high bytes of the 16-bit value X.  The arithmetic
   happens after integer promotion; the cast narrows the result back to u16,
   just as assigning it to a u16 would.  */
static inline u16
byteswapu16 (u16 x)
{
  return (u16) ((x & 0x00FF) << 8 | (x & 0xFF00) >> 8);
}
/* Byte swap of a single byte: there is nothing to reorder, so X is returned
   unchanged.  Exists so that `byteswap' has a target for every DTYPE.  */
static inline u8
byteswapu8 (u8 x)
{
  return x;
}
/* Reverse the byte order of the 64-bit value X: byte-swap each 32-bit half
   with byteswapu32 and exchange the two halves.  */
static inline u64
byteswapu64 (u64 x)
{
  /* Each masked half is narrowed to u32 by the call to byteswapu32.  */
  const u32 low_swapped = byteswapu32 (x & 0x00000000FFFFFFFF);
  const u32 high_swapped = byteswapu32 ((x & 0xFFFFFFFF00000000) >> 32);

  /* The swapped low half becomes the new high half and vice versa.  */
  return ((u64) low_swapped) << 32 | high_swapped;
}
/* Byte-swap the sixteen words of IN, load the eight working variables from
   OUT, run sixteen TT rounds using the constants in C, and add the resulting
   working variables back into OUT.

   IN  - sixteen input words; only read.
   OUT - eight state words; read at the start and accumulated into at the end.
   C   - sixteen round constants; only read.

   The rounds are written out one by one rather than looped; the argument
   list of TT is rotated by one position on each round.  */
void
foo (DTYPE in[16], DTYPE out[8], const DTYPE C[16])
{
  /* Scratch variables assigned inside every TT expansion.  */
  DTYPE tmp1 = 0, tmp2 = 0;

  /* Message words, byte-swapped on load.  */
  DTYPE w0 = byteswap (in[0]);
  DTYPE w1 = byteswap (in[1]);
  DTYPE w2 = byteswap (in[2]);
  DTYPE w3 = byteswap (in[3]);
  DTYPE w4 = byteswap (in[4]);
  DTYPE w5 = byteswap (in[5]);
  DTYPE w6 = byteswap (in[6]);
  DTYPE w7 = byteswap (in[7]);
  DTYPE w8 = byteswap (in[8]);
  DTYPE w9 = byteswap (in[9]);
  DTYPE w10 = byteswap (in[10]);
  DTYPE w11 = byteswap (in[11]);
  DTYPE w12 = byteswap (in[12]);
  DTYPE w13 = byteswap (in[13]);
  DTYPE w14 = byteswap (in[14]);
  DTYPE w15 = byteswap (in[15]);

  /* Working variables start from the current state.  */
  DTYPE a = out[0];
  DTYPE b = out[1];
  DTYPE c = out[2];
  DTYPE d = out[3];
  DTYPE e = out[4];
  DTYPE f = out[5];
  DTYPE g = out[6];
  DTYPE h = out[7];

  /* Rounds 0-7.  */
  TT (a, b, c, d, e, f, g, h, w0, C[0]);
  TT (h, a, b, c, d, e, f, g, w1, C[1]);
  TT (g, h, a, b, c, d, e, f, w2, C[2]);
  TT (f, g, h, a, b, c, d, e, w3, C[3]);
  TT (e, f, g, h, a, b, c, d, w4, C[4]);
  TT (d, e, f, g, h, a, b, c, w5, C[5]);
  TT (c, d, e, f, g, h, a, b, w6, C[6]);
  TT (b, c, d, e, f, g, h, a, w7, C[7]);

  /* Rounds 8-15: same rotation pattern with the next eight words.  */
  TT (a, b, c, d, e, f, g, h, w8, C[8]);
  TT (h, a, b, c, d, e, f, g, w9, C[9]);
  TT (g, h, a, b, c, d, e, f, w10, C[10]);
  TT (f, g, h, a, b, c, d, e, w11, C[11]);
  TT (e, f, g, h, a, b, c, d, w12, C[12]);
  TT (d, e, f, g, h, a, b, c, w13, C[13]);
  TT (c, d, e, f, g, h, a, b, w14, C[14]);
  TT (b, c, d, e, f, g, h, a, w15, C[15]);

  /* Fold the working variables back into the state.  */
  out[0] += a;
  out[1] += b;
  out[2] += c;
  out[3] += d;
  out[4] += e;
  out[5] += f;
  out[6] += g;
  out[7] += h;
}
/* { dg-final { scan-assembler "kmovd" } } */
/* { dg-final { scan-assembler-not "knot" } } */
/* { dg-final { scan-assembler-not "kxor" } } */
/* { dg-final { scan-assembler-not "kor" } } */
/* { dg-final { scan-assembler-not "kandn" } } */