Viewing file: avx512fp16-helper.h (6.26 KB) -rw-r--r-- Select action/file-type: (+) | (+) | (+) | Code (+) | Session (+) | (+) | SDB (+) | (+) | (+) | (+) | (+) | (+) |
/* This file is used for emulation of avx512fp16 runtime tests. To verify the correctness of _Float16 type calculation, the idea is convert _Float16 to float and do emulation using float instructions. _Float16 type should not be emulate or check by itself. */
#include "avx512f-helper.h" #ifndef AVX512FP16_HELPER_INCLUDED #define AVX512FP16_HELPER_INCLUDED
#ifdef DEBUG #include <string.h> #endif #include <math.h> #include <limits.h> #include <float.h>
/* Useful macros. */ #define NOINLINE __attribute__((noinline,noclone)) #define _ROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) #define _ROUND_CUR 8 #define AVX512F_MAX_ELEM 512 / 32
/* Structure for _Float16 emulation */ typedef union { __m512 zmm; __m512h zmmh; __m512i zmmi; __m512d zmmd; __m256 ymm[2]; __m256h ymmh[2]; __m256i ymmi[2]; __m256d ymmd[2]; __m128h xmmh[4]; __m128 xmm[4]; __m128i xmmi[4]; __m128d xmmd[4]; unsigned short u16[32]; unsigned int u32[16]; int i32[16]; long long s64[8]; unsigned long long u64[8]; double f64[8]; float f32[16]; _Float16 f16[32]; } V512;
/* Global variables. */ V512 src1, src2, src3, src3f; int n_errs = 0;
/* Helper function for packing/unpacking ph operands. */ void NOINLINE unpack_ph_2twops(V512 src, V512 *op1, V512 *op2) { V512 v1;
op1->zmm = _mm512_cvtph_ps(src.ymmi[0]); v1.ymm[0] = _mm512_extractf32x8_ps(src.zmm, 1); op2->zmm = _mm512_cvtph_ps(v1.ymmi[0]); }
V512 NOINLINE pack_twops_2ph(V512 op1, V512 op2) { V512 v1, v2, v3;
v1.ymmi[0] = _mm512_cvtps_ph(op1.zmm, _MM_FROUND_TO_NEAREST_INT); v2.ymmi[0] = _mm512_cvtps_ph(op2.zmm, _MM_FROUND_TO_NEAREST_INT);
v3.zmm = _mm512_insertf32x8(v1.zmm, v2.ymm[0], 1);
return v3; }
/* Helper function used for result debugging */ #ifdef DEBUG void NOINLINE display_ps(const void *p, const char *banner, int n_elems) { int i; V512 *v = (V512*)p;
if (banner) { printf("%s", banner); }
for (i = 15; i >= n_elems; i--) { printf(" --------"); if (i == 8) { printf("\n"); if (banner) { printf("%*s", (int)strlen(banner), ""); } } }
for (; i >= 0; i--) { printf(" %x", v->u32[i]); if (i == 8) { printf("\n"); if (banner) { printf("%*s", (int)strlen(banner), ""); } } } printf("\n"); } #endif
/* Functions/macros used for init/result checking. Only check components within AVX512F_LEN. */ #define TO_STRING(x) #x #define STRINGIFY(x) TO_STRING(x) #define NAME_OF(NAME) STRINGIFY(INTRINSIC (NAME))
#define CHECK_RESULT(res, exp, size, intrin) \ check_results ((void*)res, (void*)exp, size,\ NAME_OF(intrin))
#define CHECK_RESULT_MASK(res, exp, size, intrin) \ check_results_mask ((__mmask32)res, (__mmask32)exp, size,\ NAME_OF(intrin))
/* To evaluate whether result match _Float16 precision, only the last bit of real/emulate result could be different. */ void NOINLINE check_results(void *got, void *exp, int n_elems, char *banner) { int i; V512 *v1 = (V512*)got; V512 *v2 = (V512*)exp;
for (i = 0; i < n_elems; i++) { if (v1->u16[i] != v2->u16[i] && ((v1->u16[i] > (v2->u16[i] + 1)) || (v1->u16[i] < (v2->u16[i] - 1)))) {
#ifdef DEBUG printf("ERROR: %s failed at %d'th element: %x(%f) != %x(%f)\n", banner ? banner : "", i, v1->u16[i], *(float *)(&v1->u16[i]), v2->u16[i], *(float *)(&v2->u16[i])); display_ps(got, "got:", n_elems); display_ps(exp, "exp:", n_elems); #endif n_errs++; break; } } }
void NOINLINE check_results_mask(__mmask32 got, __mmask32 exp, int n_elems, char *banner) { if (got != exp) { #ifdef DEBUG printf("ERROR: %s failed : got mask %x != exp mask %x\n", banner ? banner : "", got, exp); #endif n_errs++; } }
/* Functions for src/dest initialization */ void NOINLINE init_src() { V512 v1, v2, v3, v4; int i;
for (i = 0; i < AVX512F_MAX_ELEM; i++) { v1.f32[i] = i + 1; v2.f32[i] = (i + 2) * 0.5f; v3.f32[i] = i * 1.5f; v4.f32[i] = i - 1.5f;
src3.u32[i] = (i + 1) * 10; }
for (i = 0; i < 8; i++) { src3f.f64[i] = (i + 1) * 7.5; }
src1 = pack_twops_2ph(v1, v2); src2 = pack_twops_2ph(v3, v4); }
void NOINLINE init_src_nanf() { V512 v1, v2, v3, v4; int i;
for (i = 0; i < 16; i++) { v1.f32[i] = i + 1 + 0.5; v2.f32[i] = i + 17 + 0.5; v3.f32[i] = i * 2 + 2 + 0.5; v4.f32[i] = i * 2 + 34 + 0.5;
src3.u32[i] = (i + 1) * 10; }
v1.f32[0] = __builtin_nanf(""); src1 = pack_twops_2ph(v1, v2); src2 = pack_twops_2ph(v3, v4); }
void NOINLINE init_dest(V512 * res, V512 * exp) { int i; V512 v1;
for (i = 0; i < AVX512F_MAX_ELEM; i++) { v1.f32[i] = 12 + 0.5f * i; } *res = *exp = pack_twops_2ph(v1, v1); }
#define EMULATE(NAME) EVAL(emulate_, NAME, AVX512F_LEN)
#endif /* AVX512FP16_HELPER_INCLUDED */
/* Macros for AVX512VL Testing. Include V512 component usage and mask type for emulation. */
#if AVX512F_LEN == 256 #undef HF #undef SF #undef SI #undef DF #undef H_HF #undef NET_MASK #undef NET_CMASK #undef MASK_VALUE #undef HALF_MASK #undef ZMASK_VALUE #define NET_MASK 0xffff #define NET_CMASK 0xff #define MASK_VALUE 0xcccc #define ZMASK_VALUE 0xfcc1 #define HALF_MASK 0xcc #define HF(x) x.ymmh[0] #define H_HF(x) x.xmmh[0] #define SF(x) x.ymm[0] #define DF(x) x.ymmd[0] #define SI(x) x.ymmi[0] #elif AVX512F_LEN == 128 #undef HF #undef SF #undef DF #undef SI #undef H_HF #undef NET_MASK #undef NET_CMASK #undef MASK_VALUE #undef ZMASK_VALUE #undef HALF_MASK #define NET_MASK 0xff #define NET_CMASK 0xff #define MASK_VALUE 0xcc #define HALF_MASK MASK_VALUE #define ZMASK_VALUE 0xc1 #define HF(x) x.xmmh[0] #define SF(x) x.xmm[0] #define DF(x) x.xmmd[0] #define SI(x) x.xmmi[0] #define H_HF(x) x.xmmh[0] #else #define NET_MASK 0xffffffff #define NET_CMASK 0xffff #define MASK_VALUE 0xcccccccc #define ZMASK_VALUE 0xfcc1fcc1 #define HALF_MASK 0xcccc #define HF(x) x.zmmh #define SF(x) x.zmm #define DF(x) x.zmmd #define SI(x) x.zmmi #define H_HF(x) x.ymmh[0] #endif
|